CHANGES:
- Added MeasuredDate as first column in regional.csv
- Added MeasuredDate as first column in muscle_balance.csv
- Updated README to document new column structure

BENEFITS:
✅ Track regional changes over time (e.g., Arms fat % across scans)
✅ Easy time-series analysis with pandas/Excel
✅ Filter by date range for progress tracking
✅ Consistent date column across all 3 CSV files
✅ Enables queries like: 'Show me trunk fat % over last 6 months'

EXAMPLE USAGE:
    import pandas as pd
    regional = pd.read_csv('regional.csv')
    arms = regional[regional['Region'] == 'Arms']
    # Now you can track Arms progress over time!

Each scan now adds:
- 1 row to overall.csv
- 6 rows to regional.csv (one per region)
- 6 rows to muscle_balance.csv (one per limb comparison)
886 lines
35 KiB
Python
886 lines
35 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
BodySpec Insights - Body composition analytics for BodySpec DEXA scan PDFs
|
||
|
||
Extract measurements from BodySpec DEXA reports, compute 30+ derived metrics,
|
||
and output structured data for progress tracking.
|
||
|
||
Usage:
|
||
python dexa_extract.py /path/to/bodyspec-report.pdf --height-in 74 --weight-lb 212 --outdir ./data/results
|
||
|
||
Note: This script is specifically designed for BodySpec PDF reports.
|
||
|
||
Requires:
|
||
pip install pdfplumber pandas
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import os
|
||
import re
|
||
import sys
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
import pdfplumber
|
||
import pandas as pd
|
||
|
||
class ValidationError(Exception):
    """Custom exception type for validation errors."""
|
||
|
||
def get_processed_dates(outdir):
    """Return the scan dates already recorded in ``<outdir>/overall.csv``.

    Args:
        outdir: Directory that holds the output files.

    Returns:
        set: Distinct non-null ``MeasuredDate`` values as strings. Empty when
        the CSV does not exist, has no ``MeasuredDate`` column, or cannot be
        read (a warning is printed to stderr in the last case).
    """
    overall_csv = Path(outdir) / "overall.csv"
    if not overall_csv.exists():
        return set()

    try:
        # dtype=str keeps the dates as text so they compare equal to the
        # date strings produced elsewhere in this file.
        df = pd.read_csv(overall_csv, dtype=str)
    except Exception as exc:
        # Best effort: an unreadable history file must not abort the run,
        # but the user should know duplicate detection is disabled.
        print(f"  ⚠️  Warning: Could not read {overall_csv}: {exc}", file=sys.stderr)
        return set()

    if "MeasuredDate" not in df.columns:
        return set()
    return set(df["MeasuredDate"].dropna().unique())
|
||
|
||
# ============================================================================
|
||
# EXTRACTION PATTERN DICTIONARY
|
||
# ============================================================================
|
||
# Centralized pattern definitions for extracting data from BodySpec PDFs.
|
||
#
|
||
# Each field has:
|
||
# - patterns: List of regex patterns to try (in order of preference)
|
||
# - group: Which capture group to extract from each pattern
|
||
# - cast: Function to convert extracted string to desired type
|
||
# - required: Whether this field must be present for valid extraction
|
||
# - description: Human-readable explanation of the field
|
||
#
|
||
# Benefits of this approach:
|
||
# 1. Self-documenting - patterns describe what they extract
|
||
# 2. Maintainable - add/modify patterns in one place
|
||
# 3. Robust - multiple patterns provide fallback options
|
||
# 4. Validated - know exactly which required fields are missing
|
||
#
|
||
# To add a new field:
|
||
# 1. Add entry to this dictionary
|
||
# 2. Pattern will automatically be used by extract_all_fields()
|
||
# 3. Validation will check if required fields are present
|
||
# ============================================================================
|
||
|
||
# One row of the SUMMARY RESULTS table:
#   date, body fat %, total mass, fat tissue, lean tissue, BMC
# Capture groups 1..6 in that order. Shared by the five primary fields below,
# each of which reads a different group from the same row.
_SUMMARY_ROW_PATTERN = (
    r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
)

EXTRACTION_PATTERNS = {
    # Primary body composition metrics (from SUMMARY RESULTS table or individual fields)
    'body_fat_percent': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 2)
            r"Total Body Fat %\s+([\d\.]+)",  # Individual field
        ],
        'group': [2, 1],  # Which capture group for each pattern
        'cast': float,
        'required': True,
        'description': 'Total body fat percentage',
    },
    'total_mass_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 3)
            r"Total Mass.*?\(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [3, 1],
        'cast': float,
        'required': True,
        'description': 'Total body mass in pounds',
    },
    'fat_mass_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 4)
            r"Fat Tissue \(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [4, 1],
        'cast': float,
        'required': True,
        'description': 'Total fat tissue mass',
    },
    'lean_soft_tissue_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 5)
            r"Lean Tissue \(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [5, 1],
        'cast': float,
        'required': True,
        'description': 'Lean soft tissue mass',
    },
    'bmc_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 6)
            r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [6, 1],
        'cast': float,
        'required': True,
        'description': 'Bone mineral content',
    },

    # Date information
    'measured_date': {
        'patterns': [
            r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})",  # After weight in client info
            # End of first line. Anchored with \A because the patterns are
            # searched without re.MULTILINE, where a bare `$` only matches at
            # the end of the whole text.
            r"\A[^\n]*?(\d{1,2}/\d{1,2}/\d{4})[ \t\r]*(?=\n|\Z)",
            r"(\d{1,2}/\d{1,2}/\d{4})\s*$",  # End of the text (previous fallback, kept)
        ],
        'group': [1, 1, 1],
        'cast': str,
        'required': True,
        'description': 'Date of DEXA scan',
    },

    # Regional fat distribution
    'android_percent': {
        'patterns': [r"Android.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Android region fat percentage (belly area)',
    },
    'gynoid_percent': {
        'patterns': [r"Gynoid.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Gynoid region fat percentage (hips/thighs)',
    },
    'ag_ratio': {
        'patterns': [
            r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)",  # On same line as RMR (group 3)
            r"A/G Ratio\s+([\d\.]+)",  # Standalone field
        ],
        'group': [3, 1],
        'cast': float,
        'required': False,
        'description': 'Android/Gynoid ratio (central vs peripheral fat)',
    },

    # Metabolic and supplemental metrics
    'rmr_cal_per_day': {
        'patterns': [r"([\d,]+)\s*cal/day"],
        'group': [1],
        'cast': lambda s: int(s.replace(',', '')),
        'required': False,
        'description': 'Resting metabolic rate (calories per day)',
    },
    'vat_mass_lb': {
        'patterns': [r"Mass \(lbs\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue mass',
    },
    'vat_volume_in3': {
        'patterns': [r"Volume \(in3\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue volume',
    },
}
|
||
|
||
|
||
def read_pdf_text(pdf_path):
    """Return the text of every page of the PDF, joined with newlines.

    A page whose ``extract_text()`` result is empty or ``None`` contributes an
    empty string, so the number of joined segments equals the page count.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            page_text = page.extract_text()
            chunks.append(page_text if page_text else "")
    return "\n".join(chunks)
|
||
|
||
|
||
def extract_field(text, patterns, groups, cast=float, flags=re.IGNORECASE):
    """
    Extract a field using multiple pattern attempts

    Patterns are tried in order. For each one only the first match in the text
    is considered; if the capture group is missing or the captured string
    cannot be converted, the next pattern is tried.

    Args:
        text: Full PDF text (``None`` or empty yields ``None``)
        patterns: List of regex patterns to try (in order)
        groups: List of capture group indices (one per pattern)
        cast: Function to convert extracted string to desired type;
            a falsy value returns the cleaned string unchanged
        flags: Regex flags

    Returns:
        Extracted value or None
    """
    if not text:
        # re.search would raise TypeError on None; nothing can match anyway.
        return None

    for pattern, group in zip(patterns, groups):
        match = re.search(pattern, text, flags)
        if match is None:
            continue
        try:
            # Thousands separators are removed before conversion ("1,850" -> "1850").
            raw = match.group(group).replace(",", "").strip()
            return cast(raw) if cast else raw
        except (ValueError, TypeError, IndexError, AttributeError):
            # Bad number, unknown group index, or a group that did not
            # participate in the match: fall through to the next pattern.
            continue
    return None
|
||
|
||
|
||
def extract_all_fields(text, patterns_dict=None):
    """
    Run every field definition against the PDF text.

    Args:
        text: Full PDF text content
        patterns_dict: Field definitions keyed by field name; each entry
            supplies 'patterns', 'group', 'cast' and 'required'.
            EXTRACTION_PATTERNS is used when this is None.

    Returns:
        Tuple ``(data, missing_required)``: ``data`` maps every field name to
        its extracted value (or None), ``missing_required`` lists the required
        fields that came back as None, in definition order.
    """
    field_specs = EXTRACTION_PATTERNS if patterns_dict is None else patterns_dict

    extracted = {}
    absent = []
    for name, spec in field_specs.items():
        result = extract_field(text, spec['patterns'], spec['group'], spec['cast'])
        extracted[name] = result
        if result is None and spec['required']:
            absent.append(name)

    return extracted, absent
|
||
|
||
def convert_date_to_iso(date_str):
    """Convert MM/DD/YYYY to YYYY-MM-DD.

    Args:
        date_str: Date text such as "1/15/2024"; surrounding whitespace is
            ignored.

    Returns:
        The ISO formatted date, ``None`` for empty/None input, or the input
        unchanged when it is not a valid MM/DD/YYYY date.
    """
    if not date_str:
        return None
    try:
        parsed = datetime.strptime(date_str.strip(), "%m/%d/%Y")
    except (ValueError, TypeError, AttributeError):
        # Not a string, or not in the expected format: hand it back untouched
        # rather than guessing.
        return date_str
    return parsed.strftime("%Y-%m-%d")
|
||
|
||
def inches_to_ft_in(inches):
    """Format a height in inches as feet'inches" (fractions are truncated).

    Returns None when ``inches`` is None.
    """
    if inches is None:
        return None
    whole_feet, leftover = divmod(inches, 12)
    return f"{int(whole_feet)}'{int(leftover)}\""
|
||
|
||
def parse_regional_table(text):
    """Parse the regional composition rows (Arms, Legs, Trunk, Android, Gynoid, Total).

    Each row is expected to look like::

        Arms 22.1% 27.4 6.0 20.2 1.1

    Args:
        text: Full PDF text.

    Returns:
        dict: region name -> dict with ``fat_percent``, ``total_mass_lb``,
        ``fat_tissue_lb``, ``lean_tissue_lb`` and ``bmc_lb`` as floats.
        Regions with no parsable row are omitted.
    """
    regions = ["Arms", "Legs", "Trunk", "Android", "Gynoid", "Total"]
    fields = ("fat_percent", "total_mass_lb", "fat_tissue_lb", "lean_tissue_lb", "bmc_lb")
    number = r"([\d\.]+)"

    out = {}
    if not text:
        return out

    for region in regions:
        pattern = rf"{re.escape(region)}\s+{number}%\s+{number}\s+{number}\s+{number}\s+{number}"
        for m in re.finditer(pattern, text):
            try:
                values = [float(g) for g in m.groups()]
            except ValueError:
                # [\d\.]+ also matches strings like "1.2.3"; skip that row and
                # look for a later, well-formed one instead of aborting.
                continue
            out[region] = dict(zip(fields, values))
            break
    return out
|
||
|
||
def parse_muscle_balance(text):
    """Parse the muscle-balance rows (arm/leg totals and left/right limbs).

    Each row is expected to look like::

        Right Arm 20.4 13.7 2.8 10.3 0.6

    Args:
        text: Full PDF text.

    Returns:
        dict: row label -> dict with ``fat_percent``, ``total_mass_lb``,
        ``fat_mass_lb``, ``lean_mass_lb`` and ``bmc_lb`` as floats.
        Labels with no parsable row are omitted.
    """
    names = ["Arms Total", "Right Arm", "Left Arm", "Legs Total", "Right Leg", "Left Leg"]
    fields = ("fat_percent", "total_mass_lb", "fat_mass_lb", "lean_mass_lb", "bmc_lb")
    number = r"([\d\.]+)"

    out = {}
    if not text:
        return out

    for name in names:
        # Text extraction may put more than one blank between the words of a
        # label, so allow any run of spaces/tabs there.
        label = r"[ \t]+".join(re.escape(word) for word in name.split())
        pattern = rf"{label}\s+{number}\s+{number}\s+{number}\s+{number}\s+{number}"
        for m in re.finditer(pattern, text):
            try:
                values = [float(g) for g in m.groups()]
            except ValueError:
                # [\d\.]+ also matches strings like "1..2"; skip that row.
                continue
            out[name] = dict(zip(fields, values))
            break
    return out
|
||
|
||
def parse_bone_density_total(text):
    """Parse the "Total" row of the bone density table.

    The row is expected to look like::

        Total 1.280 0.8 0.8

    Rows whose label is "Arms Total" or "Legs Total" have the same numeric
    shape but belong to the muscle-balance table, so they are skipped.

    Args:
        text: Full PDF text.

    Returns:
        dict: ``total_bmd_g_per_cm2``, ``young_adult_t_score`` and
        ``age_matched_z_score`` as floats, or ``{}`` when no row is found.
    """
    if not text:
        return {}

    pattern = r"Total\s+([\d\.]+)\s+([-\d\.]+)\s+([-\d\.]+)"
    for m in re.finditer(pattern, text):
        line_start = text.rfind("\n", 0, m.start()) + 1
        label_prefix = text[line_start:m.start()].rstrip()
        if label_prefix.endswith(("Arms", "Legs")):
            continue  # "Arms Total ..." / "Legs Total ..." muscle-balance row
        try:
            bmd, t_score, z_score = (float(g) for g in m.groups())
        except ValueError:
            # e.g. a lone "-" placeholder captured by [-\d\.]+
            continue
        return {
            "total_bmd_g_per_cm2": bmd,
            "young_adult_t_score": t_score,
            "age_matched_z_score": z_score,
        }
    return {}
|
||
|
||
def parse_dexa_pdf(pdf_path):
    """
    Read a BodySpec DEXA PDF and return everything that could be parsed from it.

    Scalar fields come from the pattern dictionary via extract_all_fields();
    the regional, muscle-balance and bone-density tables are parsed separately
    and stored under the "regional", "muscle_balance" and "bone_density" keys.
    A warning listing any missing required fields is printed to stderr; the
    function still returns the partial result.
    """
    pdf_text = read_pdf_text(pdf_path)

    # Scalar fields driven by the pattern dictionary
    parsed, absent_fields = extract_all_fields(pdf_text)
    if absent_fields:
        print(f" ⚠️ Warning: Missing required fields: {', '.join(absent_fields)}", file=sys.stderr)

    # Table sections, each handled by its own parser
    table_parsers = (
        ("regional", parse_regional_table),
        ("muscle_balance", parse_muscle_balance),
        ("bone_density", parse_bone_density_total),
    )
    for key, parser in table_parsers:
        parsed[key] = parser(pdf_text)

    return parsed
|
||
|
||
def compute_derived(d, height_in, weight_lb=None):
    """Compute height-normalised indices and distribution metrics from parsed scan data.

    Args:
        d: Parsed scan data. Reads ``total_mass_lb``, ``fat_mass_lb``,
            ``lean_soft_tissue_lb``, ``bmc_lb``, ``body_fat_percent``,
            ``vat_mass_lb`` and the nested ``regional`` / ``muscle_balance``
            tables; any of them may be missing.
        height_in: Height in inches; must be positive.
        weight_lb: Body weight used only when ``d`` has no total mass.

    Returns:
        tuple: ``(total_mass, derived)`` where ``total_mass`` is the mass used
        for the calculations and ``derived`` is a dict of computed metrics
        (a metric is None when its inputs are unavailable).

    Raises:
        ValueError: If the height is not positive, or no total mass is
            available from either ``d`` or ``weight_lb``.
    """
    if height_in is None or height_in <= 0:
        # Every index divides by height squared.
        raise ValueError("Height must be a positive number of inches.")

    # Prefer DEXA total mass if available
    total_mass = d.get("total_mass_lb") or weight_lb
    if total_mass is None:
        raise ValueError("Total mass is missing; pass --weight-lb if the PDF lacks it.")

    fm = d.get("fat_mass_lb")
    lst = d.get("lean_soft_tissue_lb")
    bmc = d.get("bmc_lb")
    bf_pct = d.get("body_fat_percent")

    # Fat-free mass: total minus fat when fat is known, else lean + bone.
    ffm = None
    if fm is not None:
        ffm = total_mass - fm
    elif lst is not None and bmc is not None:
        ffm = lst + bmc

    height_sq = height_in ** 2

    def idx(value_lb):
        """BMI-style index: 703 * pounds / inches^2, rounded to 2 decimals."""
        return round(703.0 * value_lb / height_sq, 2)

    def idx_or_none(value_lb):
        return idx(value_lb) if value_lb is not None else None

    def right_side_share(right, left):
        """Right limb lean mass as a percentage of right + left."""
        if right is None or left is None or right + left <= 0:
            return None
        return round(100 * right / (right + left), 1)

    derived = {
        "height_in": height_in,
        "height_ft_in": inches_to_ft_in(height_in),
        "weight_input_lb": weight_lb,
        "bmi": round(703.0 * total_mass / height_sq, 1),
        "fat_free_mass_lb": round(ffm, 1) if ffm is not None else None,
        "ffmi": idx_or_none(ffm),
        "fmi": idx_or_none(fm),
        "lsti": idx_or_none(lst),
        "alm_lb": None,
        "smi": None,
    }

    # Lean mass percentage (complement of body fat %)
    derived["lean_mass_percent"] = round(100 - bf_pct, 1) if bf_pct is not None else None

    regional = d.get("regional") or {}
    arms = regional.get("Arms", {})
    legs = regional.get("Legs", {})
    trunk = regional.get("Trunk", {})

    # ALM (appendicular lean mass) from regional lean masses
    arms_lean = arms.get("lean_tissue_lb")
    legs_lean = legs.get("lean_tissue_lb")
    trunk_lean = trunk.get("lean_tissue_lb")
    if arms_lean is not None and legs_lean is not None:
        alm = arms_lean + legs_lean
        derived["alm_lb"] = round(alm, 1)
        derived["smi"] = idx(alm)

    # Regional lean mass distribution (share of total lean soft tissue).
    # `lst` must be non-zero as well, otherwise the division below fails.
    if lst and arms_lean is not None and legs_lean is not None and trunk_lean is not None:
        derived["arms_lean_pct"] = round(100 * arms_lean / lst, 1)
        derived["legs_lean_pct"] = round(100 * legs_lean / lst, 1)
        derived["trunk_lean_pct"] = round(100 * trunk_lean / lst, 1)
    else:
        derived["arms_lean_pct"] = None
        derived["legs_lean_pct"] = None
        derived["trunk_lean_pct"] = None

    # Trunk-to-limb fat ratio (health risk indicator)
    trunk_fat = trunk.get("fat_tissue_lb")
    arms_fat = arms.get("fat_tissue_lb")
    legs_fat = legs.get("fat_tissue_lb")
    derived["trunk_to_limb_fat_ratio"] = None
    if trunk_fat is not None and arms_fat is not None and legs_fat is not None:
        limb_fat = arms_fat + legs_fat
        if limb_fat > 0:
            derived["trunk_to_limb_fat_ratio"] = round(trunk_fat / limb_fat, 2)

    # Limb symmetry indices (balance indicators).
    # The value is the right side's share of the pair's lean mass:
    # 50 = equal sides, <50 = left side heavier, >50 = right side heavier.
    mb = d.get("muscle_balance") or {}
    derived["arm_symmetry_index"] = right_side_share(
        mb.get("Right Arm", {}).get("lean_mass_lb"),
        mb.get("Left Arm", {}).get("lean_mass_lb"),
    )
    derived["leg_symmetry_index"] = right_side_share(
        mb.get("Right Leg", {}).get("lean_mass_lb"),
        mb.get("Left Leg", {}).get("lean_mass_lb"),
    )

    # VAT Index (normalized by height squared, like BMI)
    derived["vat_index"] = idx_or_none(d.get("vat_mass_lb"))

    # Bone Mineral Density Index (BMC normalized by height squared)
    derived["bmdi"] = idx_or_none(bmc)

    # Adjusted Body Weight (used in nutrition/health calculations)
    # ABW = IBW + 0.4 * (actual weight - IBW), where IBW differs by sex
    # For simplicity, using a unisex approximation: IBW ≈ height_in * 2.3 - 100 (rough estimate)
    # NOTE(review): this evaluates to 70.2 for a 74 in height, which looks like
    # kilograms rather than pounds, yet it is compared with a mass in lb.
    # Left unchanged so previously exported values stay comparable - confirm
    # the intended formula/units.
    ibw_estimate = height_in * 2.3 - 100
    if total_mass > ibw_estimate:
        derived["adjusted_body_weight_lb"] = round(ibw_estimate + 0.4 * (total_mass - ibw_estimate), 1)
    else:
        derived["adjusted_body_weight_lb"] = round(total_mass, 1)

    return total_mass, derived
|
||
|
||
def ensure_outdir(outdir):
    """Create ``outdir`` (including missing parents); do nothing if it already exists."""
    os.makedirs(outdir, exist_ok=True)
|
||
|
||
def write_or_append_csv(path, row_dict, columns):
    """Append one row to the CSV at ``path``, creating the file when needed.

    Args:
        path: Destination CSV file.
        row_dict: Values for the row; keys not listed in ``columns`` are
            ignored and missing keys are written as empty cells.
        columns: Column names, in output order.

    The header line is written when the file does not exist yet or exists but
    is empty, so the file never ends up with data rows and no header.
    """
    columns = list(columns)
    df_row = pd.DataFrame([{k: row_dict.get(k) for k in columns}], columns=columns)
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    # Mode "a" creates the file if it is missing, so one call covers both cases.
    df_row.to_csv(path, mode="a", header=needs_header, index=False)
|
||
|
||
def write_or_append_json(path, obj):
    """Append ``obj`` to the JSON array stored at ``path``.

    Args:
        path: JSON file holding a list of entries; created when missing.
        obj: JSON-serialisable entry to append.

    An existing file that holds a single value instead of a list (for example
    one dict written by an earlier version) is converted to a one-element
    list first. A file that cannot be parsed is replaced by a fresh list, with
    a warning on stderr.
    """
    entries = []
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            try:
                entries = json.load(f)
            except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
                print(f"  ⚠️  Warning: {path} is not valid JSON and will be replaced: {exc}",
                      file=sys.stderr)
                entries = []

    if entries is None:
        entries = []
    elif not isinstance(entries, list):
        # convert to list of entries if previous file was a single value
        entries = [entries]

    entries.append(obj)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=2)
|
||
|
||
def append_markdown(path, md_text):
    """Append a markdown section to ``path``, creating the file when needed.

    Args:
        path: Destination markdown file.
        md_text: Section text; surrounding whitespace is stripped and a blank
            line is written after it.

    The file is written as UTF-8 explicitly: the generated summary contains
    non-ASCII characters, which a locale-dependent default encoding may not
    be able to encode.
    """
    # Mode "a" creates a missing file, so no exists() check is needed.
    with open(path, "a", encoding="utf-8") as f:
        f.write(md_text.strip() + "\n\n")
|
||
|
||
def _append_rows_csv(path, rows, columns):
    """Append ``rows`` to the CSV at ``path``; write the header only for a new or empty file."""
    frame = pd.DataFrame(rows, columns=columns)
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode="a", header=needs_header, index=False)


def _regional_lean_percent(region):
    """Lean soft tissue as a percentage of the region's total mass (BMC not included)."""
    total = region["total_mass_lb"]
    if total > 0:
        return round(100 * region["lean_tissue_lb"] / total, 1)
    return None


def process_single_pdf(pdf_path, height_in, weight_lb, outdir, batch_mode=False):
    """Process a single PDF file and return success status

    Validates the path, parses the report, computes the derived metrics and
    appends the results to overall.csv, regional.csv, muscle_balance.csv,
    overall.json and summary.md inside ``outdir``. Problems are reported on
    stderr; no exception is propagated to the caller.

    Args:
        pdf_path: Path to PDF file
        height_in: Height in inches
        weight_lb: Weight in pounds (optional)
        outdir: Output directory
        batch_mode: If True, use batch-style output messages

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Validate PDF file: (failed check, batch wording, single-file wording)
        pdf_file = Path(pdf_path)
        path_checks = (
            (not pdf_file.exists(), "File not found", "PDF file not found"),
            (not pdf_file.is_file(), "Not a file", "Path is not a file"),
            (pdf_file.suffix.lower() != '.pdf', "Not a PDF", "File is not a PDF"),
        )
        for failed, batch_reason, single_reason in path_checks:
            if failed:
                if batch_mode:
                    msg = f" ❌ Skipping {pdf_path}: {batch_reason}"
                else:
                    msg = f"❌ Error: {single_reason}: {pdf_path}"
                print(msg, file=sys.stderr)
                return False

        if batch_mode:
            print(f"\n📄 Processing: {pdf_file.name}")
        else:
            print("📊 Computing derived metrics...")

        # Parse PDF
        d = parse_dexa_pdf(pdf_path)

        # Check if critical data was extracted (warn only; processing continues)
        if d.get("body_fat_percent") is None or d.get("total_mass_lb") is None:
            print(f" ⚠️ Warning: Missing critical data from {pdf_file.name}", file=sys.stderr)
            if d.get("body_fat_percent") is None:
                print(" - Body Fat % not found", file=sys.stderr)
            if d.get("total_mass_lb") is None:
                print(" - Total Mass not found", file=sys.stderr)

        # Process data; a scan without a readable date is filed under today's date
        measured_date_raw = d.get("measured_date") or datetime.now().strftime("%m/%d/%Y")
        measured_date = convert_date_to_iso(measured_date_raw)
        total_mass, derived = compute_derived(d, height_in=height_in, weight_lb=weight_lb)

        # ---- overall.csv: one row per scan --------------------------------
        overall_cols = [
            "MeasuredDate", "Height_in", "Height_ft_in", "Weight_lb_Input", "DEXA_TotalMass_lb", "BodyFat_percent",
            "LeanMass_percent", "FatMass_lb", "LeanSoftTissue_lb", "BoneMineralContent_lb", "FatFreeMass_lb",
            "BMI", "FFMI", "FMI", "LST_Index", "ALM_lb", "SMI", "VAT_Mass_lb", "VAT_Volume_in3", "VAT_Index",
            "BMDI", "Android_percent", "Gynoid_percent", "AG_Ratio", "Trunk_to_Limb_Fat_Ratio",
            "Arms_Lean_pct", "Legs_Lean_pct", "Trunk_Lean_pct", "Arm_Symmetry_Index", "Leg_Symmetry_Index",
            "Adjusted_Body_Weight_lb", "RMR_cal_per_day",
        ]
        overall_row = {
            "MeasuredDate": measured_date,
            "Height_in": derived["height_in"],
            "Height_ft_in": derived["height_ft_in"],
            "Weight_lb_Input": derived["weight_input_lb"],
            "DEXA_TotalMass_lb": round(total_mass, 1),
            "BodyFat_percent": d.get("body_fat_percent"),
            "LeanMass_percent": derived.get("lean_mass_percent"),
            "FatMass_lb": d.get("fat_mass_lb"),
            "LeanSoftTissue_lb": d.get("lean_soft_tissue_lb"),
            "BoneMineralContent_lb": d.get("bmc_lb"),
            "FatFreeMass_lb": derived.get("fat_free_mass_lb"),
            "BMI": derived["bmi"],
            "FFMI": derived.get("ffmi"),
            "FMI": derived.get("fmi"),
            "LST_Index": derived.get("lsti"),
            "ALM_lb": derived.get("alm_lb"),
            "SMI": derived.get("smi"),
            "VAT_Mass_lb": d.get("vat_mass_lb"),
            "VAT_Volume_in3": d.get("vat_volume_in3"),
            "VAT_Index": derived.get("vat_index"),
            "BMDI": derived.get("bmdi"),
            "Android_percent": d.get("android_percent"),
            "Gynoid_percent": d.get("gynoid_percent"),
            "AG_Ratio": d.get("ag_ratio"),
            "Trunk_to_Limb_Fat_Ratio": derived.get("trunk_to_limb_fat_ratio"),
            "Arms_Lean_pct": derived.get("arms_lean_pct"),
            "Legs_Lean_pct": derived.get("legs_lean_pct"),
            "Trunk_Lean_pct": derived.get("trunk_lean_pct"),
            "Arm_Symmetry_Index": derived.get("arm_symmetry_index"),
            "Leg_Symmetry_Index": derived.get("leg_symmetry_index"),
            "Adjusted_Body_Weight_lb": derived.get("adjusted_body_weight_lb"),
            "RMR_cal_per_day": d.get("rmr_cal_per_day"),
        }
        write_or_append_csv(os.path.join(outdir, "overall.csv"), overall_row, overall_cols)

        # ---- regional.csv / JSON "regional": one entry per region ----------
        # Lean percentage is lean tissue only, not including BMC - matches the
        # BodySpec report. The same value is used for the CSV and the JSON so
        # the two outputs agree.
        regional_cols = ["MeasuredDate", "Region", "FatPercent", "LeanPercent", "TotalMass_lb",
                         "FatTissue_lb", "LeanTissue_lb", "BMC_lb"]
        reg_rows = []
        regional_array = []
        for name, r in d.get("regional", {}).items():
            lean_pct = _regional_lean_percent(r)
            reg_rows.append({
                "MeasuredDate": measured_date,
                "Region": name,
                "FatPercent": r["fat_percent"],
                "LeanPercent": lean_pct,
                "TotalMass_lb": r["total_mass_lb"],
                "FatTissue_lb": r["fat_tissue_lb"],
                "LeanTissue_lb": r["lean_tissue_lb"],
                "BMC_lb": r["bmc_lb"],
            })
            regional_array.append({
                "region": name,
                "fat_percent": r["fat_percent"],
                "lean_percent": lean_pct,
                "total_mass_lb": r["total_mass_lb"],
                "fat_tissue_lb": r["fat_tissue_lb"],
                "lean_tissue_lb": r["lean_tissue_lb"],
                "bmc_lb": r["bmc_lb"],
            })
        _append_rows_csv(os.path.join(outdir, "regional.csv"), reg_rows, regional_cols)

        # ---- muscle_balance.csv: one row per limb comparison ---------------
        mb_cols = ["MeasuredDate", "Region", "FatPercent", "TotalMass_lb", "FatMass_lb", "LeanMass_lb", "BMC_lb"]
        mb_rows = [
            {
                "MeasuredDate": measured_date,
                "Region": name,
                "FatPercent": r["fat_percent"],
                "TotalMass_lb": r["total_mass_lb"],
                "FatMass_lb": r["fat_mass_lb"],
                "LeanMass_lb": r["lean_mass_lb"],
                "BMC_lb": r["bmc_lb"],
            }
            for name, r in d.get("muscle_balance", {}).items()
        ]
        # Passing the column list keeps the header correct even when the scan
        # produced no muscle-balance rows.
        _append_rows_csv(os.path.join(outdir, "muscle_balance.csv"), mb_rows, mb_cols)

        # ---- overall.json ---------------------------------------------------
        muscle_balance_array = [
            {"region": name, **data}
            for name, data in d.get("muscle_balance", {}).items()
        ]

        overall_json = {
            "measured_date": measured_date,
            "anthropometrics": {
                "height_in": derived["height_in"],
                "height_ft_in": derived["height_ft_in"],
                "weight_input_lb": derived["weight_input_lb"],
                "dexa_total_mass_lb": round(total_mass, 1),
                "adjusted_body_weight_lb": derived.get("adjusted_body_weight_lb"),
                "bmi": derived["bmi"],
            },
            "composition": {
                "body_fat_percent": d.get("body_fat_percent"),
                "lean_mass_percent": derived.get("lean_mass_percent"),
                "fat_mass_lb": d.get("fat_mass_lb"),
                "lean_soft_tissue_lb": d.get("lean_soft_tissue_lb"),
                "bone_mineral_content_lb": d.get("bmc_lb"),
                "fat_free_mass_lb": derived.get("fat_free_mass_lb"),
                "derived_indices": {
                    "ffmi": derived.get("ffmi"),
                    "fmi": derived.get("fmi"),
                    "lsti": derived.get("lsti"),
                    "alm_lb": derived.get("alm_lb"),
                    "smi": derived.get("smi"),
                    "bmdi": derived.get("bmdi"),
                },
            },
            "regional": regional_array,
            "regional_analysis": {
                "trunk_to_limb_fat_ratio": derived.get("trunk_to_limb_fat_ratio"),
                "lean_mass_distribution": {
                    "arms_percent": derived.get("arms_lean_pct"),
                    "legs_percent": derived.get("legs_lean_pct"),
                    "trunk_percent": derived.get("trunk_lean_pct"),
                },
            },
            "muscle_balance": muscle_balance_array,
            "symmetry_indices": {
                "arm_symmetry_index": derived.get("arm_symmetry_index"),
                "leg_symmetry_index": derived.get("leg_symmetry_index"),
            },
            "supplemental": {
                "android_percent": d.get("android_percent"),
                "gynoid_percent": d.get("gynoid_percent"),
                "ag_ratio": d.get("ag_ratio"),
                "vat": {
                    "mass_lb": d.get("vat_mass_lb"),
                    "volume_in3": d.get("vat_volume_in3"),
                    "vat_index": derived.get("vat_index"),
                },
                "rmr_cal_per_day": d.get("rmr_cal_per_day"),
            },
            "bone_density": d.get("bone_density", {}),
        }
        write_or_append_json(os.path.join(outdir, "overall.json"), overall_json)

        # ---- summary.md -----------------------------------------------------
        md_text = make_markdown(measured_date, d, derived, total_mass)
        append_markdown(os.path.join(outdir, "summary.md"), md_text)

        # Single-file mode prints its detailed success info in the caller.
        if batch_mode:
            print(f" ✅ {pdf_file.name}: Body fat {d.get('body_fat_percent')}%, FFMI {derived.get('ffmi')}")
        return True

    except Exception as e:
        # Top-level boundary for one file: report and let the caller carry on
        # (in batch mode the remaining files are still processed).
        if batch_mode:
            print(f" ❌ Error processing {pdf_path}: {e}", file=sys.stderr)
        else:
            print(f"❌ Error reading PDF: {e}", file=sys.stderr)
            print("This tool is specifically designed for BodySpec PDF reports.", file=sys.stderr)
        return False
|
||
|
||
def make_markdown(measured_date, d, derived, total_mass):
    """Build the markdown summary section for one scan.

    Args:
        measured_date: Scan date shown in the heading.
        d: Parsed scan data (body composition fields and the ``regional`` table).
        derived: Derived metrics; ``height_in`` and ``bmi`` are required, the
            other entries are optional.
        total_mass: Total body mass in pounds.

    Returns:
        str: Markdown text. Values that are missing (None) are left out
        instead of being printed as "None".
    """
    lines = [
        f"# DEXA Summary — {measured_date}",
        "",
        f"- Height: {derived['height_in']} in",
        f"- Weight: {round(total_mass, 1)} lb",
    ]

    if d.get("body_fat_percent") is not None and d.get("fat_mass_lb") is not None:
        lines.append(f"- Body fat: {d['body_fat_percent']}% ({d['fat_mass_lb']} lb)")
    if d.get("lean_soft_tissue_lb") is not None:
        lines.append(f"- Lean soft tissue: {d['lean_soft_tissue_lb']} lb")
    if d.get("bmc_lb") is not None:
        lines.append(f"- Bone mineral content: {d['bmc_lb']} lb")
    if derived.get("fat_free_mass_lb") is not None:
        lines.append(f"- Fat‑free mass: {derived['fat_free_mass_lb']} lb")
    lines.append(f"- BMI: {derived['bmi']}")

    # One line for the height-normalised indices, listing only those available.
    index_labels = (("FFMI", "ffmi"), ("FMI", "fmi"), ("Lean Soft Tissue Index", "lsti"))
    index_parts = [
        f"{label}: {derived[key]}"
        for label, key in index_labels
        if derived.get(key) is not None
    ]
    if index_parts:
        lines.append("- " + "; ".join(index_parts))

    if derived.get("alm_lb") is not None:
        lines.append(f"- Appendicular Lean Mass: {derived['alm_lb']} lb; Skeletal Muscle Index: {derived.get('smi')}")
    if d.get("android_percent") is not None and d.get("gynoid_percent") is not None and d.get("ag_ratio") is not None:
        lines.append(f"- Android: {d['android_percent']}%; Gynoid: {d['gynoid_percent']}%; A/G ratio: {d['ag_ratio']}")
    if d.get("vat_mass_lb") is not None and d.get("vat_volume_in3") is not None:
        lines.append(f"- VAT: {d['vat_mass_lb']} lb ({d['vat_volume_in3']} in³)")
    if d.get("rmr_cal_per_day") is not None:
        lines.append(f"- RMR: {d['rmr_cal_per_day']} cal/day")

    lines.append("")
    lines.append("## Regional")
    for name, r in d.get("regional", {}).items():
        lines.append(
            f"- {name}: {r['fat_percent']}% fat; {r['total_mass_lb']} lb total; "
            f"{r['fat_tissue_lb']} lb fat; {r['lean_tissue_lb']} lb lean; {r['bmc_lb']} lb BMC"
        )
    return "\n".join(lines)
|
||
|
||
def _collect_batch_pdfs(batch_dir):
    """Return the PDF files directly inside ``batch_dir``, sorted; the extension check ignores case."""
    return sorted(
        entry for entry in batch_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )


def _run_batch(args):
    """Process every PDF in ``args.batch``; exits with status 1 if the directory is unusable or any file fails."""
    batch_dir = Path(args.batch)
    if not batch_dir.exists():
        print(f"❌ Error: Directory not found: {args.batch}", file=sys.stderr)
        sys.exit(1)
    if not batch_dir.is_dir():
        print(f"❌ Error: Not a directory: {args.batch}", file=sys.stderr)
        sys.exit(1)

    # Find all PDF files in directory
    pdf_files = _collect_batch_pdfs(batch_dir)
    if not pdf_files:
        print(f"❌ Error: No PDF files found in: {args.batch}", file=sys.stderr)
        sys.exit(1)

    # Get already-processed dates (ignored entirely with --force)
    processed_dates = set()
    if not args.force:
        processed_dates = set(get_processed_dates(args.outdir))
        if processed_dates:
            print(f"📋 Found {len(processed_dates)} already-processed scan(s) in {args.outdir}")

    print(f"📦 Batch mode: Found {len(pdf_files)} PDF file(s) in {args.batch}")
    print(f"📂 Output directory: {args.outdir}\n")

    success_count = 0
    fail_count = 0
    skip_count = 0

    for pdf_file in pdf_files:
        measured_date = None
        if not args.force:
            # Quick check: try to extract the date and see if already processed
            try:
                measured_date_raw = parse_dexa_pdf(str(pdf_file)).get("measured_date")
                if measured_date_raw:
                    measured_date = convert_date_to_iso(measured_date_raw)
            except Exception:
                measured_date = None  # If we can't extract date, try to process anyway

            if measured_date is not None and measured_date in processed_dates:
                print(f"\n⏭️ Skipping: {pdf_file.name} (date {measured_date} already processed)")
                skip_count += 1
                continue

        if process_single_pdf(str(pdf_file), args.height_in, args.weight_lb, args.outdir, batch_mode=True):
            success_count += 1
            if measured_date is not None:
                # Remember the date so a second report for the same scan date
                # in this batch is skipped like it would be on the next run.
                processed_dates.add(measured_date)
        else:
            fail_count += 1

    print(f"\n{'='*60}")
    print(f"✅ Batch complete: {success_count} succeeded, {skip_count} skipped, {fail_count} failed")
    print(f"📁 Results saved to: {args.outdir}")

    # Files are only ever skipped when --force is absent.
    if skip_count > 0:
        print(" 💡 Tip: Use --force to reprocess skipped scans")

    if fail_count > 0:
        sys.exit(1)


def _print_single_summary(outdir):
    """Print the single-file success report, using the newest entry of overall.json for the headline values."""
    try:
        # Read the latest entry from overall.json to get the summary data
        json_path = os.path.join(outdir, "overall.json")
        if os.path.exists(json_path):
            with open(json_path, 'r', encoding="utf-8") as f:
                data = json.load(f)
            latest = data[-1] if isinstance(data, list) and data else data
            measured_date = latest.get("measured_date", "Unknown")
            body_fat = latest.get("composition", {}).get("body_fat_percent", "N/A")
            ffmi = latest.get("composition", {}).get("derived_indices", {}).get("ffmi", "N/A")
        else:
            measured_date = body_fat = ffmi = "N/A"
    except Exception:
        # The files were written successfully; a failure here only affects
        # the on-screen summary.
        measured_date = body_fat = ffmi = "N/A"

    print(f"\n✅ Success! Wrote files to: {outdir}")
    print(" 📁 Files created:")
    print(" - overall.csv (time-series data)")
    print(" - regional.csv (body composition by region)")
    print(" - muscle_balance.csv (left/right symmetry)")
    print(" - overall.json (structured data)")
    print(" - summary.md (readable report)")
    print(f"\n 📈 Scan date: {measured_date}")
    print(f" 💪 Body fat: {body_fat}%")
    print(f" 🏋️ FFMI: {ffmi}")


def main():
    """Command-line entry point: parse arguments, validate them, then run single-file or batch mode.

    Exits with status 1 on invalid arguments, an unusable output directory,
    or any processing failure (argparse itself exits with status 2 on usage
    errors).
    """
    ap = argparse.ArgumentParser(
        description="BodySpec Insights - Extract and analyze body composition data from BodySpec DEXA scan PDFs",
        epilog="Examples:\n"
               " Single: python dexa_extract.py scan.pdf --height-in 74 --outdir ./data/results\n"
               " Batch: python dexa_extract.py --batch data/pdfs --height-in 74 --outdir ./data/results",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    ap.add_argument("pdf", nargs="?", help="Path to BodySpec DEXA report PDF (not used with --batch)")
    ap.add_argument("--batch", metavar="DIR", help="Process all PDFs in directory (skips already-processed dates)")
    ap.add_argument("--height-in", type=float, required=True, help="Height in inches (e.g., 6'2\" = 74)")
    ap.add_argument("--weight-lb", type=float, help="Body weight in lbs (optional; used if DEXA total mass missing)")
    ap.add_argument("--outdir", default="dexa_out", help="Output directory (default: dexa_out)")
    ap.add_argument("--force", action="store_true", help="Reprocess all files, even if already in output")
    args = ap.parse_args()

    # Check that either pdf or --batch is provided
    if not args.pdf and not args.batch:
        print("❌ Error: Must provide either a PDF file or --batch directory", file=sys.stderr)
        ap.print_help()
        sys.exit(1)
    if args.pdf and args.batch:
        print("❌ Error: Cannot use both PDF file and --batch. Choose one.", file=sys.stderr)
        sys.exit(1)

    # Validate height
    if args.height_in < 36 or args.height_in > 96:
        print(f"❌ Error: Height seems unrealistic: {args.height_in} inches (expected 36-96 inches / 3'-8')", file=sys.stderr)
        sys.exit(1)

    # Validate weight if provided
    if args.weight_lb is not None and (args.weight_lb < 50 or args.weight_lb > 500):
        print(f"❌ Error: Weight seems unrealistic: {args.weight_lb} lbs (expected 50-500 lbs)", file=sys.stderr)
        sys.exit(1)

    # Create output directory
    try:
        ensure_outdir(args.outdir)
    except PermissionError:
        print(f"❌ Error: Cannot create output directory: {args.outdir} (permission denied)", file=sys.stderr)
        sys.exit(1)
    except OSError as exc:
        # e.g. the path exists but is a regular file
        print(f"❌ Error: Cannot create output directory: {args.outdir} ({exc})", file=sys.stderr)
        sys.exit(1)

    # Batch mode
    if args.batch:
        _run_batch(args)
        return

    # Single file mode
    print(f"📄 Reading PDF: {args.pdf}")

    # Use the shared processing function
    success = process_single_pdf(args.pdf, args.height_in, args.weight_lb, args.outdir, batch_mode=False)
    if not success:
        sys.exit(1)

    _print_single_summary(args.outdir)
|
||
|
||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|