#!/usr/bin/env python3
"""
BodySpec Insights - Body composition analytics for BodySpec DEXA scan PDFs

Extract measurements from BodySpec DEXA reports, compute 30+ derived metrics,
and output structured data for progress tracking.

Usage:
    python dexa_extract.py /path/to/bodyspec-report.pdf --height-in 74 --weight-lb 212 --outdir ./data/results

Note: This script is specifically designed for BodySpec PDF reports.

Requires:
    pip install pdfplumber pandas
"""

import argparse
import json
import math
import os
import re
import sys
from datetime import datetime
from pathlib import Path

try:
    import pdfplumber
except ImportError:
    # Deferred failure: the parsing/metric helpers below do not need the PDF
    # backend, so keep the module importable and let read_pdf_text() report
    # the missing dependency with an actionable message.
    pdfplumber = None
import pandas as pd


class ValidationError(Exception):
    """Custom exception for validation errors"""
    pass


def read_pdf_text(pdf_path):
    """Return the text of every page of *pdf_path*, joined with newlines.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string, so the number of joined segments equals the page count.

    Raises:
        ImportError: if pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError(
            "pdfplumber is required to read PDF reports; "
            "install it with: pip install pdfplumber"
        )
    with pdfplumber.open(pdf_path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages_text)


def find_one(pattern, text, cast=float, flags=re.IGNORECASE):
    """Search *text* for *pattern* and return its first capture group.

    Thousands separators (``,``) and surrounding whitespace are removed from
    the captured string before *cast* is applied.

    Args:
        pattern: regular expression with at least one capture group.
        text: text to search.
        cast: callable applied to the cleaned capture; a falsy value returns
            the cleaned string unchanged.
        flags: ``re`` flags (case-insensitive by default).

    Returns:
        The converted value, or None when the pattern does not match.
    """
    m = re.search(pattern, text, flags)
    if not m:
        return None
    val = m.group(1).replace(",", "").strip()
    return cast(val) if cast else val


def convert_date_to_iso(date_str):
    """Convert MM/DD/YYYY to YYYY-MM-DD.

    Returns None for empty input and returns the input unchanged when it is
    not a valid MM/DD/YYYY date.
    """
    if not date_str:
        return None
    try:
        dt = datetime.strptime(date_str, "%m/%d/%Y")
    except (ValueError, TypeError):
        # Unparseable input is passed through rather than dropped.
        return date_str
    return dt.strftime("%Y-%m-%d")


def inches_to_ft_in(inches):
    """Convert inches to feet'inches" format (fractional inches truncated)."""
    if inches is None:
        return None
    feet = int(inches // 12)
    remaining_inches = int(inches % 12)
    return f"{feet}'{remaining_inches}\""


def parse_regional_table(text):
    """Parse the regional composition rows out of the report text.

    Returns a dict keyed by region name (only regions whose row was found),
    each value holding fat_percent, total_mass_lb, fat_tissue_lb,
    lean_tissue_lb and bmc_lb as floats.
    """
    regions = ["Arms", "Legs", "Trunk", "Android", "Gynoid", "Total"]
    out = {}
    for r in regions:
        # Example line: Arms 22.1% 27.4 6.0 20.2 1.1
        pattern = rf"{r}\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
        m = re.search(pattern, text)
        if m:
            out[r] = {
                "fat_percent": float(m.group(1)),
                "total_mass_lb": float(m.group(2)),
                "fat_tissue_lb": float(m.group(3)),
                "lean_tissue_lb": float(m.group(4)),
                "bmc_lb": float(m.group(5)),
            }
    return out


def parse_muscle_balance(text):
    """Parse the left/right muscle balance rows out of the report text.

    Returns a dict keyed by row name (only rows that were found), each value
    holding fat_percent, total_mass_lb, fat_mass_lb, lean_mass_lb and bmc_lb
    as floats.
    """
    names = ["Arms Total", "Right Arm", "Left Arm", "Legs Total", "Right Leg", "Left Leg"]
    out = {}
    for n in names:
        # Example: Right Arm 20.4 13.7 2.8 10.3 0.6
        pattern = rf"{n}\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
        m = re.search(pattern, text)
        if m:
            out[n] = {
                "fat_percent": float(m.group(1)),
                "total_mass_lb": float(m.group(2)),
                "fat_mass_lb": float(m.group(3)),
                "lean_mass_lb": float(m.group(4)),
                "bmc_lb": float(m.group(5)),
            }
    return out


def parse_bone_density_total(text):
    """Parse the "Total" bone density row (BMD, T-score, Z-score).

    A row that starts its line with ``Total`` is preferred, because an
    unanchored search also matches the tail of muscle-balance rows such as
    ``Arms Total 20.4 27.4 6.0 ...``. If no line-initial row exists, the
    first match anywhere in the text is used.

    Returns a dict with total_bmd_g_per_cm2, young_adult_t_score and
    age_matched_z_score, or an empty dict when no row is found.
    """
    # Example: Total 1.280 0.8 0.8
    row = r"Total\s+([\d\.]+)\s+([-\d\.]+)\s+([-\d\.]+)"
    m = re.search(r"^[ \t]*" + row, text, re.MULTILINE) or re.search(row, text)
    if m:
        return {
            "total_bmd_g_per_cm2": float(m.group(1)),
            "young_adult_t_score": float(m.group(2)),
            "age_matched_z_score": float(m.group(3)),
        }
    return {}


def parse_dexa_pdf(pdf_path):
    """Extract all supported measurements from a BodySpec report PDF.

    Returns a dict of scalar measurements (None when a value was not found)
    plus the nested "regional", "muscle_balance" and "bone_density" tables.
    """
    text = read_pdf_text(pdf_path)

    data = {}
    data["measured_date"] = find_one(r"Measured Date\s+([\d/]+)", text, cast=str)

    # First try to extract from SUMMARY RESULTS table (more reliable)
    # Pattern: 10/6/2025 27.8% 211.6 58.8 145.4 7.4
    summary_pattern = r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
    summary_match = re.search(summary_pattern, text)

    if summary_match:
        data["body_fat_percent"] = float(summary_match.group(2))
        data["total_mass_lb"] = float(summary_match.group(3))
        data["fat_mass_lb"] = float(summary_match.group(4))
        data["lean_soft_tissue_lb"] = float(summary_match.group(5))
        data["bmc_lb"] = float(summary_match.group(6))
    else:
        # Fallback to individual patterns
        data["body_fat_percent"] = find_one(r"Total Body Fat %\s+([\d\.]+)", text)
        data["total_mass_lb"] = find_one(r"Total Mass.*?\(lbs\)\s+([\d\.]+)", text)
        data["fat_mass_lb"] = find_one(r"Fat Tissue \(lbs\)\s+([\d\.]+)", text)
        data["lean_soft_tissue_lb"] = find_one(r"Lean Tissue \(lbs\)\s+([\d\.]+)", text)
        data["bmc_lb"] = find_one(r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)", text)

    # Supplemental
    data["android_percent"] = find_one(r"Android.*?([\d\.]+)%", text)
    data["gynoid_percent"] = find_one(r"Gynoid.*?([\d\.]+)%", text)
    data["rmr_cal_per_day"] = find_one(
        r"([\d,]+)\s*cal/day", text, cast=lambda s: int(s.replace(",", ""))
    )

    # A/G Ratio appears after RMR, Android%, Gynoid% on same line: "1,778 cal/day 36.5% 27.8% 1.31"
    ag_match = re.search(r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)", text)
    if ag_match:
        data["ag_ratio"] = float(ag_match.group(3))
    else:
        data["ag_ratio"] = find_one(r"A/G Ratio\s+([\d\.]+)", text)

    data["vat_mass_lb"] = find_one(r"Mass \(lbs\)\s+([\d\.]+)", text)
    data["vat_volume_in3"] = find_one(r"Volume \(in3\)\s+([\d\.]+)", text)

    # Tables
    data["regional"] = parse_regional_table(text)
    data["muscle_balance"] = parse_muscle_balance(text)
    data["bone_density"] = parse_bone_density_total(text)

    return data


def compute_derived(d, height_in, weight_lb=None):
    """Compute derived body-composition metrics from parsed report data.

    Args:
        d: dict as produced by parse_dexa_pdf (missing keys are tolerated).
        height_in: height in inches; used as the denominator of all
            height-normalized indices.
        weight_lb: fallback body weight when *d* has no total mass.

    Returns:
        ``(total_mass, derived)`` where *total_mass* is the mass actually
        used and *derived* is a dict of metrics (None where inputs were
        missing).

    Raises:
        ValueError: if neither the report nor *weight_lb* supplies a mass.
    """
    # Prefer DEXA total mass if available
    total_mass = d.get("total_mass_lb") or weight_lb
    if total_mass is None:
        raise ValueError("Total mass is missing; pass --weight-lb if the PDF lacks it.")

    fm = d.get("fat_mass_lb")
    lst = d.get("lean_soft_tissue_lb")
    bmc = d.get("bmc_lb")
    bf_pct = d.get("body_fat_percent")

    ffm = None
    if fm is not None:
        ffm = total_mass - fm
    elif lst is not None and bmc is not None:
        ffm = lst + bmc

    def idx(value_lb):
        # Same scaling as BMI: 703 * pounds / inches^2.
        return round(703.0 * value_lb / (height_in ** 2), 2)

    derived = {
        "height_in": height_in,
        "height_ft_in": inches_to_ft_in(height_in),
        "weight_input_lb": weight_lb,
        "bmi": round(703.0 * total_mass / (height_in ** 2), 1),
        "fat_free_mass_lb": round(ffm, 1) if ffm is not None else None,
        "ffmi": idx(ffm) if ffm is not None else None,
        "fmi": idx(fm) if fm is not None else None,
        "lsti": idx(lst) if lst is not None else None,
        "alm_lb": None,
        "smi": None,
    }

    # Lean mass percentage (complement of body fat %)
    derived["lean_mass_percent"] = round(100 - bf_pct, 1) if bf_pct is not None else None

    regional = d.get("regional", {})
    arms = regional.get("Arms", {})
    legs = regional.get("Legs", {})
    trunk = regional.get("Trunk", {})

    # ALM from regional lean masses
    arms_lean = arms.get("lean_tissue_lb")
    legs_lean = legs.get("lean_tissue_lb")
    trunk_lean = trunk.get("lean_tissue_lb")
    if arms_lean is not None and legs_lean is not None:
        alm = arms_lean + legs_lean
        derived["alm_lb"] = round(alm, 1)
        derived["smi"] = idx(alm)

    # Regional lean mass distribution
    if lst is not None and arms_lean is not None and legs_lean is not None and trunk_lean is not None:
        derived["arms_lean_pct"] = round(100 * arms_lean / lst, 1)
        derived["legs_lean_pct"] = round(100 * legs_lean / lst, 1)
        derived["trunk_lean_pct"] = round(100 * trunk_lean / lst, 1)
    else:
        derived["arms_lean_pct"] = None
        derived["legs_lean_pct"] = None
        derived["trunk_lean_pct"] = None

    # Trunk-to-limb fat ratio (health risk indicator)
    trunk_fat = trunk.get("fat_tissue_lb")
    arms_fat = arms.get("fat_tissue_lb")
    legs_fat = legs.get("fat_tissue_lb")
    derived["trunk_to_limb_fat_ratio"] = None
    if trunk_fat is not None and arms_fat is not None and legs_fat is not None:
        limb_fat = arms_fat + legs_fat
        if limb_fat > 0:
            derived["trunk_to_limb_fat_ratio"] = round(trunk_fat / limb_fat, 2)

    # Limb symmetry indices (balance indicators)
    mb = d.get("muscle_balance", {})
    right_arm = mb.get("Right Arm", {}).get("lean_mass_lb")
    left_arm = mb.get("Left Arm", {}).get("lean_mass_lb")
    right_leg = mb.get("Right Leg", {}).get("lean_mass_lb")
    left_leg = mb.get("Left Leg", {}).get("lean_mass_lb")

    # The index is the right side's share of the combined lean mass, in
    # percent: 50 = balanced, >50 = right side heavier, <50 = left heavier.
    if right_arm is not None and left_arm is not None and right_arm + left_arm > 0:
        derived["arm_symmetry_index"] = round(100 * right_arm / (right_arm + left_arm), 1)
    else:
        derived["arm_symmetry_index"] = None

    if right_leg is not None and left_leg is not None and right_leg + left_leg > 0:
        derived["leg_symmetry_index"] = round(100 * right_leg / (right_leg + left_leg), 1)
    else:
        derived["leg_symmetry_index"] = None

    # VAT Index (normalized by height squared, like BMI)
    vat_mass = d.get("vat_mass_lb")
    derived["vat_index"] = idx(vat_mass) if vat_mass is not None else None

    # Bone Mineral Density Index (BMC normalized by height squared)
    derived["bmdi"] = idx(bmc) if bmc is not None else None

    # Adjusted Body Weight (used in nutrition/health calculations)
    # ABW = IBW + 0.4 * (actual weight - IBW), where IBW differs by sex
    # For simplicity, using a unisex approximation: IBW ≈ height_in * 2.3 - 100 (rough estimate)
    # NOTE(review): the estimate is compared with and combined with a mass in
    # pounds, but 2.3 per inch resembles kilogram-based IBW formulas (74 in
    # gives 70.2) -- confirm the intended unit before relying on this value.
    ibw_estimate = height_in * 2.3 - 100
    if total_mass > ibw_estimate:
        derived["adjusted_body_weight_lb"] = round(ibw_estimate + 0.4 * (total_mass - ibw_estimate), 1)
    else:
        derived["adjusted_body_weight_lb"] = round(total_mass, 1)

    return total_mass, derived


def ensure_outdir(outdir):
    """Create *outdir* (and parents) if it does not already exist."""
    os.makedirs(outdir, exist_ok=True)


def _append_table_csv(path, rows, columns):
    """Append *rows* (list of dicts) to the CSV at *path*, creating it with a header.

    Passing *columns* explicitly fixes the column order and guarantees that a
    newly created file gets a header row even when *rows* is empty.
    """
    frame = pd.DataFrame(rows, columns=columns)
    if os.path.exists(path):
        frame.to_csv(path, mode="a", header=False, index=False)
    else:
        frame.to_csv(path, index=False)


def write_or_append_csv(path, row_dict, columns):
    """Write one row to the CSV at *path*, creating the file with a header if needed.

    Only the keys listed in *columns* are written, in that order; keys absent
    from *row_dict* are written as empty cells.
    """
    _append_table_csv(path, [{k: row_dict.get(k) for k in columns}], columns)


def write_or_append_json(path, obj):
    """Append *obj* to the JSON array stored at *path*.

    A missing or unparseable file starts a new array; a file holding a single
    object is converted to a one-element array first.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                data = []
    else:
        data = []
    if isinstance(data, dict):
        # convert to list of entries if previous file was a single dict
        data = [data]
    data.append(obj)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def append_markdown(path, md_text):
    """Append *md_text* (stripped, followed by a blank line) to the file at *path*."""
    mode = "a" if os.path.exists(path) else "w"
    # The summary contains non-ASCII characters (—, ‑, ³), so do not depend
    # on the platform's default encoding.
    with open(path, mode, encoding="utf-8") as f:
        f.write(md_text.strip() + "\n\n")


def make_markdown(measured_date, d, derived, total_mass):
    """Build the human-readable markdown summary for one scan.

    Lines for optional measurements are emitted only when the underlying
    values are present in *d* / *derived*.
    """
    lines = []
    lines.append(f"# DEXA Summary — {measured_date}")
    lines.append("")
    lines.append(f"- Height: {derived['height_in']} in")
    lines.append(f"- Weight: {round(total_mass, 1)} lb")
    if d.get("body_fat_percent") is not None and d.get("fat_mass_lb") is not None:
        lines.append(f"- Body fat: {d['body_fat_percent']}% ({d['fat_mass_lb']} lb)")
    if d.get("lean_soft_tissue_lb") is not None:
        lines.append(f"- Lean soft tissue: {d['lean_soft_tissue_lb']} lb")
    if d.get("bmc_lb") is not None:
        lines.append(f"- Bone mineral content: {d['bmc_lb']} lb")
    lines.append(f"- Fat‑free mass: {derived.get('fat_free_mass_lb')}")
    lines.append(f"- BMI: {derived['bmi']}")
    lines.append(f"- FFMI: {derived.get('ffmi')}; FMI: {derived.get('fmi')}; Lean Soft Tissue Index: {derived.get('lsti')}")
    if derived.get("alm_lb") is not None:
        lines.append(f"- Appendicular Lean Mass: {derived['alm_lb']} lb; Skeletal Muscle Index: {derived['smi']}")
    if d.get("android_percent") is not None and d.get("gynoid_percent") is not None and d.get("ag_ratio") is not None:
        lines.append(f"- Android: {d['android_percent']}%; Gynoid: {d['gynoid_percent']}%; A/G ratio: {d['ag_ratio']}")
    if d.get("vat_mass_lb") is not None and d.get("vat_volume_in3") is not None:
        lines.append(f"- VAT: {d['vat_mass_lb']} lb ({d['vat_volume_in3']} in³)")
    if d.get("rmr_cal_per_day") is not None:
        lines.append(f"- RMR: {d['rmr_cal_per_day']} cal/day")
    lines.append("")
    lines.append("## Regional")
    for name, r in d.get("regional", {}).items():
        lines.append(f"- {name}: {r['fat_percent']}% fat; {r['total_mass_lb']} lb total; {r['fat_tissue_lb']} lb fat; {r['lean_tissue_lb']} lb lean; {r['bmc_lb']} lb BMC")
    return "\n".join(lines)


def main():
    """CLI entry point: parse a report, compute metrics, and append output files.

    Exits with status 1 (message on stderr) for invalid arguments, an
    unreadable PDF, or missing total mass.
    """
    ap = argparse.ArgumentParser(
        description="BodySpec Insights - Extract and analyze body composition data from BodySpec DEXA scan PDFs",
        epilog="Example: python dexa_extract.py scan.pdf --height-in 74 --weight-lb 212 --outdir ./data/results"
    )
    ap.add_argument("pdf", help="Path to BodySpec DEXA report PDF")
    ap.add_argument("--height-in", type=float, required=True, help="Height in inches (e.g., 6'2\" = 74)")
    ap.add_argument("--weight-lb", type=float, help="Body weight in lbs (optional; used if DEXA total mass missing)")
    ap.add_argument("--outdir", default="dexa_out", help="Output directory (default: dexa_out)")
    args = ap.parse_args()

    # Validate PDF file exists
    pdf_file = Path(args.pdf)
    if not pdf_file.exists():
        print(f"❌ Error: PDF file not found: {args.pdf}", file=sys.stderr)
        sys.exit(1)
    if not pdf_file.is_file():
        print(f"❌ Error: Path is not a file: {args.pdf}", file=sys.stderr)
        sys.exit(1)
    if pdf_file.suffix.lower() != '.pdf':
        print(f"❌ Error: File is not a PDF: {args.pdf}", file=sys.stderr)
        sys.exit(1)

    # Validate height
    if args.height_in < 36 or args.height_in > 96:
        print(f"❌ Error: Height seems unrealistic: {args.height_in} inches (expected 36-96 inches / 3'-8')", file=sys.stderr)
        sys.exit(1)

    # Validate weight if provided
    if args.weight_lb is not None and (args.weight_lb < 50 or args.weight_lb > 500):
        print(f"❌ Error: Weight seems unrealistic: {args.weight_lb} lbs (expected 50-500 lbs)", file=sys.stderr)
        sys.exit(1)

    try:
        ensure_outdir(args.outdir)
    except PermissionError:
        print(f"❌ Error: Cannot create output directory: {args.outdir} (permission denied)", file=sys.stderr)
        sys.exit(1)

    print(f"📄 Reading PDF: {args.pdf}")
    try:
        d = parse_dexa_pdf(args.pdf)
    except Exception as e:
        # Top-level boundary: any parsing failure becomes a CLI error.
        print(f"❌ Error reading PDF: {e}", file=sys.stderr)
        print("This tool is specifically designed for BodySpec PDF reports.", file=sys.stderr)
        sys.exit(1)

    # Check if critical data was extracted
    if d.get("body_fat_percent") is None or d.get("total_mass_lb") is None:
        print("⚠️ Warning: Missing critical data from PDF. This may not be a BodySpec report.", file=sys.stderr)
        if d.get("body_fat_percent") is None:
            print(" - Body Fat % not found", file=sys.stderr)
        if d.get("total_mass_lb") is None:
            print(" - Total Mass not found", file=sys.stderr)

    print("📊 Computing derived metrics...")
    measured_date_raw = d.get("measured_date") or datetime.now().strftime("%m/%d/%Y")
    measured_date = convert_date_to_iso(measured_date_raw)
    try:
        total_mass, derived = compute_derived(d, height_in=args.height_in, weight_lb=args.weight_lb)
    except ValueError as e:
        # Raised when neither the PDF nor --weight-lb supplies a total mass.
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Overall CSV row
    overall_cols = [
        "MeasuredDate","Height_in","Height_ft_in","Weight_lb_Input","DEXA_TotalMass_lb","BodyFat_percent",
        "LeanMass_percent","FatMass_lb","LeanSoftTissue_lb","BoneMineralContent_lb","FatFreeMass_lb",
        "BMI","FFMI","FMI","LST_Index","ALM_lb","SMI","VAT_Mass_lb","VAT_Volume_in3","VAT_Index",
        "BMDI","Android_percent","Gynoid_percent","AG_Ratio","Trunk_to_Limb_Fat_Ratio",
        "Arms_Lean_pct","Legs_Lean_pct","Trunk_Lean_pct","Arm_Symmetry_Index","Leg_Symmetry_Index",
        "Adjusted_Body_Weight_lb","RMR_cal_per_day"
    ]
    overall_row = {
        "MeasuredDate": measured_date,
        "Height_in": derived["height_in"],
        "Height_ft_in": derived["height_ft_in"],
        "Weight_lb_Input": derived["weight_input_lb"],
        "DEXA_TotalMass_lb": round(total_mass, 1),
        "BodyFat_percent": d.get("body_fat_percent"),
        "LeanMass_percent": derived.get("lean_mass_percent"),
        "FatMass_lb": d.get("fat_mass_lb"),
        "LeanSoftTissue_lb": d.get("lean_soft_tissue_lb"),
        "BoneMineralContent_lb": d.get("bmc_lb"),
        "FatFreeMass_lb": derived.get("fat_free_mass_lb"),
        "BMI": derived["bmi"],
        "FFMI": derived.get("ffmi"),
        "FMI": derived.get("fmi"),
        "LST_Index": derived.get("lsti"),
        "ALM_lb": derived.get("alm_lb"),
        "SMI": derived.get("smi"),
        "VAT_Mass_lb": d.get("vat_mass_lb"),
        "VAT_Volume_in3": d.get("vat_volume_in3"),
        "VAT_Index": derived.get("vat_index"),
        "BMDI": derived.get("bmdi"),
        "Android_percent": d.get("android_percent"),
        "Gynoid_percent": d.get("gynoid_percent"),
        "AG_Ratio": d.get("ag_ratio"),
        "Trunk_to_Limb_Fat_Ratio": derived.get("trunk_to_limb_fat_ratio"),
        "Arms_Lean_pct": derived.get("arms_lean_pct"),
        "Legs_Lean_pct": derived.get("legs_lean_pct"),
        "Trunk_Lean_pct": derived.get("trunk_lean_pct"),
        "Arm_Symmetry_Index": derived.get("arm_symmetry_index"),
        "Leg_Symmetry_Index": derived.get("leg_symmetry_index"),
        "Adjusted_Body_Weight_lb": derived.get("adjusted_body_weight_lb"),
        "RMR_cal_per_day": d.get("rmr_cal_per_day"),
    }
    write_or_append_csv(os.path.join(args.outdir, "overall.csv"), overall_row, overall_cols)

    # Regional table
    regional_cols = ["Region","FatPercent","TotalMass_lb","FatTissue_lb","LeanTissue_lb","BMC_lb"]
    reg_rows = [
        {
            "Region": name,
            "FatPercent": r["fat_percent"],
            "TotalMass_lb": r["total_mass_lb"],
            "FatTissue_lb": r["fat_tissue_lb"],
            "LeanTissue_lb": r["lean_tissue_lb"],
            "BMC_lb": r["bmc_lb"],
        }
        for name, r in d.get("regional", {}).items()
    ]
    _append_table_csv(os.path.join(args.outdir, "regional.csv"), reg_rows, regional_cols)

    # Muscle balance
    mb_cols = ["Region","FatPercent","TotalMass_lb","FatMass_lb","LeanMass_lb","BMC_lb"]
    mb_rows = [
        {
            "Region": name,
            "FatPercent": r["fat_percent"],
            "TotalMass_lb": r["total_mass_lb"],
            "FatMass_lb": r["fat_mass_lb"],
            "LeanMass_lb": r["lean_mass_lb"],
            "BMC_lb": r["bmc_lb"],
        }
        for name, r in d.get("muscle_balance", {}).items()
    ]
    _append_table_csv(os.path.join(args.outdir, "muscle_balance.csv"), mb_rows, mb_cols)

    # JSON (overall structured object)
    # Convert regional and muscle_balance dicts to arrays
    regional_array = [
        {"region": name, **data}
        for name, data in d.get("regional", {}).items()
    ]
    muscle_balance_array = [
        {"region": name, **data}
        for name, data in d.get("muscle_balance", {}).items()
    ]

    overall_json = {
        "measured_date": measured_date,
        "anthropometrics": {
            "height_in": derived["height_in"],
            "height_ft_in": derived["height_ft_in"],
            "weight_input_lb": derived["weight_input_lb"],
            "dexa_total_mass_lb": round(total_mass, 1),
            "adjusted_body_weight_lb": derived.get("adjusted_body_weight_lb"),
            "bmi": derived["bmi"]
        },
        "composition": {
            "body_fat_percent": d.get("body_fat_percent"),
            "lean_mass_percent": derived.get("lean_mass_percent"),
            "fat_mass_lb": d.get("fat_mass_lb"),
            "lean_soft_tissue_lb": d.get("lean_soft_tissue_lb"),
            "bone_mineral_content_lb": d.get("bmc_lb"),
            "fat_free_mass_lb": derived.get("fat_free_mass_lb"),
            "derived_indices": {
                "ffmi": derived.get("ffmi"),
                "fmi": derived.get("fmi"),
                "lsti": derived.get("lsti"),
                "alm_lb": derived.get("alm_lb"),
                "smi": derived.get("smi"),
                "bmdi": derived.get("bmdi")
            }
        },
        "regional": regional_array,
        "regional_analysis": {
            "trunk_to_limb_fat_ratio": derived.get("trunk_to_limb_fat_ratio"),
            "lean_mass_distribution": {
                "arms_percent": derived.get("arms_lean_pct"),
                "legs_percent": derived.get("legs_lean_pct"),
                "trunk_percent": derived.get("trunk_lean_pct")
            }
        },
        "muscle_balance": muscle_balance_array,
        "symmetry_indices": {
            "arm_symmetry_index": derived.get("arm_symmetry_index"),
            "leg_symmetry_index": derived.get("leg_symmetry_index")
        },
        "supplemental": {
            "android_percent": d.get("android_percent"),
            "gynoid_percent": d.get("gynoid_percent"),
            "ag_ratio": d.get("ag_ratio"),
            "vat": {
                "mass_lb": d.get("vat_mass_lb"),
                "volume_in3": d.get("vat_volume_in3"),
                "vat_index": derived.get("vat_index")
            },
            "rmr_cal_per_day": d.get("rmr_cal_per_day")
        },
        "bone_density": d.get("bone_density", {})
    }
    write_or_append_json(os.path.join(args.outdir, "overall.json"), overall_json)

    # Markdown summary (append)
    md_text = make_markdown(measured_date, d, derived, total_mass)
    append_markdown(os.path.join(args.outdir, "summary.md"), md_text)

    print(f"\n✅ Success! Wrote files to: {args.outdir}")
    print(" 📁 Files created:")
    print(" - overall.csv (time-series data)")
    print(" - regional.csv (body composition by region)")
    print(" - muscle_balance.csv (left/right symmetry)")
    print(" - overall.json (structured data)")
    print(" - summary.md (readable report)")
    print(f"\n 📈 Scan date: {measured_date}")
    print(f" 💪 Body fat: {d.get('body_fat_percent')}%")
    print(f" 🏋️ FFMI: {derived.get('ffmi')}")


if __name__ == "__main__":
    main()