#!/usr/bin/env python3
"""
BodySpec Insights - Body composition analytics for BodySpec DEXA scan PDFs

Extract measurements from BodySpec DEXA reports, compute 30+ derived metrics,
and output structured data for progress tracking.

Usage:
    python dexa_extract.py /path/to/bodyspec-report.pdf --height-in 74 --weight-lb 212 --outdir ./data/results

Note: This script is specifically designed for BodySpec PDF reports.

Requires: pip install pdfplumber pandas
"""

import argparse
import json
import math
import os
import re
from datetime import datetime

import pandas as pd

try:
    import pdfplumber
except ImportError:
    # Deferred failure: the text parsers and metric calculations do not need
    # the PDF backend, so the module stays importable without it.
    # read_pdf_text() raises a clear ImportError when it is actually needed.
    pdfplumber = None


# Column order of overall.csv (one row per processed report).
_OVERALL_COLUMNS = [
    "MeasuredDate", "Height_in", "Height_ft_in", "Weight_lb_Input", "DEXA_TotalMass_lb", "BodyFat_percent",
    "LeanMass_percent", "FatMass_lb", "LeanSoftTissue_lb", "BoneMineralContent_lb", "FatFreeMass_lb",
    "BMI", "FFMI", "FMI", "LST_Index", "ALM_lb", "SMI", "VAT_Mass_lb", "VAT_Volume_in3", "VAT_Index",
    "BMDI", "Android_percent", "Gynoid_percent", "AG_Ratio", "Trunk_to_Limb_Fat_Ratio",
    "Arms_Lean_pct", "Legs_Lean_pct", "Trunk_Lean_pct", "Arm_Symmetry_Index", "Leg_Symmetry_Index",
    "Adjusted_Body_Weight_lb", "RMR_cal_per_day",
]

# Column order of regional.csv and muscle_balance.csv.
_REGIONAL_COLUMNS = ["Region", "FatPercent", "TotalMass_lb", "FatTissue_lb", "LeanTissue_lb", "BMC_lb"]
_MUSCLE_BALANCE_COLUMNS = ["Region", "FatPercent", "TotalMass_lb", "FatMass_lb", "LeanMass_lb", "BMC_lb"]

# One captured numeric token, as used by the table-row patterns.
_NUM = r"([\d\.]+)"


def read_pdf_text(pdf_path):
    """Return the text of every page of the PDF, joined with newlines.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string.

    Raises:
        ImportError: if pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError(
            "pdfplumber is required to read PDF reports; install it with: pip install pdfplumber"
        )
    with pdfplumber.open(pdf_path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages_text)


def find_one(pattern, text, cast=float, flags=re.IGNORECASE):
    """Return group 1 of the first match of *pattern* in *text*, converted by *cast*.

    Commas are removed and surrounding whitespace is stripped before casting.
    Returns None when the pattern does not match, or when *cast* rejects the
    captured token with ValueError (e.g. a lone "."), so one malformed field
    does not abort the whole extraction. A falsy *cast* returns the cleaned
    string.
    """
    m = re.search(pattern, text, flags)
    if not m:
        return None
    val = m.group(1).replace(",", "").strip()
    if not cast:
        return val
    try:
        return cast(val)
    except ValueError:
        return None


def convert_date_to_iso(date_str):
    """Convert MM/DD/YYYY to YYYY-MM-DD.

    Returns None for empty input, and the input unchanged when it is not a
    valid MM/DD/YYYY date.
    """
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, "%m/%d/%Y").strftime("%Y-%m-%d")
    except ValueError:
        return date_str


def inches_to_ft_in(inches):
    """Convert inches to feet'inches" format (fractional inches are truncated)."""
    if inches is None:
        return None
    feet = int(inches // 12)
    remaining_inches = int(inches % 12)
    return f"{feet}'{remaining_inches}\""


def _parse_labeled_rows(text, labels, field_names, percent_first):
    """Parse ``<label> n n n n n`` table rows into ``{label: {field: float}}``.

    For each label the first matching row in *text* is used; labels without a
    match are omitted. When *percent_first* is true, the first number must be
    followed by a literal ``%``.
    """
    first = _NUM + ("%" if percent_first else "")
    numbers = r"\s+".join([first] + [_NUM] * (len(field_names) - 1))
    out = {}
    for label in labels:
        m = re.search(rf"{re.escape(label)}\s+{numbers}", text)
        if m:
            out[label] = {
                name: float(token) for name, token in zip(field_names, m.groups())
            }
    return out


def parse_regional_table(text):
    """Parse the regional composition rows (Arms, Legs, Trunk, Android, Gynoid, Total).

    Example line: ``Arms 22.1% 27.4 6.0 20.2 1.1``
    """
    regions = ["Arms", "Legs", "Trunk", "Android", "Gynoid", "Total"]
    fields = ["fat_percent", "total_mass_lb", "fat_tissue_lb", "lean_tissue_lb", "bmc_lb"]
    return _parse_labeled_rows(text, regions, fields, percent_first=True)


def parse_muscle_balance(text):
    """Parse the left/right muscle balance rows.

    Example line: ``Right Arm 20.4 13.7 2.8 10.3 0.6``
    """
    names = ["Arms Total", "Right Arm", "Left Arm", "Legs Total", "Right Leg", "Left Leg"]
    fields = ["fat_percent", "total_mass_lb", "fat_mass_lb", "lean_mass_lb", "bmc_lb"]
    return _parse_labeled_rows(text, names, fields, percent_first=False)


def parse_bone_density_total(text):
    """Parse the total bone density row, e.g. ``Total 1.280 0.8 0.8``.

    Returns a dict with BMD, T-score and Z-score, or ``{}`` when no row
    matches.
    """
    # The lookbehinds skip the muscle-balance rows "Arms Total ..." and
    # "Legs Total ...", which otherwise satisfy the same three-number shape.
    m = re.search(
        r"(?<!Arms\s)(?<!Legs\s)Total\s+([\d\.]+)\s+([-\d\.]+)\s+([-\d\.]+)", text
    )
    if m:
        return {
            "total_bmd_g_per_cm2": float(m.group(1)),
            "young_adult_t_score": float(m.group(2)),
            "age_matched_z_score": float(m.group(3)),
        }
    return {}


def parse_dexa_text(text):
    """Extract all supported measurements from the already-extracted report text.

    Returns a dict of scalar fields (None when not found) plus the nested
    ``regional``, ``muscle_balance`` and ``bone_density`` tables.
    """
    data = {}
    data["measured_date"] = find_one(r"Measured Date\s+([\d/]+)", text, cast=str)

    # First try to extract from SUMMARY RESULTS table (more reliable)
    # Pattern: 10/6/2025 27.8% 211.6 58.8 145.4 7.4
    summary_pattern = r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
    summary_rows = list(re.finditer(summary_pattern, text))
    # If several rows match, prefer the one dated like the Measured Date;
    # otherwise fall back to the first row.
    wanted_date = convert_date_to_iso(data["measured_date"])
    summary_match = next(
        (m for m in summary_rows if convert_date_to_iso(m.group(1)) == wanted_date),
        summary_rows[0] if summary_rows else None,
    )

    if summary_match:
        data["body_fat_percent"] = float(summary_match.group(2))
        data["total_mass_lb"] = float(summary_match.group(3))
        data["fat_mass_lb"] = float(summary_match.group(4))
        data["lean_soft_tissue_lb"] = float(summary_match.group(5))
        data["bmc_lb"] = float(summary_match.group(6))
    else:
        # Fallback to individual patterns
        data["body_fat_percent"] = find_one(r"Total Body Fat %\s+([\d\.]+)", text)
        data["total_mass_lb"] = find_one(r"Total Mass.*?\(lbs\)\s+([\d\.]+)", text)
        data["fat_mass_lb"] = find_one(r"Fat Tissue \(lbs\)\s+([\d\.]+)", text)
        data["lean_soft_tissue_lb"] = find_one(r"Lean Tissue \(lbs\)\s+([\d\.]+)", text)
        data["bmc_lb"] = find_one(r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)", text)

    # Supplemental
    data["android_percent"] = find_one(r"Android.*?([\d\.]+)%", text)
    data["gynoid_percent"] = find_one(r"Gynoid.*?([\d\.]+)%", text)
    data["rmr_cal_per_day"] = find_one(
        r"([\d,]+)\s*cal/day", text, cast=lambda s: int(s.replace(",", ""))
    )

    # A/G Ratio appears after RMR, Android%, Gynoid% on same line:
    # "1,778 cal/day 36.5% 27.8% 1.31"
    ag_match = re.search(r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)", text)
    if ag_match:
        data["ag_ratio"] = float(ag_match.group(3))
    else:
        data["ag_ratio"] = find_one(r"A/G Ratio\s+([\d\.]+)", text)

    data["vat_mass_lb"] = find_one(r"Mass \(lbs\)\s+([\d\.]+)", text)
    data["vat_volume_in3"] = find_one(r"Volume \(in3\)\s+([\d\.]+)", text)

    # Tables
    data["regional"] = parse_regional_table(text)
    data["muscle_balance"] = parse_muscle_balance(text)
    data["bone_density"] = parse_bone_density_total(text)
    return data


def parse_dexa_pdf(pdf_path):
    """Read the PDF at *pdf_path* and return its parsed measurements."""
    return parse_dexa_text(read_pdf_text(pdf_path))


def compute_derived(d, height_in, weight_lb=None):
    """Compute derived metrics from parsed measurements.

    Args:
        d: measurement dict as produced by ``parse_dexa_text``.
        height_in: height in inches; must be positive.
        weight_lb: fallback body weight, used only when ``d`` has no
            (non-zero) ``total_mass_lb``.

    Returns:
        ``(total_mass, derived)``: the mass used for the calculations and a
        dict of derived values. Metrics whose inputs are missing are None.

    Raises:
        ValueError: if the height is not positive, or no total mass is
            available from either source.
    """
    if height_in is None or height_in <= 0:
        raise ValueError("Height must be a positive number of inches.")

    # Prefer DEXA total mass if available
    total_mass = d.get("total_mass_lb") or weight_lb
    if total_mass is None:
        raise ValueError("Total mass is missing; pass --weight-lb if the PDF lacks it.")

    fm = d.get("fat_mass_lb")
    lst = d.get("lean_soft_tissue_lb")
    bmc = d.get("bmc_lb")
    bf_pct = d.get("body_fat_percent")

    ffm = None
    if fm is not None:
        ffm = total_mass - fm
    elif lst is not None and bmc is not None:
        ffm = lst + bmc

    def idx(value_lb):
        # BMI-style index: 703 * pounds / inches^2.
        return round(703.0 * value_lb / (height_in ** 2), 2)

    def idx_or_none(value_lb):
        return idx(value_lb) if value_lb is not None else None

    derived = {
        "height_in": height_in,
        "height_ft_in": inches_to_ft_in(height_in),
        "weight_input_lb": weight_lb,
        "bmi": round(703.0 * total_mass / (height_in ** 2), 1),
        "fat_free_mass_lb": round(ffm, 1) if ffm is not None else None,
        "ffmi": idx_or_none(ffm),
        "fmi": idx_or_none(fm),
        "lsti": idx_or_none(lst),
        "alm_lb": None,
        "smi": None,
    }

    # Lean mass percentage (complement of body fat %)
    derived["lean_mass_percent"] = round(100 - bf_pct, 1) if bf_pct is not None else None

    regional = d.get("regional", {})
    arms = regional.get("Arms", {})
    legs = regional.get("Legs", {})
    trunk = regional.get("Trunk", {})

    # ALM from regional lean masses
    arms_lean = arms.get("lean_tissue_lb")
    legs_lean = legs.get("lean_tissue_lb")
    trunk_lean = trunk.get("lean_tissue_lb")
    if arms_lean is not None and legs_lean is not None:
        alm = arms_lean + legs_lean
        derived["alm_lb"] = round(alm, 1)
        derived["smi"] = idx(alm)

    # Regional lean mass distribution (skipped when total lean tissue is
    # missing or zero, which would otherwise divide by zero)
    if lst and arms_lean is not None and legs_lean is not None and trunk_lean is not None:
        derived["arms_lean_pct"] = round(100 * arms_lean / lst, 1)
        derived["legs_lean_pct"] = round(100 * legs_lean / lst, 1)
        derived["trunk_lean_pct"] = round(100 * trunk_lean / lst, 1)
    else:
        derived["arms_lean_pct"] = None
        derived["legs_lean_pct"] = None
        derived["trunk_lean_pct"] = None

    # Trunk-to-limb fat ratio (health risk indicator)
    trunk_fat = trunk.get("fat_tissue_lb")
    arms_fat = arms.get("fat_tissue_lb")
    legs_fat = legs.get("fat_tissue_lb")
    derived["trunk_to_limb_fat_ratio"] = None
    if trunk_fat is not None and arms_fat is not None and legs_fat is not None:
        limb_fat = arms_fat + legs_fat
        if limb_fat > 0:
            derived["trunk_to_limb_fat_ratio"] = round(trunk_fat / limb_fat, 2)

    # Limb symmetry indices (balance indicators): the right side's share of
    # the combined left+right lean mass, in percent.
    # 50 = perfectly balanced, >50 = right side heavier, <50 = left side heavier.
    mb = d.get("muscle_balance", {})

    def right_share(right, left):
        if right is None or left is None or right + left <= 0:
            return None
        return round(100 * right / (right + left), 1)

    derived["arm_symmetry_index"] = right_share(
        mb.get("Right Arm", {}).get("lean_mass_lb"),
        mb.get("Left Arm", {}).get("lean_mass_lb"),
    )
    derived["leg_symmetry_index"] = right_share(
        mb.get("Right Leg", {}).get("lean_mass_lb"),
        mb.get("Left Leg", {}).get("lean_mass_lb"),
    )

    # VAT Index (normalized by height squared, like BMI)
    derived["vat_index"] = idx_or_none(d.get("vat_mass_lb"))

    # Bone Mineral Density Index (BMC normalized by height squared)
    derived["bmdi"] = idx_or_none(bmc)

    # Adjusted Body Weight (used in nutrition/health calculations)
    # ABW = IBW + 0.4 * (actual weight - IBW), where IBW differs by sex
    # For simplicity, using a unisex approximation: IBW ≈ height_in * 2.3 - 100 (rough estimate)
    # NOTE(review): for 74 in this gives an IBW of ~70, compared against a
    # weight in pounds. The 2.3-per-inch slope looks like a kg-based formula,
    # so this may be a unit mismatch. Left unchanged to keep existing outputs
    # comparable; confirm the intended formula.
    ibw_estimate = height_in * 2.3 - 100
    if total_mass > ibw_estimate:
        derived["adjusted_body_weight_lb"] = round(
            ibw_estimate + 0.4 * (total_mass - ibw_estimate), 1
        )
    else:
        derived["adjusted_body_weight_lb"] = round(total_mass, 1)

    return total_mass, derived


def ensure_outdir(outdir):
    """Create *outdir* (and parents) if it does not already exist."""
    os.makedirs(outdir, exist_ok=True)


def _append_rows_csv(path, rows, columns):
    """Append *rows* (list of dicts) to the CSV at *path* using a fixed column order.

    The header is written only when the file is missing or empty, so a first
    run with no rows still leaves a header for later appends to line up with.
    """
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    if not rows and not needs_header:
        return
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(path, mode="a", header=needs_header, index=False)


def write_or_append_csv(path, row_dict, columns):
    """Append one row to the CSV at *path*, creating it with a header if needed.

    Only the keys listed in *columns* are written; missing keys become empty
    cells.
    """
    _append_rows_csv(path, [{k: row_dict.get(k) for k in columns}], columns)


def write_or_append_json(path, obj):
    """Append *obj* to the JSON array stored at *path*, creating the file if needed.

    A file holding a single dict is converted to a one-element list first;
    a file that is not valid JSON is treated as empty.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                data = []
    else:
        data = []
    if isinstance(data, dict):
        # convert to list of entries if previous file was a single dict
        data = [data]
    data.append(obj)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def append_markdown(path, md_text):
    """Append *md_text* (stripped, followed by a blank line) to the file at *path*.

    The file is written as UTF-8 because the summary contains non-ASCII
    characters (em dash, superscript three) that some platform default
    encodings cannot represent.
    """
    with open(path, "a", encoding="utf-8") as f:
        f.write(md_text.strip() + "\n\n")


def make_markdown(measured_date, d, derived, total_mass):
    """Render a Markdown summary of one scan.

    Optional lines are emitted only when all of the values they show are
    present.
    """
    lines = [
        f"# DEXA Summary — {measured_date}",
        "",
        f"- Height: {derived['height_in']} in",
        f"- Weight: {round(total_mass, 1)} lb",
    ]

    def have(*keys):
        return all(d.get(k) is not None for k in keys)

    if have("body_fat_percent", "fat_mass_lb"):
        lines.append(f"- Body fat: {d['body_fat_percent']}% ({d['fat_mass_lb']} lb)")
    if have("lean_soft_tissue_lb"):
        lines.append(f"- Lean soft tissue: {d['lean_soft_tissue_lb']} lb")
    if have("bmc_lb"):
        lines.append(f"- Bone mineral content: {d['bmc_lb']} lb")
    lines.append(f"- Fat‑free mass: {derived.get('fat_free_mass_lb')}")
    lines.append(f"- BMI: {derived['bmi']}")
    lines.append(f"- FFMI: {derived.get('ffmi')}; FMI: {derived.get('fmi')}; Lean Soft Tissue Index: {derived.get('lsti')}")
    if derived.get("alm_lb") is not None:
        lines.append(f"- Appendicular Lean Mass: {derived['alm_lb']} lb; Skeletal Muscle Index: {derived['smi']}")
    if have("android_percent", "gynoid_percent", "ag_ratio"):
        lines.append(f"- Android: {d['android_percent']}%; Gynoid: {d['gynoid_percent']}%; A/G ratio: {d['ag_ratio']}")
    if have("vat_mass_lb", "vat_volume_in3"):
        lines.append(f"- VAT: {d['vat_mass_lb']} lb ({d['vat_volume_in3']} in³)")
    if have("rmr_cal_per_day"):
        lines.append(f"- RMR: {d['rmr_cal_per_day']} cal/day")
    lines.append("")
    lines.append("## Regional")
    for name, r in d.get("regional", {}).items():
        lines.append(f"- {name}: {r['fat_percent']}% fat; {r['total_mass_lb']} lb total; {r['fat_tissue_lb']} lb fat; {r['lean_tissue_lb']} lb lean; {r['bmc_lb']} lb BMC")
    return "\n".join(lines)


def _overall_csv_row(measured_date, d, derived, total_mass):
    """Build the overall.csv row for one scan."""
    return {
        "MeasuredDate": measured_date,
        "Height_in": derived["height_in"],
        "Height_ft_in": derived["height_ft_in"],
        "Weight_lb_Input": derived["weight_input_lb"],
        "DEXA_TotalMass_lb": round(total_mass, 1),
        "BodyFat_percent": d.get("body_fat_percent"),
        "LeanMass_percent": derived.get("lean_mass_percent"),
        "FatMass_lb": d.get("fat_mass_lb"),
        "LeanSoftTissue_lb": d.get("lean_soft_tissue_lb"),
        "BoneMineralContent_lb": d.get("bmc_lb"),
        "FatFreeMass_lb": derived.get("fat_free_mass_lb"),
        "BMI": derived["bmi"],
        "FFMI": derived.get("ffmi"),
        "FMI": derived.get("fmi"),
        "LST_Index": derived.get("lsti"),
        "ALM_lb": derived.get("alm_lb"),
        "SMI": derived.get("smi"),
        "VAT_Mass_lb": d.get("vat_mass_lb"),
        "VAT_Volume_in3": d.get("vat_volume_in3"),
        "VAT_Index": derived.get("vat_index"),
        "BMDI": derived.get("bmdi"),
        "Android_percent": d.get("android_percent"),
        "Gynoid_percent": d.get("gynoid_percent"),
        "AG_Ratio": d.get("ag_ratio"),
        "Trunk_to_Limb_Fat_Ratio": derived.get("trunk_to_limb_fat_ratio"),
        "Arms_Lean_pct": derived.get("arms_lean_pct"),
        "Legs_Lean_pct": derived.get("legs_lean_pct"),
        "Trunk_Lean_pct": derived.get("trunk_lean_pct"),
        "Arm_Symmetry_Index": derived.get("arm_symmetry_index"),
        "Leg_Symmetry_Index": derived.get("leg_symmetry_index"),
        "Adjusted_Body_Weight_lb": derived.get("adjusted_body_weight_lb"),
        "RMR_cal_per_day": d.get("rmr_cal_per_day"),
    }


def _table_rows(table, columns, fields):
    """Turn ``{region: {field: value}}`` into CSV row dicts keyed by *columns*.

    ``columns[0]`` receives the region name; the remaining columns are filled
    from *fields* in order.
    """
    return [
        {columns[0]: name, **{col: r[field] for col, field in zip(columns[1:], fields)}}
        for name, r in table.items()
    ]


def _overall_json(measured_date, d, derived, total_mass):
    """Build the structured overall.json entry for one scan."""
    # Convert regional and muscle_balance dicts to arrays
    regional_array = [
        {"region": name, **data} for name, data in d.get("regional", {}).items()
    ]
    muscle_balance_array = [
        {"region": name, **data} for name, data in d.get("muscle_balance", {}).items()
    ]
    return {
        "measured_date": measured_date,
        "anthropometrics": {
            "height_in": derived["height_in"],
            "height_ft_in": derived["height_ft_in"],
            "weight_input_lb": derived["weight_input_lb"],
            "dexa_total_mass_lb": round(total_mass, 1),
            "adjusted_body_weight_lb": derived.get("adjusted_body_weight_lb"),
            "bmi": derived["bmi"],
        },
        "composition": {
            "body_fat_percent": d.get("body_fat_percent"),
            "lean_mass_percent": derived.get("lean_mass_percent"),
            "fat_mass_lb": d.get("fat_mass_lb"),
            "lean_soft_tissue_lb": d.get("lean_soft_tissue_lb"),
            "bone_mineral_content_lb": d.get("bmc_lb"),
            "fat_free_mass_lb": derived.get("fat_free_mass_lb"),
            "derived_indices": {
                "ffmi": derived.get("ffmi"),
                "fmi": derived.get("fmi"),
                "lsti": derived.get("lsti"),
                "alm_lb": derived.get("alm_lb"),
                "smi": derived.get("smi"),
                "bmdi": derived.get("bmdi"),
            },
        },
        "regional": regional_array,
        "regional_analysis": {
            "trunk_to_limb_fat_ratio": derived.get("trunk_to_limb_fat_ratio"),
            "lean_mass_distribution": {
                "arms_percent": derived.get("arms_lean_pct"),
                "legs_percent": derived.get("legs_lean_pct"),
                "trunk_percent": derived.get("trunk_lean_pct"),
            },
        },
        "muscle_balance": muscle_balance_array,
        "symmetry_indices": {
            "arm_symmetry_index": derived.get("arm_symmetry_index"),
            "leg_symmetry_index": derived.get("leg_symmetry_index"),
        },
        "supplemental": {
            "android_percent": d.get("android_percent"),
            "gynoid_percent": d.get("gynoid_percent"),
            "ag_ratio": d.get("ag_ratio"),
            "vat": {
                "mass_lb": d.get("vat_mass_lb"),
                "volume_in3": d.get("vat_volume_in3"),
                "vat_index": derived.get("vat_index"),
            },
            "rmr_cal_per_day": d.get("rmr_cal_per_day"),
        },
        "bone_density": d.get("bone_density", {}),
    }


def main(argv=None):
    """Command-line entry point: parse a report and append to the output files.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf", help="Path to DEXA report PDF")
    ap.add_argument("--height-in", type=float, required=True, help="Height in inches (Imperial)")
    ap.add_argument("--weight-lb", type=float, help="Body weight in lb (optional; used if DEXA total mass missing)")
    ap.add_argument("--outdir", default="dexa_out", help="Output directory")
    args = ap.parse_args(argv)
    if args.height_in <= 0:
        ap.error("--height-in must be a positive number")

    ensure_outdir(args.outdir)
    d = parse_dexa_pdf(args.pdf)
    # Fall back to today's date when the report has no Measured Date.
    measured_date_raw = d.get("measured_date") or datetime.now().strftime("%m/%d/%Y")
    measured_date = convert_date_to_iso(measured_date_raw)
    total_mass, derived = compute_derived(d, height_in=args.height_in, weight_lb=args.weight_lb)

    # Overall CSV row
    write_or_append_csv(
        os.path.join(args.outdir, "overall.csv"),
        _overall_csv_row(measured_date, d, derived, total_mass),
        _OVERALL_COLUMNS,
    )

    # Regional table
    _append_rows_csv(
        os.path.join(args.outdir, "regional.csv"),
        _table_rows(
            d.get("regional", {}),
            _REGIONAL_COLUMNS,
            ["fat_percent", "total_mass_lb", "fat_tissue_lb", "lean_tissue_lb", "bmc_lb"],
        ),
        _REGIONAL_COLUMNS,
    )

    # Muscle balance
    _append_rows_csv(
        os.path.join(args.outdir, "muscle_balance.csv"),
        _table_rows(
            d.get("muscle_balance", {}),
            _MUSCLE_BALANCE_COLUMNS,
            ["fat_percent", "total_mass_lb", "fat_mass_lb", "lean_mass_lb", "bmc_lb"],
        ),
        _MUSCLE_BALANCE_COLUMNS,
    )

    # JSON (overall structured object)
    write_or_append_json(
        os.path.join(args.outdir, "overall.json"),
        _overall_json(measured_date, d, derived, total_mass),
    )

    # Markdown summary (append)
    md_text = make_markdown(measured_date, d, derived, total_mass)
    append_markdown(os.path.join(args.outdir, "summary.md"), md_text)

    print(f"Wrote files to: {args.outdir}")
    print("Files: overall.csv, regional.csv, muscle_balance.csv, overall.json, summary.md")


if __name__ == "__main__":
    main()