bodyspec-insights/dexa_extract.py
Mac DeCourcy c7d2e32d7d Add MeasuredDate column to all CSV files
CHANGES:
- Added MeasuredDate as first column in regional.csv
- Added MeasuredDate as first column in muscle_balance.csv
- Updated README to document new column structure

BENEFITS:
- Track regional changes over time (e.g., Arms fat % across scans)
- Easy time-series analysis with pandas/Excel
- Filter by date range for progress tracking
- Consistent date column across all 3 CSV files
- Enables queries like: 'Show me trunk fat % over last 6 months'

EXAMPLE USAGE:
  import pandas as pd
  regional = pd.read_csv('regional.csv')
  arms = regional[regional['Region'] == 'Arms']
  # Now you can track Arms progress over time!

Each scan now adds:
- 1 row to overall.csv
- 6 rows to regional.csv (one per region)
- 6 rows to muscle_balance.csv (one per limb comparison)
2025-10-07 15:24:29 -07:00

886 lines
35 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
BodySpec Insights - Body composition analytics for BodySpec DEXA scan PDFs
Extract measurements from BodySpec DEXA reports, compute 30+ derived metrics,
and output structured data for progress tracking.
Usage:
python dexa_extract.py /path/to/bodyspec-report.pdf --height-in 74 --weight-lb 212 --outdir ./data/results
Note: This script is specifically designed for BodySpec PDF reports.
Requires:
pip install pdfplumber pandas
"""
import argparse
import json
import math
import os
import re
import sys
from datetime import datetime
from pathlib import Path
import pdfplumber
import pandas as pd
class ValidationError(Exception):
    """Custom exception type for validation failures."""
def get_processed_dates(outdir):
    """Return the set of scan dates already recorded in ``overall.csv``.

    Args:
        outdir: Output directory that may contain a previously written
            ``overall.csv``.

    Returns:
        Set of the distinct non-null values found in the ``MeasuredDate``
        column. An empty set is returned when the file does not exist,
        cannot be read or parsed, or has no ``MeasuredDate`` column.
    """
    overall_csv = Path(outdir) / "overall.csv"
    if not overall_csv.exists():
        return set()

    try:
        df = pd.read_csv(overall_csv)
    except (OSError, ValueError) as exc:
        # Best-effort lookup: pandas parser/empty-file errors derive from
        # ValueError. Report the problem instead of hiding it, then behave
        # as if nothing had been processed yet.
        print(f" ⚠️ Warning: Could not read {overall_csv}: {exc}", file=sys.stderr)
        return set()

    if "MeasuredDate" not in df.columns:
        return set()
    return set(df["MeasuredDate"].dropna().unique())
# ============================================================================
# EXTRACTION PATTERN DICTIONARY
# ============================================================================
# Centralized pattern definitions for extracting data from BodySpec PDFs.
#
# Each field has:
# - patterns: List of regex patterns to try (in order of preference)
# - group: Which capture group to extract from each pattern
# - cast: Function to convert extracted string to desired type
# - required: Whether this field must be present for valid extraction
# - description: Human-readable explanation of the field
#
# Benefits of this approach:
# 1. Self-documenting - patterns describe what they extract
# 2. Maintainable - add/modify patterns in one place
# 3. Robust - multiple patterns provide fallback options
# 4. Validated - know exactly which required fields are missing
#
# To add a new field:
# 1. Add entry to this dictionary
# 2. Pattern will automatically be used by extract_all_fields()
# 3. Validation will check if required fields are present
# ============================================================================
# One row of the SUMMARY RESULTS table. Capture groups:
#   1 = date, 2 = body fat %, 3 = total mass, 4 = fat tissue,
#   5 = lean tissue, 6 = BMC
# Shared by every primary metric below; each field picks its own group.
_SUMMARY_ROW_PATTERN = (
    r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
)

EXTRACTION_PATTERNS = {
    # ------------------------------------------------------------------
    # Primary body composition metrics: summary table row first, then the
    # individually labelled field as a fallback.
    # ------------------------------------------------------------------
    'body_fat_percent': {
        'patterns': [_SUMMARY_ROW_PATTERN, r"Total Body Fat %\s+([\d\.]+)"],
        'group': [2, 1],  # capture group to read for each pattern, in order
        'cast': float,
        'required': True,
        'description': 'Total body fat percentage',
    },
    'total_mass_lb': {
        'patterns': [_SUMMARY_ROW_PATTERN, r"Total Mass.*?\(lbs\)\s+([\d\.]+)"],
        'group': [3, 1],
        'cast': float,
        'required': True,
        'description': 'Total body mass in pounds',
    },
    'fat_mass_lb': {
        'patterns': [_SUMMARY_ROW_PATTERN, r"Fat Tissue \(lbs\)\s+([\d\.]+)"],
        'group': [4, 1],
        'cast': float,
        'required': True,
        'description': 'Total fat tissue mass',
    },
    'lean_soft_tissue_lb': {
        'patterns': [_SUMMARY_ROW_PATTERN, r"Lean Tissue \(lbs\)\s+([\d\.]+)"],
        'group': [5, 1],
        'cast': float,
        'required': True,
        'description': 'Lean soft tissue mass',
    },
    'bmc_lb': {
        'patterns': [_SUMMARY_ROW_PATTERN, r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)"],
        'group': [6, 1],
        'cast': float,
        'required': True,
        'description': 'Bone mineral content',
    },

    # ------------------------------------------------------------------
    # Date information
    # ------------------------------------------------------------------
    'measured_date': {
        'patterns': [
            # Date that follows the weight ("... lbs. MM/DD/YYYY")
            r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})",
            # NOTE(review): extract_field applies only re.IGNORECASE by
            # default, so `$` anchors to the end of the whole text rather
            # than the end of a line - confirm this fallback is intended.
            r"(\d{1,2}/\d{1,2}/\d{4})\s*$",
        ],
        'group': [1, 1],
        'cast': str,
        'required': True,
        'description': 'Date of DEXA scan',
    },

    # ------------------------------------------------------------------
    # Regional fat distribution
    # ------------------------------------------------------------------
    'android_percent': {
        'patterns': [r"Android.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Android region fat percentage (belly area)',
    },
    'gynoid_percent': {
        'patterns': [r"Gynoid.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Gynoid region fat percentage (hips/thighs)',
    },
    'ag_ratio': {
        'patterns': [
            # Same line as the RMR value; the ratio is the third group
            r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)",
            # Standalone labelled field
            r"A/G Ratio\s+([\d\.]+)",
        ],
        'group': [3, 1],
        'cast': float,
        'required': False,
        'description': 'Android/Gynoid ratio (central vs peripheral fat)',
    },

    # ------------------------------------------------------------------
    # Metabolic and supplemental metrics
    # ------------------------------------------------------------------
    'rmr_cal_per_day': {
        'patterns': [r"([\d,]+)\s*cal/day"],
        'group': [1],
        # Thousands separators are removed before the integer conversion
        'cast': lambda s: int(s.replace(',', '')),
        'required': False,
        'description': 'Resting metabolic rate (calories per day)',
    },
    'vat_mass_lb': {
        'patterns': [r"Mass \(lbs\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue mass',
    },
    'vat_volume_in3': {
        'patterns': [r"Volume \(in3\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue volume',
    },
}
def read_pdf_text(pdf_path):
    """Return the text of every page of the PDF, joined with newlines.

    A page for which ``extract_text()`` returns nothing contributes an
    empty string, so the number of joined chunks equals the page count.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            extracted = page.extract_text()
            chunks.append(extracted if extracted else "")
    return "\n".join(chunks)
def extract_field(text, patterns, groups, cast=float, flags=re.IGNORECASE):
    """Return the first value that any of ``patterns`` yields from ``text``.

    Patterns are tried in order. For the first regex hit of a pattern, the
    paired capture group is read, commas are removed, surrounding whitespace
    is stripped, and the result is passed through ``cast``. If reading or
    converting the group fails, the next pattern is tried.

    Args:
        text: Full PDF text.
        patterns: Regex patterns, in order of preference.
        groups: Capture group index for each pattern (paired by position).
        cast: Converter applied to the cleaned string; a falsy value means
            the cleaned string itself is returned.
        flags: Flags passed to ``re.search``.

    Returns:
        The converted value, or None when no pattern produces one.
    """
    for regex, group_index in zip(patterns, groups):
        found = re.search(regex, text, flags)
        if found is None:
            continue
        try:
            cleaned = found.group(group_index).replace(",", "").strip()
            if not cast:
                return cleaned
            return cast(cleaned)
        except (ValueError, IndexError, AttributeError):
            # Bad conversion, unknown group, or unmatched optional group:
            # fall through to the next pattern.
            continue
    return None
def extract_all_fields(text, patterns_dict=None):
    """Run every field definition against ``text``.

    Args:
        text: Full PDF text content.
        patterns_dict: Mapping of field name to a config dict with the keys
            'patterns', 'group', 'cast' and 'required'. When None, the
            module-level EXTRACTION_PATTERNS is used.

    Returns:
        A ``(data, missing_required)`` tuple: ``data`` maps every field name
        to its extracted value (None when not found) and ``missing_required``
        lists the required fields whose value is None, in definition order.
    """
    specs = EXTRACTION_PATTERNS if patterns_dict is None else patterns_dict

    extracted = {}
    for name, spec in specs.items():
        extracted[name] = extract_field(
            text, spec['patterns'], spec['group'], spec['cast']
        )

    absent = [
        name
        for name, spec in specs.items()
        if extracted[name] is None and spec['required']
    ]
    return extracted, absent
def convert_date_to_iso(date_str):
    """Convert an ``MM/DD/YYYY`` date string to ``YYYY-MM-DD``.

    Args:
        date_str: Date text such as ``"10/7/2025"``.

    Returns:
        The ISO-formatted date; None when ``date_str`` is empty or None; the
        input unchanged when it cannot be parsed as ``MM/DD/YYYY``.
    """
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, "%m/%d/%Y").strftime("%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: wrong format; TypeError: not a string. Anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return date_str
def inches_to_ft_in(inches):
    """Format a length in inches as feet'inches" (e.g. 74 -> 6'2").

    Both parts are truncated to whole numbers. Returns None for None.
    """
    if inches is None:
        return None
    whole_feet, leftover = divmod(inches, 12)
    return f"{int(whole_feet)}'{int(leftover)}\""
def parse_regional_table(text):
    """Parse the per-region composition rows out of the report text.

    A row is the region name followed by a percentage and four plain
    numbers, e.g. ``Arms 22.1% 27.4 6.0 20.2 1.1``. Only the first matching
    row per region is used; regions without a match are omitted.

    Returns:
        Dict mapping region name to a dict with the float fields
        fat_percent, total_mass_lb, fat_tissue_lb, lean_tissue_lb, bmc_lb.
    """
    field_names = ("fat_percent", "total_mass_lb", "fat_tissue_lb", "lean_tissue_lb", "bmc_lb")
    # Everything that follows the region name in a row
    row_tail = r"\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"

    parsed = {}
    for region in ("Arms", "Legs", "Trunk", "Android", "Gynoid", "Total"):
        hit = re.search(region + row_tail, text)
        if hit is None:
            continue
        parsed[region] = {
            key: float(raw) for key, raw in zip(field_names, hit.groups())
        }
    return parsed
def parse_muscle_balance(text):
    """Parse the limb rows of the muscle balance table from the report text.

    A row is the limb label followed by five plain numbers, e.g.
    ``Right Arm 20.4 13.7 2.8 10.3 0.6``. Only the first matching row per
    label is used; labels without a match are omitted.

    Returns:
        Dict mapping label to a dict with the float fields fat_percent,
        total_mass_lb, fat_mass_lb, lean_mass_lb, bmc_lb.
    """
    field_names = ("fat_percent", "total_mass_lb", "fat_mass_lb", "lean_mass_lb", "bmc_lb")
    # Everything that follows the limb label in a row
    row_tail = r"\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
    labels = ("Arms Total", "Right Arm", "Left Arm", "Legs Total", "Right Leg", "Left Leg")

    parsed = {}
    for label in labels:
        hit = re.search(label + row_tail, text)
        if hit is None:
            continue
        parsed[label] = {
            key: float(raw) for key, raw in zip(field_names, hit.groups())
        }
    return parsed
def parse_bone_density_total(text):
    """Parse the total bone density row, e.g. ``Total 1.280 0.8 0.8``.

    The muscle balance rows "Arms Total ..." and "Legs Total ..." also
    consist of "Total" followed by plain numbers, so they are excluded with
    lookbehinds; otherwise they would be mistaken for the bone density row
    whenever they appear earlier in the text.

    Returns:
        Dict with total_bmd_g_per_cm2, young_adult_t_score and
        age_matched_z_score as floats, or an empty dict when no usable row
        is found.
    """
    row = r"(?<!Arms )(?<!Legs )Total\s+([\d\.]+)\s+([-\d\.]+)\s+([-\d\.]+)"
    for hit in re.finditer(row, text):
        try:
            bmd, t_score, z_score = (float(raw) for raw in hit.groups())
        except ValueError:
            # The character classes also accept tokens such as "-" or "1.2.3";
            # skip such a candidate instead of failing the whole parse.
            continue
        return {
            "total_bmd_g_per_cm2": bmd,
            "young_adult_t_score": t_score,
            "age_matched_z_score": z_score,
        }
    return {}
def parse_dexa_pdf(pdf_path):
    """Extract all data from a BodySpec DEXA PDF.

    Scalar fields come from the pattern dictionary via extract_all_fields();
    the regional, muscle balance and bone density tables are parsed by their
    dedicated functions and stored under the keys "regional",
    "muscle_balance" and "bone_density". Missing required fields are
    reported on stderr but do not stop extraction.

    Returns:
        Dict with every extracted field plus the three table entries.
    """
    raw_text = read_pdf_text(pdf_path)
    fields, missing = extract_all_fields(raw_text)

    if missing:
        missing_names = ', '.join(missing)
        print(f" ⚠️ Warning: Missing required fields: {missing_names}", file=sys.stderr)

    table_parsers = (
        ("regional", parse_regional_table),
        ("muscle_balance", parse_muscle_balance),
        ("bone_density", parse_bone_density_total),
    )
    for key, table_parser in table_parsers:
        fields[key] = table_parser(raw_text)
    return fields
def compute_derived(d, height_in, weight_lb=None):
    """Compute derived body-composition metrics from extracted DEXA data.

    Args:
        d: Extracted data dict (as returned by parse_dexa_pdf). Reads the
            scalar keys total_mass_lb, fat_mass_lb, lean_soft_tissue_lb,
            bmc_lb, body_fat_percent, vat_mass_lb and the "regional" and
            "muscle_balance" tables.
        height_in: Height in inches; must be non-zero.
        weight_lb: Fallback body weight in pounds, used only when the DEXA
            total mass is missing (or zero).

    Returns:
        ``(total_mass, derived)`` where ``total_mass`` is the mass used for
        the calculations and ``derived`` maps metric names to values (None
        for any metric whose inputs are unavailable).

    Raises:
        ValueError: If neither the DEXA total mass nor ``weight_lb`` is
            available.
    """
    # Prefer DEXA total mass if available
    total_mass = d.get("total_mass_lb") or weight_lb
    if total_mass is None:
        raise ValueError("Total mass is missing; pass --weight-lb if the PDF lacks it.")

    fm = d.get("fat_mass_lb")
    lst = d.get("lean_soft_tissue_lb")
    bmc = d.get("bmc_lb")
    bf_pct = d.get("body_fat_percent")

    # Fat-free mass: total minus fat when fat mass is known, otherwise
    # lean soft tissue plus bone mineral content.
    if fm is not None:
        ffm = total_mass - fm
    elif lst is not None and bmc is not None:
        ffm = lst + bmc
    else:
        ffm = None

    def idx(value_lb):
        # Height-normalised index: 703 * pounds / inches^2 (same scaling as BMI)
        return round(703.0 * value_lb / (height_in ** 2), 2)

    def idx_or_none(value_lb):
        return idx(value_lb) if value_lb is not None else None

    derived = {
        "height_in": height_in,
        "height_ft_in": inches_to_ft_in(height_in),
        "weight_input_lb": weight_lb,
        "bmi": round(703.0 * total_mass / (height_in ** 2), 1),
        "fat_free_mass_lb": round(ffm, 1) if ffm is not None else None,
        "ffmi": idx_or_none(ffm),
        "fmi": idx_or_none(fm),
        "lsti": idx_or_none(lst),
        "alm_lb": None,
        "smi": None,
        # Lean mass percentage (complement of body fat %)
        "lean_mass_percent": round(100 - bf_pct, 1) if bf_pct is not None else None,
    }

    regional = d.get("regional", {})
    arms = regional.get("Arms", {})
    legs = regional.get("Legs", {})
    trunk = regional.get("Trunk", {})

    # ALM (appendicular lean mass) from regional lean masses
    arms_lean = arms.get("lean_tissue_lb")
    legs_lean = legs.get("lean_tissue_lb")
    trunk_lean = trunk.get("lean_tissue_lb")
    if arms_lean is not None and legs_lean is not None:
        alm = arms_lean + legs_lean
        derived["alm_lb"] = round(alm, 1)
        derived["smi"] = idx(alm)

    # Regional lean mass distribution (share of total lean soft tissue).
    # `lst` must be non-zero to avoid a division by zero.
    if lst and arms_lean is not None and legs_lean is not None and trunk_lean is not None:
        derived["arms_lean_pct"] = round(100 * arms_lean / lst, 1)
        derived["legs_lean_pct"] = round(100 * legs_lean / lst, 1)
        derived["trunk_lean_pct"] = round(100 * trunk_lean / lst, 1)
    else:
        derived["arms_lean_pct"] = None
        derived["legs_lean_pct"] = None
        derived["trunk_lean_pct"] = None

    # Trunk-to-limb fat ratio (health risk indicator)
    trunk_fat = trunk.get("fat_tissue_lb")
    arms_fat = arms.get("fat_tissue_lb")
    legs_fat = legs.get("fat_tissue_lb")
    derived["trunk_to_limb_fat_ratio"] = None
    if trunk_fat is not None and arms_fat is not None and legs_fat is not None:
        limb_fat = arms_fat + legs_fat
        if limb_fat > 0:
            derived["trunk_to_limb_fat_ratio"] = round(trunk_fat / limb_fat, 2)

    # Limb symmetry indices: the right limb's share of the combined
    # left + right lean mass, in percent. 50 = perfectly balanced,
    # <50 = left side carries more lean mass, >50 = right side carries more.
    mb = d.get("muscle_balance", {})

    def right_share(right_key, left_key):
        right = mb.get(right_key, {}).get("lean_mass_lb")
        left = mb.get(left_key, {}).get("lean_mass_lb")
        if right is None or left is None or right + left <= 0:
            return None
        return round(100 * right / (right + left), 1)

    derived["arm_symmetry_index"] = right_share("Right Arm", "Left Arm")
    derived["leg_symmetry_index"] = right_share("Right Leg", "Left Leg")

    # VAT Index (normalized by height squared, like BMI)
    derived["vat_index"] = idx_or_none(d.get("vat_mass_lb"))

    # Bone Mineral Density Index (BMC normalized by height squared)
    derived["bmdi"] = idx_or_none(bmc)

    # Adjusted Body Weight: ABW = IBW + 0.4 * (actual weight - IBW).
    # The rough unisex IBW estimate `2.3 * height_in - 100` works on a
    # kilogram scale (2.3 kg per inch, as in the Devine formula), so it is
    # converted to pounds before being compared with / mixed into
    # `total_mass`, which is in pounds. Without the conversion the estimate
    # (about 70 for a 74-inch height) was treated as pounds.
    lb_per_kg = 2.20462
    ibw_estimate_lb = (height_in * 2.3 - 100) * lb_per_kg
    if total_mass > ibw_estimate_lb:
        adjusted = ibw_estimate_lb + 0.4 * (total_mass - ibw_estimate_lb)
    else:
        adjusted = total_mass
    derived["adjusted_body_weight_lb"] = round(adjusted, 1)

    return total_mass, derived
def ensure_outdir(outdir):
    """Create ``outdir`` (and any missing parents); an existing directory is fine."""
    os.makedirs(outdir, exist_ok=True)
def write_or_append_csv(path, row_dict, columns):
    """Append one row to the CSV at ``path``, creating the file if needed.

    Args:
        path: CSV file path.
        row_dict: Values for the row; keys not listed in ``columns`` are
            ignored and missing keys are written as empty cells.
        columns: Column names, in output order.

    The header line is written when the file does not exist yet or exists
    but is empty; otherwise only the data row is appended. (Appending to an
    empty file without a header would leave it header-less for good.)
    """
    df_row = pd.DataFrame([{k: row_dict.get(k) for k in columns}])
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    # Mode "a" creates the file when it is missing
    df_row.to_csv(path, mode="a", header=needs_header, index=False)
def write_or_append_json(path, obj):
    """Append ``obj`` to the JSON array stored at ``path``.

    The file is read, ``obj`` is appended to its list, and the whole list is
    written back with 2-space indentation. Special cases:

    - Missing file: a new one-element list is written.
    - File holding a single value instead of a list (e.g. a lone dict from
      an earlier format): that value becomes the first list entry.
    - File that is not valid JSON: a warning is printed to stderr and the
      file is restarted with ``obj`` as its only entry.
    """
    entries = []
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            try:
                entries = json.load(f)
            except json.JSONDecodeError as exc:
                print(
                    f" ⚠️ Warning: {path} is not valid JSON ({exc}); starting a new list",
                    file=sys.stderr,
                )
                entries = []

    if not isinstance(entries, list):
        # Convert a single stored value (e.g. one dict) into a list of entries
        entries = [entries]

    entries.append(obj)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=2)
def append_markdown(path, md_text):
    """Append ``md_text`` to the Markdown file at ``path``.

    The text is stripped of surrounding whitespace and followed by one blank
    line. The file is created when missing (append mode does that) and is
    always written as UTF-8, because the summaries contain non-ASCII
    characters and the platform default encoding may not be able to
    represent them.
    """
    with open(path, "a", encoding="utf-8") as f:
        f.write(md_text.strip() + "\n\n")
def _pdf_path_error(pdf_path, pdf_file, batch_mode):
    """Return the error message for an unusable PDF path, or None if it is fine."""
    if not pdf_file.exists():
        return f" ❌ Skipping {pdf_path}: File not found" if batch_mode else f"❌ Error: PDF file not found: {pdf_path}"
    if not pdf_file.is_file():
        return f" ❌ Skipping {pdf_path}: Not a file" if batch_mode else f"❌ Error: Path is not a file: {pdf_path}"
    if pdf_file.suffix.lower() != '.pdf':
        return f" ❌ Skipping {pdf_path}: Not a PDF" if batch_mode else f"❌ Error: File is not a PDF: {pdf_path}"
    return None


def _append_rows_csv(path, rows, columns):
    """Append ``rows`` to the CSV at ``path``; write the header for a new or empty file."""
    frame = pd.DataFrame(rows, columns=columns)
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode="a", header=needs_header, index=False)


def _lean_percent(region):
    """Lean tissue as a percentage of the region's total mass (BMC excluded), or None."""
    total = region["total_mass_lb"]
    if total > 0:
        return round(100 * region["lean_tissue_lb"] / total, 1)
    return None


def _overall_csv_row(measured_date, d, derived, total_mass):
    """Build the overall.csv row; the dict's key order is the column order."""
    return {
        "MeasuredDate": measured_date,
        "Height_in": derived["height_in"],
        "Height_ft_in": derived["height_ft_in"],
        "Weight_lb_Input": derived["weight_input_lb"],
        "DEXA_TotalMass_lb": round(total_mass, 1),
        "BodyFat_percent": d.get("body_fat_percent"),
        "LeanMass_percent": derived.get("lean_mass_percent"),
        "FatMass_lb": d.get("fat_mass_lb"),
        "LeanSoftTissue_lb": d.get("lean_soft_tissue_lb"),
        "BoneMineralContent_lb": d.get("bmc_lb"),
        "FatFreeMass_lb": derived.get("fat_free_mass_lb"),
        "BMI": derived["bmi"],
        "FFMI": derived.get("ffmi"),
        "FMI": derived.get("fmi"),
        "LST_Index": derived.get("lsti"),
        "ALM_lb": derived.get("alm_lb"),
        "SMI": derived.get("smi"),
        "VAT_Mass_lb": d.get("vat_mass_lb"),
        "VAT_Volume_in3": d.get("vat_volume_in3"),
        "VAT_Index": derived.get("vat_index"),
        "BMDI": derived.get("bmdi"),
        "Android_percent": d.get("android_percent"),
        "Gynoid_percent": d.get("gynoid_percent"),
        "AG_Ratio": d.get("ag_ratio"),
        "Trunk_to_Limb_Fat_Ratio": derived.get("trunk_to_limb_fat_ratio"),
        "Arms_Lean_pct": derived.get("arms_lean_pct"),
        "Legs_Lean_pct": derived.get("legs_lean_pct"),
        "Trunk_Lean_pct": derived.get("trunk_lean_pct"),
        "Arm_Symmetry_Index": derived.get("arm_symmetry_index"),
        "Leg_Symmetry_Index": derived.get("leg_symmetry_index"),
        "Adjusted_Body_Weight_lb": derived.get("adjusted_body_weight_lb"),
        "RMR_cal_per_day": d.get("rmr_cal_per_day"),
    }


def _overall_json_entry(measured_date, d, derived, total_mass):
    """Build the nested entry that is appended to overall.json."""
    regional_array = [
        {
            "region": name,
            "fat_percent": region["fat_percent"],
            # Same definition as the LeanPercent column of regional.csv
            "lean_percent": _lean_percent(region),
            "total_mass_lb": region["total_mass_lb"],
            "fat_tissue_lb": region["fat_tissue_lb"],
            "lean_tissue_lb": region["lean_tissue_lb"],
            "bmc_lb": region["bmc_lb"],
        }
        for name, region in d.get("regional", {}).items()
    ]
    muscle_balance_array = [
        {"region": name, **limb}
        for name, limb in d.get("muscle_balance", {}).items()
    ]
    return {
        "measured_date": measured_date,
        "anthropometrics": {
            "height_in": derived["height_in"],
            "height_ft_in": derived["height_ft_in"],
            "weight_input_lb": derived["weight_input_lb"],
            "dexa_total_mass_lb": round(total_mass, 1),
            "adjusted_body_weight_lb": derived.get("adjusted_body_weight_lb"),
            "bmi": derived["bmi"],
        },
        "composition": {
            "body_fat_percent": d.get("body_fat_percent"),
            "lean_mass_percent": derived.get("lean_mass_percent"),
            "fat_mass_lb": d.get("fat_mass_lb"),
            "lean_soft_tissue_lb": d.get("lean_soft_tissue_lb"),
            "bone_mineral_content_lb": d.get("bmc_lb"),
            "fat_free_mass_lb": derived.get("fat_free_mass_lb"),
            "derived_indices": {
                "ffmi": derived.get("ffmi"),
                "fmi": derived.get("fmi"),
                "lsti": derived.get("lsti"),
                "alm_lb": derived.get("alm_lb"),
                "smi": derived.get("smi"),
                "bmdi": derived.get("bmdi"),
            },
        },
        "regional": regional_array,
        "regional_analysis": {
            "trunk_to_limb_fat_ratio": derived.get("trunk_to_limb_fat_ratio"),
            "lean_mass_distribution": {
                "arms_percent": derived.get("arms_lean_pct"),
                "legs_percent": derived.get("legs_lean_pct"),
                "trunk_percent": derived.get("trunk_lean_pct"),
            },
        },
        "muscle_balance": muscle_balance_array,
        "symmetry_indices": {
            "arm_symmetry_index": derived.get("arm_symmetry_index"),
            "leg_symmetry_index": derived.get("leg_symmetry_index"),
        },
        "supplemental": {
            "android_percent": d.get("android_percent"),
            "gynoid_percent": d.get("gynoid_percent"),
            "ag_ratio": d.get("ag_ratio"),
            "vat": {
                "mass_lb": d.get("vat_mass_lb"),
                "volume_in3": d.get("vat_volume_in3"),
                "vat_index": derived.get("vat_index"),
            },
            "rmr_cal_per_day": d.get("rmr_cal_per_day"),
        },
        "bone_density": d.get("bone_density", {}),
    }


def process_single_pdf(pdf_path, height_in, weight_lb, outdir, batch_mode=False):
    """Process a single PDF file and return success status.

    Parses the report, computes the derived metrics and appends the results
    to overall.csv, regional.csv, muscle_balance.csv, overall.json and
    summary.md inside ``outdir``. When no scan date is found in the PDF,
    today's date is used. Any exception is reported on stderr and turned
    into a False return value.

    Args:
        pdf_path: Path to PDF file
        height_in: Height in inches
        weight_lb: Weight in pounds (optional)
        outdir: Output directory
        batch_mode: If True, use batch-style output messages

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Validate PDF file
        pdf_file = Path(pdf_path)
        problem = _pdf_path_error(pdf_path, pdf_file, batch_mode)
        if problem is not None:
            print(problem, file=sys.stderr)
            return False

        if batch_mode:
            print(f"\n📄 Processing: {pdf_file.name}")
        else:
            print("📊 Computing derived metrics...")

        # Parse PDF
        d = parse_dexa_pdf(pdf_path)

        # Report (but do not abort on) missing critical data
        if d.get("body_fat_percent") is None or d.get("total_mass_lb") is None:
            print(f" ⚠️ Warning: Missing critical data from {pdf_file.name}", file=sys.stderr)
            if d.get("body_fat_percent") is None:
                print(" - Body Fat % not found", file=sys.stderr)
            if d.get("total_mass_lb") is None:
                print(" - Total Mass not found", file=sys.stderr)

        # Process data
        measured_date_raw = d.get("measured_date") or datetime.now().strftime("%m/%d/%Y")
        measured_date = convert_date_to_iso(measured_date_raw)
        total_mass, derived = compute_derived(d, height_in=height_in, weight_lb=weight_lb)

        # overall.csv: one row per scan; columns follow the row's key order
        overall_row = _overall_csv_row(measured_date, d, derived, total_mass)
        write_or_append_csv(os.path.join(outdir, "overall.csv"), overall_row, list(overall_row))

        # regional.csv: one row per parsed region
        regional_cols = ["MeasuredDate","Region","FatPercent","LeanPercent","TotalMass_lb","FatTissue_lb","LeanTissue_lb","BMC_lb"]
        reg_rows = [
            {
                "MeasuredDate": measured_date,
                "Region": name,
                "FatPercent": r["fat_percent"],
                # Lean tissue only, not including BMC - matches BodySpec report
                "LeanPercent": _lean_percent(r),
                "TotalMass_lb": r["total_mass_lb"],
                "FatTissue_lb": r["fat_tissue_lb"],
                "LeanTissue_lb": r["lean_tissue_lb"],
                "BMC_lb": r["bmc_lb"],
            }
            for name, r in d.get("regional", {}).items()
        ]
        _append_rows_csv(os.path.join(outdir, "regional.csv"), reg_rows, regional_cols)

        # muscle_balance.csv: one row per parsed limb entry. Passing the
        # column list keeps the header correct even when nothing was parsed.
        mb_cols = ["MeasuredDate","Region","FatPercent","TotalMass_lb","FatMass_lb","LeanMass_lb","BMC_lb"]
        mb_rows = [
            {
                "MeasuredDate": measured_date,
                "Region": name,
                "FatPercent": r["fat_percent"],
                "TotalMass_lb": r["total_mass_lb"],
                "FatMass_lb": r["fat_mass_lb"],
                "LeanMass_lb": r["lean_mass_lb"],
                "BMC_lb": r["bmc_lb"],
            }
            for name, r in d.get("muscle_balance", {}).items()
        ]
        _append_rows_csv(os.path.join(outdir, "muscle_balance.csv"), mb_rows, mb_cols)

        # JSON
        overall_json = _overall_json_entry(measured_date, d, derived, total_mass)
        write_or_append_json(os.path.join(outdir, "overall.json"), overall_json)

        # Markdown summary
        md_text = make_markdown(measured_date, d, derived, total_mass)
        append_markdown(os.path.join(outdir, "summary.md"), md_text)

        # Single-file mode prints its detailed success info in the caller
        if batch_mode:
            print(f"{pdf_file.name}: Body fat {d.get('body_fat_percent')}%, FFMI {derived.get('ffmi')}")
        return True
    except Exception as e:
        # Boundary for one file: report the failure and let the caller go on
        if batch_mode:
            print(f" ❌ Error processing {pdf_path}: {e}", file=sys.stderr)
        else:
            print(f"❌ Error reading PDF: {e}", file=sys.stderr)
            print("This tool is specifically designed for BodySpec PDF reports.", file=sys.stderr)
        return False
def make_markdown(measured_date, d, derived, total_mass):
    """Render the Markdown summary for one scan.

    Args:
        measured_date: Scan date shown in the heading.
        d: Extracted data dict (scalar fields plus the "regional" table).
        derived: Derived metrics dict; must contain "height_in" and "bmi".
        total_mass: Total mass in pounds; shown rounded to one decimal.

    Returns:
        The summary as one newline-joined string. Optional lines (body fat,
        lean soft tissue, BMC, ALM/SMI, Android/Gynoid, VAT, RMR) appear
        only when all of their values are available.
    """
    lines = [
        f"# DEXA Summary — {measured_date}",
        "",
        f"- Height: {derived['height_in']} in",
        f"- Weight: {round(total_mass, 1)} lb",
    ]
    if d.get("body_fat_percent") is not None and d.get("fat_mass_lb") is not None:
        lines.append(f"- Body fat: {d['body_fat_percent']}% ({d['fat_mass_lb']} lb)")
    if d.get("lean_soft_tissue_lb") is not None:
        lines.append(f"- Lean soft tissue: {d['lean_soft_tissue_lb']} lb")
    if d.get("bmc_lb") is not None:
        lines.append(f"- Bone mineral content: {d['bmc_lb']} lb")
    # Plain ASCII hyphen in the label so it renders unambiguously
    lines.append(f"- Fat-free mass: {derived.get('fat_free_mass_lb')}")
    lines.append(f"- BMI: {derived['bmi']}")
    lines.append(f"- FFMI: {derived.get('ffmi')}; FMI: {derived.get('fmi')}; Lean Soft Tissue Index: {derived.get('lsti')}")
    if derived.get("alm_lb") is not None:
        lines.append(f"- Appendicular Lean Mass: {derived['alm_lb']} lb; Skeletal Muscle Index: {derived['smi']}")
    if d.get("android_percent") is not None and d.get("gynoid_percent") is not None and d.get("ag_ratio") is not None:
        lines.append(f"- Android: {d['android_percent']}%; Gynoid: {d['gynoid_percent']}%; A/G ratio: {d['ag_ratio']}")
    if d.get("vat_mass_lb") is not None and d.get("vat_volume_in3") is not None:
        lines.append(f"- VAT: {d['vat_mass_lb']} lb ({d['vat_volume_in3']} in³)")
    if d.get("rmr_cal_per_day") is not None:
        lines.append(f"- RMR: {d['rmr_cal_per_day']} cal/day")

    lines.append("")
    lines.append("## Regional")
    lines.extend(
        f"- {name}: {r['fat_percent']}% fat; {r['total_mass_lb']} lb total; {r['fat_tissue_lb']} lb fat; {r['lean_tissue_lb']} lb lean; {r['bmc_lb']} lb BMC"
        for name, r in d.get("regional", {}).items()
    )
    return "\n".join(lines)
def _peek_scan_date(pdf_file):
    """Best-effort: return the ISO scan date found in ``pdf_file``, or None."""
    try:
        raw_date = parse_dexa_pdf(str(pdf_file)).get("measured_date")
        return convert_date_to_iso(raw_date) if raw_date else None
    except Exception:
        # An unreadable PDF is reported when it is actually processed; here
        # it only means no date is available for the duplicate check.
        return None


def _run_batch(args):
    """Process every PDF in ``args.batch``; exit with status 1 on errors."""
    batch_dir = Path(args.batch)
    if not batch_dir.exists():
        print(f"❌ Error: Directory not found: {args.batch}", file=sys.stderr)
        sys.exit(1)
    if not batch_dir.is_dir():
        print(f"❌ Error: Not a directory: {args.batch}", file=sys.stderr)
        sys.exit(1)

    # Find all PDF files in directory. The suffix test is case-insensitive,
    # matching the check in process_single_pdf (so "scan.PDF" is included).
    pdf_files = sorted(p for p in batch_dir.iterdir() if p.suffix.lower() == ".pdf")
    if not pdf_files:
        print(f"❌ Error: No PDF files found in: {args.batch}", file=sys.stderr)
        sys.exit(1)

    # Get already-processed dates
    processed_dates = set()
    if not args.force:
        processed_dates = get_processed_dates(args.outdir)
        if processed_dates:
            print(f"📋 Found {len(processed_dates)} already-processed scan(s) in {args.outdir}")

    print(f"📦 Batch mode: Found {len(pdf_files)} PDF file(s) in {args.batch}")
    print(f"📂 Output directory: {args.outdir}\n")

    success_count = 0
    fail_count = 0
    skip_count = 0
    for pdf_file in pdf_files:
        scan_date = None
        if not args.force:
            # Quick check: extract the date and see if it was already handled,
            # either in an earlier run or earlier in this batch.
            scan_date = _peek_scan_date(pdf_file)
            if scan_date is not None and scan_date in processed_dates:
                print(f"\n⏭️ Skipping: {pdf_file.name} (date {scan_date} already processed)")
                skip_count += 1
                continue

        if process_single_pdf(str(pdf_file), args.height_in, args.weight_lb, args.outdir, batch_mode=True):
            success_count += 1
            if scan_date is not None:
                # Remember the date so a second PDF of the same scan in this
                # batch is skipped instead of being written twice.
                processed_dates.add(scan_date)
        else:
            fail_count += 1

    print(f"\n{'='*60}")
    print(f"✅ Batch complete: {success_count} succeeded, {skip_count} skipped, {fail_count} failed")
    print(f"📁 Results saved to: {args.outdir}")
    # Files are only ever skipped when --force is absent
    if skip_count > 0:
        print(" 💡 Tip: Use --force to reprocess skipped scans")
    if fail_count > 0:
        sys.exit(1)


def main():
    """Command-line entry point.

    Parses the arguments, validates height/weight, creates the output
    directory and then runs either batch mode (``--batch DIR``) or
    single-file mode. Exits with status 1 on invalid arguments, when the
    output directory cannot be created, or when processing fails.
    """
    ap = argparse.ArgumentParser(
        description="BodySpec Insights - Extract and analyze body composition data from BodySpec DEXA scan PDFs",
        epilog="Examples:\n"
        " Single: python dexa_extract.py scan.pdf --height-in 74 --outdir ./data/results\n"
        " Batch: python dexa_extract.py --batch data/pdfs --height-in 74 --outdir ./data/results",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    ap.add_argument("pdf", nargs="?", help="Path to BodySpec DEXA report PDF (not used with --batch)")
    ap.add_argument("--batch", metavar="DIR", help="Process all PDFs in directory (skips already-processed dates)")
    ap.add_argument("--height-in", type=float, required=True, help="Height in inches (e.g., 6'2\" = 74)")
    ap.add_argument("--weight-lb", type=float, help="Body weight in lbs (optional; used if DEXA total mass missing)")
    ap.add_argument("--outdir", default="dexa_out", help="Output directory (default: dexa_out)")
    ap.add_argument("--force", action="store_true", help="Reprocess all files, even if already in output")
    args = ap.parse_args()

    # Check that either pdf or --batch is provided
    if not args.pdf and not args.batch:
        print("❌ Error: Must provide either a PDF file or --batch directory", file=sys.stderr)
        ap.print_help()
        sys.exit(1)
    if args.pdf and args.batch:
        print("❌ Error: Cannot use both PDF file and --batch. Choose one.", file=sys.stderr)
        sys.exit(1)

    # Validate height. Written as a chained comparison so that NaN (which
    # argparse's float() accepts) is rejected too.
    if not 36 <= args.height_in <= 96:
        print(f"❌ Error: Height seems unrealistic: {args.height_in} inches (expected 36-96 inches / 3'-8')", file=sys.stderr)
        sys.exit(1)

    # Validate weight if provided
    if args.weight_lb is not None and not 50 <= args.weight_lb <= 500:
        print(f"❌ Error: Weight seems unrealistic: {args.weight_lb} lbs (expected 50-500 lbs)", file=sys.stderr)
        sys.exit(1)

    # Create output directory
    try:
        ensure_outdir(args.outdir)
    except PermissionError:
        print(f"❌ Error: Cannot create output directory: {args.outdir} (permission denied)", file=sys.stderr)
        sys.exit(1)
    except OSError as exc:
        # e.g. the path exists but is a regular file
        print(f"❌ Error: Cannot create output directory: {args.outdir} ({exc})", file=sys.stderr)
        sys.exit(1)

    # Batch mode
    if args.batch:
        _run_batch(args)
        return

    # Single file mode
    print(f"📄 Reading PDF: {args.pdf}")
    # Use the shared processing function
    success = process_single_pdf(args.pdf, args.height_in, args.weight_lb, args.outdir, batch_mode=False)
    if not success:
        sys.exit(1)

    # Read the latest entry from overall.json to get the summary data
    try:
        json_path = os.path.join(args.outdir, "overall.json")
        if os.path.exists(json_path):
            with open(json_path, 'r', encoding="utf-8") as f:
                data = json.load(f)
            latest = data[-1] if isinstance(data, list) and data else data
            measured_date = latest.get("measured_date", "Unknown")
            body_fat = latest.get("composition", {}).get("body_fat_percent", "N/A")
            ffmi = latest.get("composition", {}).get("derived_indices", {}).get("ffmi", "N/A")
        else:
            measured_date = body_fat = ffmi = "N/A"
    except Exception:
        # The summary is informational only; the files were already written
        measured_date = body_fat = ffmi = "N/A"

    # Success output
    print(f"\n✅ Success! Wrote files to: {args.outdir}")
    print(" 📁 Files created:")
    print(" - overall.csv (time-series data)")
    print(" - regional.csv (body composition by region)")
    print(" - muscle_balance.csv (left/right symmetry)")
    print(" - overall.json (structured data)")
    print(" - summary.md (readable report)")
    print(f"\n 📈 Scan date: {measured_date}")
    print(f" 💪 Body fat: {body_fat}%")
    print(f" 🏋️ FFMI: {ffmi}")
if __name__ == "__main__":
main()