From b046af5d25b41847a4bc3e48aa46b8381f8ee2ab Mon Sep 17 00:00:00 2001 From: Mac DeCourcy Date: Mon, 6 Oct 2025 15:33:05 -0700 Subject: [PATCH] feat: smart batch processing with skip logic - Change --batch to accept directory instead of glob pattern - Automatically skip already-processed scan dates - Add --force flag to reprocess all files - Fix date extraction regex to parse from client info line - Display helpful tips about skipping/forcing - Better user feedback with skip counts and suggestions Usage: python dexa_extract.py --batch data/pdfs --height-in 74 --outdir data/results This will process only new scans, skipping any dates already in the output. --- README.md | 48 ++++++- data/results/README.md | 18 --- dexa_extract.py | 314 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 342 insertions(+), 38 deletions(-) delete mode 100644 data/results/README.md diff --git a/README.md b/README.md index e3dbb3c..1bc8b1b 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,16 @@ python dexa_extract.py --height-in [--weight-lb ] [- python dexa_extract.py data/pdfs/2025-10-06-scan.pdf --height-in 74 --weight-lb 212 --outdir data/results ``` -**Process multiple scans** (appends to existing files): +**Batch process multiple scans:** +```bash +# Process all PDFs in a directory (automatically skips already-processed dates) +python dexa_extract.py --batch data/pdfs --height-in 74 --outdir data/results + +# Force reprocessing all files +python dexa_extract.py --batch data/pdfs --height-in 74 --outdir data/results --force +``` + +**Individual scans** (appends to existing files): ```bash python dexa_extract.py data/pdfs/scan-2025-01.pdf --height-in 74 --outdir data/results python dexa_extract.py data/pdfs/scan-2025-04.pdf --height-in 74 --outdir data/results @@ -247,10 +256,35 @@ Higher trunk percentage may indicate good core development, while higher leg per The script appends data to existing CSV files, making it easy to track changes over time: -1. Place all your DEXA PDFs in `data/pdfs/` -2. Process each one with the same output directory -3. Open `overall.csv` in Excel/Google Sheets to visualize trends -4. Compare `muscle_balance.csv` to track left/right symmetry improvements +### Option 1: Batch Processing (Recommended) +```bash +# Place all your PDFs in one directory +data/pdfs/ +├── scan-2025-01-15.pdf +├── scan-2025-04-20.pdf +└── scan-2025-10-06.pdf + +# Process all at once (automatically skips already-processed dates) +python dexa_extract.py --batch data/pdfs --height-in 74 --outdir data/results + +# Add new scans later - only new ones will be processed +cp ~/Downloads/scan-2025-12-15.pdf data/pdfs/ +python dexa_extract.py --batch data/pdfs --height-in 74 --outdir data/results +``` + +### Option 2: Individual Processing +```bash +# Process scans as you get them +python dexa_extract.py data/pdfs/scan-2025-01.pdf --height-in 74 --outdir data/results +python dexa_extract.py data/pdfs/scan-2025-04.pdf --height-in 74 --outdir data/results +python dexa_extract.py data/pdfs/scan-2025-10.pdf --height-in 74 --outdir data/results +``` + +### Analyzing Results +1. Open `overall.csv` in Excel/Google Sheets to visualize trends +2. Compare `muscle_balance.csv` to track left/right symmetry improvements +3. Review `summary.md` for readable reports of each scan +4. Use `overall.json` for programmatic analysis ## Privacy & Security @@ -281,12 +315,12 @@ The script appends data to existing CSV files, making it easy to track changes o Contributions welcome! Areas for improvement: -- [ ] Enhanced error handling and validation - [ ] Automatic height detection from PDF - [ ] Data visualization/plotting features - [ ] GUI interface for non-technical users -- [ ] Batch processing multiple PDFs at once - [ ] Export to additional formats (Excel, SQLite, etc.) +- [ ] Support for older BodySpec PDF formats +- [ ] Progress bar for batch processing ## License diff --git a/data/results/README.md b/data/results/README.md deleted file mode 100644 index 85e71f3..0000000 --- a/data/results/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Results Directory - -Your extracted DEXA data will be saved here by default. - -## Output Files - -When you run the extraction script with `--outdir data/results`, you'll get: - -- `overall.csv` - Time-series data (one row per scan) -- `regional.csv` - Regional body composition -- `muscle_balance.csv` - Left/right limb comparison -- `overall.json` - Structured JSON format -- `summary.md` - Human-readable summary - -## Note - -⚠️ **Result files are gitignored** - They contain your personal health data and won't be committed to version control. - diff --git a/dexa_extract.py b/dexa_extract.py index 240976f..38cbb7e 100644 --- a/dexa_extract.py +++ b/dexa_extract.py @@ -22,7 +22,6 @@ import re import sys from datetime import datetime from pathlib import Path - import pdfplumber import pandas as pd @@ -30,6 +29,21 @@ class ValidationError(Exception): """Custom exception for validation errors""" pass +def get_processed_dates(outdir): + """Get list of already-processed scan dates from existing CSV""" + overall_csv = Path(outdir) / "overall.csv" + if not overall_csv.exists(): + return set() + + try: + df = pd.read_csv(overall_csv) + if 'MeasuredDate' in df.columns: + return set(df['MeasuredDate'].dropna().unique()) + except Exception: + pass + + return set() + def read_pdf_text(pdf_path): with pdfplumber.open(pdf_path) as pdf: pages_text = [page.extract_text() or "" for page in pdf.pages] @@ -109,7 +123,13 @@ def parse_dexa_pdf(pdf_path): text = read_pdf_text(pdf_path) data = {} - data["measured_date"] = find_one(r"Measured Date\s+([\d/]+)", text, cast=str) + # Try to extract date from client info line: "Name Male 9/26/1995 74.0 in. 213.0 lbs. 10/6/2025" + # The last date on the line is the measured date + date_match = re.search(r"(\d{1,2}/\d{1,2}/\d{4})\s*$", text.split('\n')[0] if '\n' in text else text, re.MULTILINE) + if not date_match: + # Try finding it in the full text - look for pattern at end of client info lines + date_match = re.search(r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})", text) + data["measured_date"] = date_match.group(1) if date_match else None # First try to extract from SUMMARY RESULTS table (more reliable) # Pattern: 10/6/2025 27.8% 211.6 58.8 145.4 7.4 @@ -300,6 +320,196 @@ def append_markdown(path, md_text): with open(path, mode) as f: f.write(md_text.strip() + "\n\n") +def process_single_pdf(pdf_path, height_in, weight_lb, outdir): + """Process a single PDF file and return success status""" + try: + # Validate PDF file + pdf_file = Path(pdf_path) + if not pdf_file.exists(): + print(f" ❌ Skipping {pdf_path}: File not found", file=sys.stderr) + return False + if not pdf_file.is_file(): + print(f" ❌ Skipping {pdf_path}: Not a file", file=sys.stderr) + return False + if pdf_file.suffix.lower() != '.pdf': + print(f" ❌ Skipping {pdf_path}: Not a PDF", file=sys.stderr) + return False + + print(f"\n📄 Processing: {pdf_file.name}") + + # Parse PDF + d = parse_dexa_pdf(pdf_path) + + # Check if critical data was extracted + if d.get("body_fat_percent") is None or d.get("total_mass_lb") is None: + print(f" ⚠️ Warning: Missing critical data from {pdf_file.name}", file=sys.stderr) + if d.get("body_fat_percent") is None: + print(" - Body Fat % not found", file=sys.stderr) + if d.get("total_mass_lb") is None: + print(" - Total Mass not found", file=sys.stderr) + + # Process data + measured_date_raw = d.get("measured_date") or datetime.now().strftime("%m/%d/%Y") + measured_date = convert_date_to_iso(measured_date_raw) + total_mass, derived = compute_derived(d, height_in=height_in, weight_lb=weight_lb) + + # Write output files (same as before) + overall_cols = [ + "MeasuredDate","Height_in","Height_ft_in","Weight_lb_Input","DEXA_TotalMass_lb","BodyFat_percent", + "LeanMass_percent","FatMass_lb","LeanSoftTissue_lb","BoneMineralContent_lb","FatFreeMass_lb", + "BMI","FFMI","FMI","LST_Index","ALM_lb","SMI","VAT_Mass_lb","VAT_Volume_in3","VAT_Index", + "BMDI","Android_percent","Gynoid_percent","AG_Ratio","Trunk_to_Limb_Fat_Ratio", + "Arms_Lean_pct","Legs_Lean_pct","Trunk_Lean_pct","Arm_Symmetry_Index","Leg_Symmetry_Index", + "Adjusted_Body_Weight_lb","RMR_cal_per_day" + ] + overall_row = { + "MeasuredDate": measured_date, + "Height_in": derived["height_in"], + "Height_ft_in": derived["height_ft_in"], + "Weight_lb_Input": derived["weight_input_lb"], + "DEXA_TotalMass_lb": round(total_mass, 1), + "BodyFat_percent": d.get("body_fat_percent"), + "LeanMass_percent": derived.get("lean_mass_percent"), + "FatMass_lb": d.get("fat_mass_lb"), + "LeanSoftTissue_lb": d.get("lean_soft_tissue_lb"), + "BoneMineralContent_lb": d.get("bmc_lb"), + "FatFreeMass_lb": derived.get("fat_free_mass_lb"), + "BMI": derived["bmi"], + "FFMI": derived.get("ffmi"), + "FMI": derived.get("fmi"), + "LST_Index": derived.get("lsti"), + "ALM_lb": derived.get("alm_lb"), + "SMI": derived.get("smi"), + "VAT_Mass_lb": d.get("vat_mass_lb"), + "VAT_Volume_in3": d.get("vat_volume_in3"), + "VAT_Index": derived.get("vat_index"), + "BMDI": derived.get("bmdi"), + "Android_percent": d.get("android_percent"), + "Gynoid_percent": d.get("gynoid_percent"), + "AG_Ratio": d.get("ag_ratio"), + "Trunk_to_Limb_Fat_Ratio": derived.get("trunk_to_limb_fat_ratio"), + "Arms_Lean_pct": derived.get("arms_lean_pct"), + "Legs_Lean_pct": derived.get("legs_lean_pct"), + "Trunk_Lean_pct": derived.get("trunk_lean_pct"), + "Arm_Symmetry_Index": derived.get("arm_symmetry_index"), + "Leg_Symmetry_Index": derived.get("leg_symmetry_index"), + "Adjusted_Body_Weight_lb": derived.get("adjusted_body_weight_lb"), + "RMR_cal_per_day": d.get("rmr_cal_per_day"), + } + write_or_append_csv(os.path.join(outdir, "overall.csv"), overall_row, overall_cols) + + # Regional table + regional_cols = ["Region","FatPercent","TotalMass_lb","FatTissue_lb","LeanTissue_lb","BMC_lb"] + reg_rows = [] + for name, r in d.get("regional", {}).items(): + reg_rows.append({ + "Region": name, + "FatPercent": r["fat_percent"], + "TotalMass_lb": r["total_mass_lb"], + "FatTissue_lb": r["fat_tissue_lb"], + "LeanTissue_lb": r["lean_tissue_lb"], + "BMC_lb": r["bmc_lb"], + }) + regional_path = os.path.join(outdir, "regional.csv") + if os.path.exists(regional_path): + pd.DataFrame(reg_rows).to_csv(regional_path, mode="a", header=False, index=False) + else: + pd.DataFrame(reg_rows).to_csv(regional_path, index=False) + + # Muscle balance + mb_cols = ["Region","FatPercent","TotalMass_lb","FatMass_lb","LeanMass_lb","BMC_lb"] + mb_rows = [] + for name, r in d.get("muscle_balance", {}).items(): + mb_rows.append({ + "Region": name, + "FatPercent": r["fat_percent"], + "TotalMass_lb": r["total_mass_lb"], + "FatMass_lb": r["fat_mass_lb"], + "LeanMass_lb": r["lean_mass_lb"], + "BMC_lb": r["bmc_lb"], + }) + mb_path = os.path.join(outdir, "muscle_balance.csv") + if os.path.exists(mb_path): + pd.DataFrame(mb_rows).to_csv(mb_path, mode="a", header=False, index=False) + else: + pd.DataFrame(mb_rows).to_csv(mb_path, index=False) + + # JSON + regional_array = [ + {"region": name, **data} + for name, data in d.get("regional", {}).items() + ] + muscle_balance_array = [ + {"region": name, **data} + for name, data in d.get("muscle_balance", {}).items() + ] + + overall_json = { + "measured_date": measured_date, + "anthropometrics": { + "height_in": derived["height_in"], + "height_ft_in": derived["height_ft_in"], + "weight_input_lb": derived["weight_input_lb"], + "dexa_total_mass_lb": round(total_mass, 1), + "adjusted_body_weight_lb": derived.get("adjusted_body_weight_lb"), + "bmi": derived["bmi"] + }, + "composition": { + "body_fat_percent": d.get("body_fat_percent"), + "lean_mass_percent": derived.get("lean_mass_percent"), + "fat_mass_lb": d.get("fat_mass_lb"), + "lean_soft_tissue_lb": d.get("lean_soft_tissue_lb"), + "bone_mineral_content_lb": d.get("bmc_lb"), + "fat_free_mass_lb": derived.get("fat_free_mass_lb"), + "derived_indices": { + "ffmi": derived.get("ffmi"), + "fmi": derived.get("fmi"), + "lsti": derived.get("lsti"), + "alm_lb": derived.get("alm_lb"), + "smi": derived.get("smi"), + "bmdi": derived.get("bmdi") + } + }, + "regional": regional_array, + "regional_analysis": { + "trunk_to_limb_fat_ratio": derived.get("trunk_to_limb_fat_ratio"), + "lean_mass_distribution": { + "arms_percent": derived.get("arms_lean_pct"), + "legs_percent": derived.get("legs_lean_pct"), + "trunk_percent": derived.get("trunk_lean_pct") + } + }, + "muscle_balance": muscle_balance_array, + "symmetry_indices": { + "arm_symmetry_index": derived.get("arm_symmetry_index"), + "leg_symmetry_index": derived.get("leg_symmetry_index") + }, + "supplemental": { + "android_percent": d.get("android_percent"), + "gynoid_percent": d.get("gynoid_percent"), + "ag_ratio": d.get("ag_ratio"), + "vat": { + "mass_lb": d.get("vat_mass_lb"), + "volume_in3": d.get("vat_volume_in3"), + "vat_index": derived.get("vat_index") + }, + "rmr_cal_per_day": d.get("rmr_cal_per_day") + }, + "bone_density": d.get("bone_density", {}) + } + write_or_append_json(os.path.join(outdir, "overall.json"), overall_json) + + # Markdown summary + md_text = make_markdown(measured_date, d, derived, total_mass) + append_markdown(os.path.join(outdir, "summary.md"), md_text) + + print(f" ✅ {pdf_file.name}: Body fat {d.get('body_fat_percent')}%, FFMI {derived.get('ffmi')}") + return True + + except Exception as e: + print(f" ❌ Error processing {pdf_path}: {e}", file=sys.stderr) + return False + def make_markdown(measured_date, d, derived, total_mass): lines = [] lines.append(f"# DEXA Summary — {measured_date}") @@ -332,24 +542,26 @@ def make_markdown(measured_date, d, derived, total_mass): def main(): ap = argparse.ArgumentParser( description="BodySpec Insights - Extract and analyze body composition data from BodySpec DEXA scan PDFs", - epilog="Example: python dexa_extract.py scan.pdf --height-in 74 --weight-lb 212 --outdir ./data/results" + epilog="Examples:\n" + " Single: python dexa_extract.py scan.pdf --height-in 74 --outdir ./data/results\n" + " Batch: python dexa_extract.py --batch data/pdfs --height-in 74 --outdir ./data/results", + formatter_class=argparse.RawDescriptionHelpFormatter ) - ap.add_argument("pdf", help="Path to BodySpec DEXA report PDF") + ap.add_argument("pdf", nargs="?", help="Path to BodySpec DEXA report PDF (not used with --batch)") + ap.add_argument("--batch", metavar="DIR", help="Process all PDFs in directory (skips already-processed dates)") ap.add_argument("--height-in", type=float, required=True, help="Height in inches (e.g., 6'2\" = 74)") ap.add_argument("--weight-lb", type=float, help="Body weight in lbs (optional; used if DEXA total mass missing)") ap.add_argument("--outdir", default="dexa_out", help="Output directory (default: dexa_out)") + ap.add_argument("--force", action="store_true", help="Reprocess all files, even if already in output") args = ap.parse_args() - # Validate PDF file exists - pdf_file = Path(args.pdf) - if not pdf_file.exists(): - print(f"❌ Error: PDF file not found: {args.pdf}", file=sys.stderr) + # Check that either pdf or --batch is provided + if not args.pdf and not args.batch: + print("❌ Error: Must provide either a PDF file or --batch directory", file=sys.stderr) + ap.print_help() sys.exit(1) - if not pdf_file.is_file(): - print(f"❌ Error: Path is not a file: {args.pdf}", file=sys.stderr) - sys.exit(1) - if pdf_file.suffix.lower() != '.pdf': - print(f"❌ Error: File is not a PDF: {args.pdf}", file=sys.stderr) + if args.pdf and args.batch: + print("❌ Error: Cannot use both PDF file and --batch. Choose one.", file=sys.stderr) sys.exit(1) # Validate height @@ -362,12 +574,88 @@ def main(): print(f"❌ Error: Weight seems unrealistic: {args.weight_lb} lbs (expected 50-500 lbs)", file=sys.stderr) sys.exit(1) + # Create output directory try: ensure_outdir(args.outdir) except PermissionError: print(f"❌ Error: Cannot create output directory: {args.outdir} (permission denied)", file=sys.stderr) sys.exit(1) + # Batch mode + if args.batch: + batch_dir = Path(args.batch) + if not batch_dir.exists(): + print(f"❌ Error: Directory not found: {args.batch}", file=sys.stderr) + sys.exit(1) + if not batch_dir.is_dir(): + print(f"❌ Error: Not a directory: {args.batch}", file=sys.stderr) + sys.exit(1) + + # Find all PDF files in directory + pdf_files = sorted(batch_dir.glob("*.pdf")) + if not pdf_files: + print(f"❌ Error: No PDF files found in: {args.batch}", file=sys.stderr) + sys.exit(1) + + # Get already-processed dates + processed_dates = set() + if not args.force: + processed_dates = get_processed_dates(args.outdir) + if processed_dates: + print(f"📋 Found {len(processed_dates)} already-processed scan(s) in {args.outdir}") + + print(f"📦 Batch mode: Found {len(pdf_files)} PDF file(s) in {args.batch}") + print(f"📂 Output directory: {args.outdir}\n") + + success_count = 0 + fail_count = 0 + skip_count = 0 + + for pdf_file in pdf_files: + # Quick check: try to extract date and see if already processed + if not args.force and processed_dates: + try: + d_temp = parse_dexa_pdf(str(pdf_file)) + measured_date_raw = d_temp.get("measured_date") + if measured_date_raw: + measured_date = convert_date_to_iso(measured_date_raw) + if measured_date in processed_dates: + print(f"\n⏭️ Skipping: {pdf_file.name} (date {measured_date} already processed)") + skip_count += 1 + continue + except Exception: + pass # If we can't extract date, try to process anyway + + if process_single_pdf(str(pdf_file), args.height_in, args.weight_lb, args.outdir): + success_count += 1 + else: + fail_count += 1 + + print(f"\n{'='*60}") + print(f"✅ Batch complete: {success_count} succeeded, {skip_count} skipped, {fail_count} failed") + print(f"📁 Results saved to: {args.outdir}") + + if args.force and skip_count > 0: + print(f" 💡 Tip: Remove --force flag to skip already-processed scans") + elif skip_count > 0: + print(f" 💡 Tip: Use --force to reprocess skipped scans") + + if fail_count > 0: + sys.exit(1) + return + + # Single file mode + pdf_file = Path(args.pdf) + if not pdf_file.exists(): + print(f"❌ Error: PDF file not found: {args.pdf}", file=sys.stderr) + sys.exit(1) + if not pdf_file.is_file(): + print(f"❌ Error: Path is not a file: {args.pdf}", file=sys.stderr) + sys.exit(1) + if pdf_file.suffix.lower() != '.pdf': + print(f"❌ Error: File is not a PDF: {args.pdf}", file=sys.stderr) + sys.exit(1) + print(f"📄 Reading PDF: {args.pdf}") try: