# ============================================================================
# EXTRACTION PATTERN DICTIONARY
# ============================================================================
# Centralized pattern definitions for extracting data from BodySpec PDFs.
#
# Each field has:
#   - patterns: List of regex patterns to try (in order of preference)
#   - group: Which capture group to extract from each pattern
#            (one entry per pattern, same order)
#   - cast: Function to convert extracted string to desired type
#   - required: Whether this field must be present for valid extraction
#   - description: Human-readable explanation of the field
#
# Benefits of this approach:
#   1. Self-documenting - patterns describe what they extract
#   2. Maintainable - add/modify patterns in one place
#   3. Robust - multiple patterns provide fallback options
#   4. Validated - know exactly which required fields are missing
#
# To add a new field:
#   1. Add entry to this dictionary
#   2. Pattern will automatically be used by extract_all_fields()
#   3. Validation will check if required fields are present
#
# NOTE: extract_field() runs every pattern with re.search() over the FULL
# text using re.IGNORECASE only (no re.MULTILINE), so "$" matches only at
# the end of the whole document, not at the end of each line.
# ============================================================================

# One row of the SUMMARY RESULTS table, e.g.
# "10/6/2025 27.8% 211.6 58.8 145.4 7.4".
# Capture groups: 1 = date, 2 = body fat %, 3 = total mass, 4 = fat mass,
# 5 = lean soft tissue, 6 = bone mineral content.
_SUMMARY_ROW_PATTERN = (
    r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
)

EXTRACTION_PATTERNS = {
    # Primary body composition metrics
    # (from SUMMARY RESULTS table or individual fields)
    'body_fat_percent': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 2)
            r"Total Body Fat %\s+([\d\.]+)",  # Individual field
        ],
        'group': [2, 1],  # Which capture group for each pattern
        'cast': float,
        'required': True,
        'description': 'Total body fat percentage',
    },
    'total_mass_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 3)
            r"Total Mass.*?\(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [3, 1],
        'cast': float,
        'required': True,
        'description': 'Total body mass in pounds',
    },
    'fat_mass_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 4)
            r"Fat Tissue \(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [4, 1],
        'cast': float,
        'required': True,
        'description': 'Total fat tissue mass',
    },
    'lean_soft_tissue_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 5)
            r"Lean Tissue \(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [5, 1],
        'cast': float,
        'required': True,
        'description': 'Lean soft tissue mass',
    },
    'bmc_lb': {
        'patterns': [
            _SUMMARY_ROW_PATTERN,  # Summary table (group 6)
            r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [6, 1],
        'cast': float,
        'required': True,
        'description': 'Bone mineral content',
    },

    # Date information
    'measured_date': {
        'patterns': [
            r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})",  # After weight in client info
            # Date at the end of the FIRST line of the text. Anchored with
            # \A plus a newline look-ahead instead of a bare "$": without
            # re.MULTILINE, "$" would only match at the end of the whole
            # document, so the fallback would never see the first line.
            r"\A[^\n]*?(\d{1,2}/\d{1,2}/\d{4})[^\S\n]*(?=\n|\Z)",
        ],
        'group': [1, 1],
        'cast': str,
        'required': True,
        'description': 'Date of DEXA scan',
    },

    # Regional fat distribution
    'android_percent': {
        'patterns': [r"Android.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Android region fat percentage (belly area)',
    },
    'gynoid_percent': {
        'patterns': [r"Gynoid.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Gynoid region fat percentage (hips/thighs)',
    },
    'ag_ratio': {
        'patterns': [
            # On same line as RMR (group 3),
            # e.g. "1,778 cal/day 36.5% 27.8% 1.31"
            r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)",
            r"A/G Ratio\s+([\d\.]+)",  # Standalone field
        ],
        'group': [3, 1],
        'cast': float,
        'required': False,
        'description': 'Android/Gynoid ratio (central vs peripheral fat)',
    },

    # Metabolic and supplemental metrics
    'rmr_cal_per_day': {
        'patterns': [r"([\d,]+)\s*cal/day"],
        'group': [1],
        'cast': lambda s: int(s.replace(',', '')),
        'required': False,
        'description': 'Resting metabolic rate (calories per day)',
    },
    'vat_mass_lb': {
        'patterns': [r"Mass \(lbs\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue mass',
    },
    'vat_volume_in3': {
        'patterns': [r"Volume \(in3\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue volume',
    },
}


def read_pdf_text(pdf_path):
    """
    Extract all text from PDF pages

    Args:
        pdf_path: Path of the PDF, passed straight to pdfplumber.open()

    Returns:
        Text of every page joined with newlines; a page whose
        extract_text() result is falsy contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages_text)


def extract_field(text, patterns, groups, cast=float, flags=re.IGNORECASE):
    """
    Extract a field using multiple pattern attempts

    Patterns are tried in order with re.search() over the whole text. The
    first one that matches AND whose captured value survives `cast` wins;
    a pattern whose capture cannot be converted is skipped and the next
    pattern is tried.

    Args:
        text: Full PDF text
        patterns: List of regex patterns to try (in order)
        groups: List of capture group indices (one per pattern); patterns
            and groups are paired with zip(), so extras on either side
            are ignored
        cast: Function to convert extracted string to desired type;
            a falsy value returns the cleaned string unchanged
        flags: Regex flags applied to every pattern

    Returns:
        Extracted value or None
    """
    for pattern, group in zip(patterns, groups):
        match = re.search(pattern, text, flags)
        if match is None:
            continue
        try:
            raw = match.group(group)
            if raw is None:
                # The group exists but did not take part in the match
                # (e.g. it sits in an optional branch): try next pattern.
                continue
            # Commas are stripped so values like "1,778" can be cast.
            val = raw.replace(",", "").strip()
            return cast(val) if cast else val
        except (ValueError, IndexError, AttributeError):
            # Unparseable capture (e.g. a lone ".") or a group index the
            # pattern does not define: fall through to the next pattern.
            continue
    return None


def extract_all_fields(text, patterns_dict=None):
    """
    Extract all defined fields from PDF text

    Args:
        text: Full PDF text content
        patterns_dict: Field definitions (uses EXTRACTION_PATTERNS if None).
            Each entry needs 'patterns' and 'group'; 'cast' defaults to
            float and 'required' to False when omitted.

    Returns:
        Tuple (data, missing_required): data maps every field name to its
        extracted value (None when nothing matched); missing_required
        lists the required fields whose value is None, in dictionary order.
    """
    if patterns_dict is None:
        patterns_dict = EXTRACTION_PATTERNS

    data = {}
    missing_required = []

    for field_name, config in patterns_dict.items():
        value = extract_field(
            text,
            config['patterns'],
            config['group'],
            config.get('cast', float),
        )
        data[field_name] = value

        if value is None and config.get('required', False):
            missing_required.append(field_name)

    return data, missing_required


def parse_dexa_pdf(pdf_path):
    """
    Extract all data from BodySpec DEXA PDF

    Uses pattern dictionary for main fields and specialized parsers for tables.
    Returns dict with all extracted data and validation warnings.
    """
    text = read_pdf_text(pdf_path)

    # Extract all defined fields using pattern dictionary
    data, missing_required = extract_all_fields(text)

    # Warn about missing required fields
    if missing_required:
        print(f" ⚠️ Warning: Missing required fields: {', '.join(missing_required)}", file=sys.stderr)

    # Parse structured tables (regional, muscle balance, bone density)
    data["regional"] = parse_regional_table(text)
    data["muscle_balance"] = parse_muscle_balance(text)
    data["bone_density"] = parse_bone_density_total(text)
    # NOTE(review): the visible patch hunk ends here; the remainder of this
    # function (presumably at least a `return data`) lies outside the diff
    # context and is intentionally not reconstructed - confirm against the
    # full file.