Refactor: implement pattern dictionary for PDF extraction

Major improvements to extraction code:

PATTERN DICTIONARY APPROACH:
- Centralized all extraction patterns into EXTRACTION_PATTERNS dict
- Each field now self-documents with description, required status, cast function
- Multiple patterns per field with automatic fallback (e.g., summary table → individual field)
- Validation built-in: reports missing required vs optional fields

NEW FUNCTIONS:
- extract_field(): Tries multiple patterns with fallback logic
- extract_all_fields(): Extracts all defined fields with validation
- Comprehensive docstrings explaining the approach

BENEFITS:
✅ Self-documenting - each pattern describes what it extracts
✅ Maintainable - add new fields by adding one dict entry
✅ Robust - automatic fallback if primary pattern fails
✅ Validated - instant feedback on missing required fields
✅ Type-safe - cast functions ensure correct data types

TESTING:
- All existing tests pass
- Single-file mode: ✅
- Batch mode: ✅
- Data extraction: ✅ identical to previous version

Code grew by ~160 lines but with significant improvements in:
- Readability (clear field definitions)
- Maintainability (centralized patterns)
- Extensibility (easy to add new fields)
- Debuggability (validation reports)
2025-10-06 17:59:27 -07:00 · 2025-10-06 17:59:27 -07:00 · 130f0ba994
commit 130f0ba994
parent 2c17d86fe7
1 changed file with 209 additions and 47 deletions
--- a/dexa_extract.py
+++ b/dexa_extract.py
@ -44,17 +44,208 @@ def get_processed_dates(outdir):
    
    return set()

+# ============================================================================
+# EXTRACTION PATTERN DICTIONARY
+# ============================================================================
+# Centralized pattern definitions for extracting data from BodySpec PDFs.
+# 
+# Each field has:
+#   - patterns: List of regex patterns to try (in order of preference)
+#   - group: List of capture-group indices, one per entry in patterns (same order)
+#   - cast: Function to convert extracted string to desired type
+#   - required: Whether this field must be present for valid extraction
+#   - description: Human-readable explanation of the field
+#
+# Benefits of this approach:
+#   1. Self-documenting - patterns describe what they extract
+#   2. Maintainable - add/modify patterns in one place
+#   3. Robust - multiple patterns provide fallback options
+#   4. Validated - know exactly which required fields are missing
+#
+# To add a new field:
+#   1. Add entry to this dictionary
+#   2. Pattern will automatically be used by extract_all_fields()
+#   3. Validation will check if required fields are present
+# ============================================================================
+
+EXTRACTION_PATTERNS = {
+    # Primary body composition metrics (from SUMMARY RESULTS table or individual fields)
+    'body_fat_percent': {
+        'patterns': [
+            r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)",  # Summary table (group 2)
+            r"Total Body Fat %\s+([\d\.]+)"  # Individual field
+        ],
+        'group': [2, 1],  # Which capture group for each pattern
+        'cast': float,
+        'required': True,
+        'description': 'Total body fat percentage'
+    },
+    'total_mass_lb': {
+        'patterns': [
+            r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)",  # Summary table (group 3)
+            r"Total Mass.*?\(lbs\)\s+([\d\.]+)"  # Individual field
+        ],
+        'group': [3, 1],
+        'cast': float,
+        'required': True,
+        'description': 'Total body mass in pounds'
+    },
+    'fat_mass_lb': {
+        'patterns': [
+            r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)",  # Summary table (group 4)
+            r"Fat Tissue \(lbs\)\s+([\d\.]+)"  # Individual field
+        ],
+        'group': [4, 1],
+        'cast': float,
+        'required': True,
+        'description': 'Total fat tissue mass'
+    },
+    'lean_soft_tissue_lb': {
+        'patterns': [
+            r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)",  # Summary table (group 5)
+            r"Lean Tissue \(lbs\)\s+([\d\.]+)"  # Individual field
+        ],
+        'group': [5, 1],
+        'cast': float,
+        'required': True,
+        'description': 'Lean soft tissue mass'
+    },
+    'bmc_lb': {
+        'patterns': [
+            r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)",  # Summary table (group 6)
+            r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)"  # Individual field
+        ],
+        'group': [6, 1],
+        'cast': float,
+        'required': True,
+        'description': 'Bone mineral content'
+    },
+    
+    # Date information
+    'measured_date': {
+        'patterns': [
+            r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})",  # After weight in client info
+            r"(\d{1,2}/\d{1,2}/\d{4})\s*$"  # Date at the very end of the text ($ is end-of-text here: extract_field searches the full text without re.MULTILINE)
+        ],
+        'group': [1, 1],
+        'cast': str,
+        'required': True,
+        'description': 'Date of DEXA scan'
+    },
+    
+    # Regional fat distribution
+    'android_percent': {
+        'patterns': [r"Android.*?([\d\.]+)%"],
+        'group': [1],
+        'cast': float,
+        'required': False,
+        'description': 'Android region fat percentage (belly area)'
+    },
+    'gynoid_percent': {
+        'patterns': [r"Gynoid.*?([\d\.]+)%"],
+        'group': [1],
+        'cast': float,
+        'required': False,
+        'description': 'Gynoid region fat percentage (hips/thighs)'
+    },
+    'ag_ratio': {
+        'patterns': [
+            r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)",  # On same line as RMR (group 3)
+            r"A/G Ratio\s+([\d\.]+)"  # Standalone field
+        ],
+        'group': [3, 1],
+        'cast': float,
+        'required': False,
+        'description': 'Android/Gynoid ratio (central vs peripheral fat)'
+    },
+    
+    # Metabolic and supplemental metrics
+    'rmr_cal_per_day': {
+        'patterns': [r"([\d,]+)\s*cal/day"],
+        'group': [1],
+        'cast': lambda s: int(s.replace(',', '')),
+        'required': False,
+        'description': 'Resting metabolic rate (calories per day)'
+    },
+    'vat_mass_lb': {
+        'patterns': [r"Mass \(lbs\)\s+([\d\.]+)"],
+        'group': [1],
+        'cast': float,
+        'required': False,
+        'description': 'Visceral adipose tissue mass'
+    },
+    'vat_volume_in3': {
+        'patterns': [r"Volume \(in3\)\s+([\d\.]+)"],
+        'group': [1],
+        'cast': float,
+        'required': False,
+        'description': 'Visceral adipose tissue volume'
+    },
+}
+
+
 def read_pdf_text(pdf_path):
+    """Extract all text from PDF pages"""
    with pdfplumber.open(pdf_path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages_text)

-def find_one(pattern, text, cast=float, flags=re.IGNORECASE):
-    m = re.search(pattern, text, flags)
-    if not m:
-        return None
-    val = m.group(1).replace(",", "").strip()
-    return cast(val) if cast else val
+
+def extract_field(text, patterns, groups, cast=float, flags=re.IGNORECASE):
+    """
+    Extract a field using multiple pattern attempts
+    
+    Args:
+        text: Full PDF text
+        patterns: List of regex patterns to try (in order)
+        groups: List of capture group indices (one per pattern, paired in order; unpaired extras are ignored)
+        cast: Function to convert extracted string to desired type
+        flags: Regex flags
+    
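+    Returns:
+        Value from the first pattern that matches and converts without error
+        (commas are stripped before casting; the raw string is returned when
+        cast is falsy), or None if no pattern yields a value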
+    """
+    for pattern, group in zip(patterns, groups):
+        match = re.search(pattern, text, flags)
+        if match:
+            try:
+                val = match.group(group).replace(",", "").strip()
+                return cast(val) if cast else val
+            except (ValueError, IndexError, AttributeError):
+                continue
+    return None
+
+
+def extract_all_fields(text, patterns_dict=None):
+    """
+    Extract all defined fields from PDF text
+    
+    Args:
+        text: Full PDF text content
+        patterns_dict: Field definitions (uses EXTRACTION_PATTERNS if None)
+    
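+    Returns:
+        Tuple (data, missing_required): data maps every field name to its
+        extracted value (None when nothing matched); missing_required lists
+        the required fields whose value is None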
+    """
+    if patterns_dict is None:
+        patterns_dict = EXTRACTION_PATTERNS
+    
+    data = {}
+    missing_required = []
+    
+    for field_name, config in patterns_dict.items():
+        value = extract_field(
+            text,
+            config['patterns'],
+            config['group'],
+            config['cast']
+        )
+        data[field_name] = value
+        
+        if value is None and config['required']:
+            missing_required.append(field_name)
+    
+    return data, missing_required

 def convert_date_to_iso(date_str):
    """Convert MM/DD/YYYY to YYYY-MM-DD"""
@ -120,51 +311,22 @@ def parse_bone_density_total(text):
    return {}

 def parse_dexa_pdf(pdf_path):
+    """
+    Extract all data from BodySpec DEXA PDF
+    
+    Uses pattern dictionary for main fields and specialized parsers for tables.
+    Returns dict with all extracted data and validation warnings.
+    """
    text = read_pdf_text(pdf_path)
-
-    data = {}
-    # Try to extract date from client info line: "Name Male 9/26/1995 74.0 in. 213.0 lbs. 10/6/2025"
-    # The last date on the line is the measured date
-    date_match = re.search(r"(\d{1,2}/\d{1,2}/\d{4})\s*$", text.split('\n')[0] if '\n' in text else text, re.MULTILINE)
-    if not date_match:
-        # Try finding it in the full text - look for pattern at end of client info lines
-        date_match = re.search(r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})", text)
-    data["measured_date"] = date_match.group(1) if date_match else None
    
-    # First try to extract from SUMMARY RESULTS table (more reliable)
-    # Pattern: 10/6/2025 27.8% 211.6 58.8 145.4 7.4
-    summary_pattern = r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
-    summary_match = re.search(summary_pattern, text)
-    if summary_match:
-        data["body_fat_percent"] = float(summary_match.group(2))
-        data["total_mass_lb"] = float(summary_match.group(3))
-        data["fat_mass_lb"] = float(summary_match.group(4))
-        data["lean_soft_tissue_lb"] = float(summary_match.group(5))
-        data["bmc_lb"] = float(summary_match.group(6))
-    else:
-        # Fallback to individual patterns
-        data["body_fat_percent"] = find_one(r"Total Body Fat %\s+([\d\.]+)", text)
-        data["total_mass_lb"] = find_one(r"Total Mass.*?\(lbs\)\s+([\d\.]+)", text)
-        data["fat_mass_lb"] = find_one(r"Fat Tissue \(lbs\)\s+([\d\.]+)", text)
-        data["lean_soft_tissue_lb"] = find_one(r"Lean Tissue \(lbs\)\s+([\d\.]+)", text)
-        data["bmc_lb"] = find_one(r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)", text)
-
-    # Supplemental
-    data["android_percent"] = find_one(r"Android.*?([\d\.]+)%", text)
-    data["gynoid_percent"] = find_one(r"Gynoid.*?([\d\.]+)%", text)
-    data["rmr_cal_per_day"] = find_one(r"([\d,]+)\s*cal/day", text, cast=lambda s: int(s.replace(",", "")))
+    # Extract all defined fields using pattern dictionary
+    data, missing_required = extract_all_fields(text)
    
-    # A/G Ratio appears after RMR, Android%, Gynoid% on same line: "1,778 cal/day 36.5% 27.8% 1.31"
-    ag_match = re.search(r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)", text)
-    if ag_match:
-        data["ag_ratio"] = float(ag_match.group(3))
-    else:
-        data["ag_ratio"] = find_one(r"A/G Ratio\s+([\d\.]+)", text)
+    # Warn about missing required fields
+    if missing_required:
+        print(f"   ⚠️  Warning: Missing required fields: {', '.join(missing_required)}", file=sys.stderr)
    
-    data["vat_mass_lb"] = find_one(r"Mass \(lbs\)\s+([\d\.]+)", text)
-    data["vat_volume_in3"] = find_one(r"Volume \(in3\)\s+([\d\.]+)", text)
-
-    # Tables
+    # Parse structured tables (regional, muscle balance, bone density)
    data["regional"] = parse_regional_table(text)
    data["muscle_balance"] = parse_muscle_balance(text)
    data["bone_density"] = parse_bone_density_total(text)