Refactor: implement pattern dictionary for PDF extraction

Major improvements to extraction code:

PATTERN DICTIONARY APPROACH:
- Centralized all extraction patterns into EXTRACTION_PATTERNS dict
- Each field now self-documents with description, required status, cast function
- Multiple patterns per field with automatic fallback (e.g., summary table → individual field)
- Validation built-in: reports missing required vs optional fields

NEW FUNCTIONS:
- extract_field(): Tries multiple patterns with fallback logic
- extract_all_fields(): Extracts all defined fields with validation
- Comprehensive docstrings explaining the approach

BENEFITS:
- Self-documenting - each pattern describes what it extracts
- Maintainable - add new fields by adding one dict entry
- Robust - automatic fallback if primary pattern fails
- Validated - instant feedback on missing required fields
- Type-safe - cast functions ensure correct data types

TESTING:
- All existing tests pass
- Single-file mode: ✓
- Batch mode: ✓
- Data extraction: ✓ identical to previous version

Code grew by ~160 lines but with significant improvements in:
- Readability (clear field definitions)
- Maintainability (centralized patterns)
- Extensibility (easy to add new fields)
- Debuggability (validation reports)
This commit is contained in:
Mac DeCourcy 2025-10-06 17:59:27 -07:00
parent 2c17d86fe7
commit 130f0ba994

View file

@ -44,17 +44,208 @@ def get_processed_dates(outdir):
return set()
# ============================================================================
# EXTRACTION PATTERN DICTIONARY
# ============================================================================
# Centralized pattern definitions for extracting data from BodySpec PDFs.
#
# Each field has:
# - patterns: List of regex patterns to try (in order of preference)
# - group: List of capture group indices, one per pattern (same order as patterns)
# - cast: Function to convert extracted string to desired type
# - required: Whether this field must be present for valid extraction
# - description: Human-readable explanation of the field
#
# Benefits of this approach:
# 1. Self-documenting - patterns describe what they extract
# 2. Maintainable - add/modify patterns in one place
# 3. Robust - multiple patterns provide fallback options
# 4. Validated - know exactly which required fields are missing
#
# To add a new field:
# 1. Add entry to this dictionary
# 2. Pattern will automatically be used by extract_all_fields()
# 3. Validation will check if required fields are present
# ============================================================================
# One row of the SUMMARY RESULTS table, e.g. "10/6/2025 27.8% 211.6 58.8 145.4 7.4".
# Capture groups: 1=date, 2=body fat %, 3=total mass, 4=fat mass,
# 5=lean tissue, 6=BMC. Shared by the five primary metrics below so the
# regex is maintained in a single place.
_SUMMARY_TABLE_ROW = (
    r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
)

# Field name -> extraction config. Every config carries:
#   'patterns'    list of regexes, tried in order
#   'group'       list of capture-group indices, parallel to 'patterns'
#   'cast'        callable applied to the captured string
#   'required'    whether a missing value is reported as missing
#   'description' human-readable explanation of the field
EXTRACTION_PATTERNS = {
    # Primary body composition metrics (from SUMMARY RESULTS table or individual fields)
    'body_fat_percent': {
        'patterns': [
            _SUMMARY_TABLE_ROW,  # Summary table (group 2)
            r"Total Body Fat %\s+([\d\.]+)",  # Individual field
        ],
        'group': [2, 1],  # Which capture group for each pattern
        'cast': float,
        'required': True,
        'description': 'Total body fat percentage',
    },
    'total_mass_lb': {
        'patterns': [
            _SUMMARY_TABLE_ROW,  # Summary table (group 3)
            r"Total Mass.*?\(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [3, 1],
        'cast': float,
        'required': True,
        'description': 'Total body mass in pounds',
    },
    'fat_mass_lb': {
        'patterns': [
            _SUMMARY_TABLE_ROW,  # Summary table (group 4)
            r"Fat Tissue \(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [4, 1],
        'cast': float,
        'required': True,
        'description': 'Total fat tissue mass',
    },
    'lean_soft_tissue_lb': {
        'patterns': [
            _SUMMARY_TABLE_ROW,  # Summary table (group 5)
            r"Lean Tissue \(lbs\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [5, 1],
        'cast': float,
        'required': True,
        'description': 'Lean soft tissue mass',
    },
    'bmc_lb': {
        'patterns': [
            _SUMMARY_TABLE_ROW,  # Summary table (group 6)
            r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)",  # Individual field
        ],
        'group': [6, 1],
        'cast': float,
        'required': True,
        'description': 'Bone mineral content',
    },
    # Date information
    'measured_date': {
        'patterns': [
            r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})",  # After weight in client info
            # Date at the end of the FIRST line only. The pattern is applied
            # to the whole document without re.MULTILINE, so a bare `$` would
            # only match at the very end of the text; anchor explicitly with
            # \A and stop at the first newline instead.
            r"\A[^\n]*?(\d{1,2}/\d{1,2}/\d{4})[^\S\n]*(?=\n|\Z)",
        ],
        'group': [1, 1],
        'cast': str,
        'required': True,
        'description': 'Date of DEXA scan',
    },
    # Regional fat distribution
    'android_percent': {
        'patterns': [r"Android.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Android region fat percentage (belly area)',
    },
    'gynoid_percent': {
        'patterns': [r"Gynoid.*?([\d\.]+)%"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Gynoid region fat percentage (hips/thighs)',
    },
    'ag_ratio': {
        'patterns': [
            r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)",  # On same line as RMR (group 3)
            r"A/G Ratio\s+([\d\.]+)",  # Standalone field
        ],
        'group': [3, 1],
        'cast': float,
        'required': False,
        'description': 'Android/Gynoid ratio (central vs peripheral fat)',
    },
    # Metabolic and supplemental metrics
    'rmr_cal_per_day': {
        'patterns': [r"([\d,]+)\s*cal/day"],
        'group': [1],
        # Thousands separators ("1,778") are removed before converting.
        'cast': lambda s: int(s.replace(',', '')),
        'required': False,
        'description': 'Resting metabolic rate (calories per day)',
    },
    'vat_mass_lb': {
        'patterns': [r"Mass \(lbs\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue mass',
    },
    'vat_volume_in3': {
        'patterns': [r"Volume \(in3\)\s+([\d\.]+)"],
        'group': [1],
        'cast': float,
        'required': False,
        'description': 'Visceral adipose tissue volume',
    },
}
def read_pdf_text(pdf_path):
    """Return the text of every page of the PDF, joined by newlines.

    Args:
        pdf_path: Path passed straight to ``pdfplumber.open``.

    Returns:
        A single string with one newline between consecutive pages.
    """
    page_chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            # A falsy extract_text() result (e.g. None) becomes an empty
            # string so the join below never sees a non-string.
            page_chunks.append(page.extract_text() or "")
    return "\n".join(page_chunks)
def find_one(pattern, text, cast=float, flags=re.IGNORECASE):
    """Return capture group 1 of the first match of ``pattern`` in ``text``.

    Args:
        pattern: Regex with at least one capture group.
        text: String to search.
        cast: Callable applied to the cleaned capture; a falsy value
            returns the cleaned string unchanged.
        flags: Regex flags passed to ``re.search``.

    Returns:
        The converted value, or None when the pattern does not match.
    """
    found = re.search(pattern, text, flags)
    if found is None:
        return None

    # Drop thousands separators and surrounding whitespace before casting.
    cleaned = found.group(1).replace(",", "").strip()
    if cast:
        return cast(cleaned)
    return cleaned
def extract_field(text, patterns, groups, cast=float, flags=re.IGNORECASE):
    """
    Return the first value that any of the given patterns yields.

    Patterns are paired with capture-group indices positionally and tried
    in order; a pattern that does not match, or whose captured text cannot
    be read or converted, is skipped in favour of the next one.

    Args:
        text: Full PDF text
        patterns: Regex patterns to try, most preferred first
        groups: Capture group index for each pattern (same order)
        cast: Callable applied to the cleaned capture; a falsy value
            returns the cleaned string unchanged
        flags: Regex flags passed to re.search

    Returns:
        Converted value from the first usable pattern, or None
    """
    for regex, group_index in zip(patterns, groups):
        found = re.search(regex, text, flags)
        if found is None:
            continue
        try:
            # Drop thousands separators and surrounding whitespace.
            cleaned = found.group(group_index).replace(",", "").strip()
            if cast:
                return cast(cleaned)
            return cleaned
        except (ValueError, IndexError, AttributeError):
            # Bad conversion, unknown group, or a group that did not
            # participate in the match: fall through to the next pattern.
            pass
    return None
def extract_all_fields(text, patterns_dict=None):
    """
    Run every field definition against the PDF text.

    Args:
        text: Full PDF text content
        patterns_dict: Mapping of field name to a config with the keys
            'patterns', 'group', 'cast' and 'required'
            (EXTRACTION_PATTERNS is used when None)

    Returns:
        Tuple ``(data, missing_required)``: ``data`` maps every field name
        to its extracted value (None when nothing was found) and
        ``missing_required`` lists the required fields whose value is None
    """
    field_specs = EXTRACTION_PATTERNS if patterns_dict is None else patterns_dict

    extracted = {}
    absent_required = []
    for name, spec in field_specs.items():
        result = extract_field(text, spec['patterns'], spec['group'], spec['cast'])
        # Every field gets a key, even when nothing was extracted.
        extracted[name] = result
        if result is None and spec['required']:
            absent_required.append(name)

    return extracted, absent_required
def convert_date_to_iso(date_str):
"""Convert MM/DD/YYYY to YYYY-MM-DD"""
@ -120,51 +311,22 @@ def parse_bone_density_total(text):
return {}
def parse_dexa_pdf(pdf_path):
"""
Extract all data from BodySpec DEXA PDF
Uses pattern dictionary for main fields and specialized parsers for tables.
Returns dict with all extracted data and validation warnings.
"""
text = read_pdf_text(pdf_path)
data = {}
# Try to extract date from client info line: "Name Male 9/26/1995 74.0 in. 213.0 lbs. 10/6/2025"
# The last date on the line is the measured date
date_match = re.search(r"(\d{1,2}/\d{1,2}/\d{4})\s*$", text.split('\n')[0] if '\n' in text else text, re.MULTILINE)
if not date_match:
# Try finding it in the full text - look for pattern at end of client info lines
date_match = re.search(r"lbs\.\s+(\d{1,2}/\d{1,2}/\d{4})", text)
data["measured_date"] = date_match.group(1) if date_match else None
# First try to extract from SUMMARY RESULTS table (more reliable)
# Pattern: 10/6/2025 27.8% 211.6 58.8 145.4 7.4
summary_pattern = r"(\d{1,2}/\d{1,2}/\d{4})\s+([\d\.]+)%\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)"
summary_match = re.search(summary_pattern, text)
if summary_match:
data["body_fat_percent"] = float(summary_match.group(2))
data["total_mass_lb"] = float(summary_match.group(3))
data["fat_mass_lb"] = float(summary_match.group(4))
data["lean_soft_tissue_lb"] = float(summary_match.group(5))
data["bmc_lb"] = float(summary_match.group(6))
else:
# Fallback to individual patterns
data["body_fat_percent"] = find_one(r"Total Body Fat %\s+([\d\.]+)", text)
data["total_mass_lb"] = find_one(r"Total Mass.*?\(lbs\)\s+([\d\.]+)", text)
data["fat_mass_lb"] = find_one(r"Fat Tissue \(lbs\)\s+([\d\.]+)", text)
data["lean_soft_tissue_lb"] = find_one(r"Lean Tissue \(lbs\)\s+([\d\.]+)", text)
data["bmc_lb"] = find_one(r"Bone Mineral\s+Content \(BMC\)\s+([\d\.]+)", text)
# Supplemental
data["android_percent"] = find_one(r"Android.*?([\d\.]+)%", text)
data["gynoid_percent"] = find_one(r"Gynoid.*?([\d\.]+)%", text)
data["rmr_cal_per_day"] = find_one(r"([\d,]+)\s*cal/day", text, cast=lambda s: int(s.replace(",", "")))
# Extract all defined fields using pattern dictionary
data, missing_required = extract_all_fields(text)
# A/G Ratio appears after RMR, Android%, Gynoid% on same line: "1,778 cal/day 36.5% 27.8% 1.31"
ag_match = re.search(r"[\d,]+\s*cal/day\s+([\d\.]+)%\s+([\d\.]+)%\s+([\d\.]+)", text)
if ag_match:
data["ag_ratio"] = float(ag_match.group(3))
else:
data["ag_ratio"] = find_one(r"A/G Ratio\s+([\d\.]+)", text)
# Warn about missing required fields
if missing_required:
print(f" ⚠️ Warning: Missing required fields: {', '.join(missing_required)}", file=sys.stderr)
data["vat_mass_lb"] = find_one(r"Mass \(lbs\)\s+([\d\.]+)", text)
data["vat_volume_in3"] = find_one(r"Volume \(in3\)\s+([\d\.]+)", text)
# Tables
# Parse structured tables (regional, muscle balance, bone density)
data["regional"] = parse_regional_table(text)
data["muscle_balance"] = parse_muscle_balance(text)
data["bone_density"] = parse_bone_density_total(text)