"""Compare existing MD files with freshly extracted text from PDFs."""

import difflib
from pathlib import Path

DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
EXTRACTED_DIR = DOCS_DIR / "extracted"

# Each entry: (existing MD file name, extracted MD file name, report label).
# Existing files are looked up in DOCS_DIR, extracted files in EXTRACTED_DIR.
PAIRS = [
    ("2025-08-14-כתב-ערר-קובר.md", "מרק קובר-כתב ערר.md", "Appeal - Kuber"),
    ("2025-09-01-כתב-תשובה-ליבמן-לערר.md", "תשובה לערר מטעם המשיבים.md", "Response - Livman"),
    ("2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md", "תשובת הועדה המרחבית לערר.md", "Response - Committee"),
    # NOTE(review): the existing file name reads as an appeal while the label
    # says "Response" - confirm this pairing is intended.
    ("2025-10-22-כתב-ערר-מטמון.md", "תשובת המשיב-יצחק מטמון.md", "Response - Matmon"),
]

# Only this many leading characters of each normalized text are handed to the
# sequence matcher, to keep the character-level comparison affordable.
_SEQ_COMPARE_CHARS = 5000

# How many sample lines to show for each side of the line-level difference.
_SAMPLE_LINES = 3


def normalize(text: str) -> str:
    """Return *text* with every line stripped and blank lines dropped.

    Only whitespace is normalized: the text is split on ``"\\n"``, each line
    is stripped, and lines that end up empty are removed. Markdown markup is
    left untouched.
    """
    stripped = (line.strip() for line in text.strip().split("\n"))
    return "\n".join(line for line in stripped if line)


def word_overlap(a: str, b: str) -> float:
    """Return the share of distinct words common to *a* and *b*.

    The result is ``len(common) / max(len(words_a), len(words_b))`` computed
    over whitespace-separated, de-duplicated words. Returns ``0.0`` when
    either text contains no words.
    """
    words_a = set(a.split())
    words_b = set(b.split())
    if not words_a or not words_b:
        return 0.0
    return len(words_a & words_b) / max(len(words_a), len(words_b))


def compare_texts(existing_text: str, extracted_text: str) -> dict:
    """Compute every comparison metric for one pair of documents.

    Character and word counts are taken from the raw texts; line counts,
    word overlap, sequence similarity and the line-level differences are
    taken from the ``normalize``d texts.

    Returns a dict with the keys ``existing_chars``, ``extracted_chars``,
    ``existing_words``, ``extracted_words``, ``existing_line_count``,
    ``extracted_line_count``, ``word_overlap``, ``seq_similarity``,
    ``new_lines`` (set of lines only in the extracted text) and
    ``missing_lines`` (set of lines only in the existing text).
    """
    existing_norm = normalize(existing_text)
    extracted_norm = normalize(extracted_text)

    # "".split("\n") yields [""], which would report one phantom empty line
    # for an empty document, so empty texts are mapped to an empty list.
    existing_lines = existing_norm.split("\n") if existing_norm else []
    extracted_lines = extracted_norm.split("\n") if extracted_norm else []
    existing_unique = set(existing_lines)
    extracted_unique = set(extracted_lines)

    # autojunk must be off: the matcher works character by character, and the
    # default heuristic discards every character that makes up more than ~1%
    # of a 200+ item sequence as a match anchor. In ordinary text that is
    # almost every character, which can drive the ratio towards zero even for
    # identical inputs. Slower, hence the cap on the compared prefix.
    matcher = difflib.SequenceMatcher(
        None,
        existing_norm[:_SEQ_COMPARE_CHARS],
        extracted_norm[:_SEQ_COMPARE_CHARS],
        autojunk=False,
    )

    return {
        "existing_chars": len(existing_text),
        "extracted_chars": len(extracted_text),
        "existing_words": len(existing_text.split()),
        "extracted_words": len(extracted_text.split()),
        # Real line counts (duplicates included), not the size of the sets
        # used for the difference below.
        "existing_line_count": len(existing_lines),
        "extracted_line_count": len(extracted_lines),
        "word_overlap": word_overlap(existing_norm, extracted_norm),
        "seq_similarity": matcher.ratio(),
        "new_lines": extracted_unique - existing_unique,
        "missing_lines": existing_unique - extracted_unique,
    }


def _print_report(label: str, existing_name: str, extracted_name: str, stats: dict) -> None:
    """Print the per-document metric table and sample differing lines."""
    rule = "=" * 70
    print(rule)
    print(f" {label}")
    print(f" Existing: {existing_name}")
    print(f" Extracted: {extracted_name}")
    print(rule)

    print(f" {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}")
    print(f" {'-' * 70}")
    for title, before, after in (
        ("Characters", stats["existing_chars"], stats["extracted_chars"]),
        ("Words", stats["existing_words"], stats["extracted_words"]),
        ("Lines", stats["existing_line_count"], stats["extracted_line_count"]),
    ):
        print(f" {title:<30} {before:>15,} {after:>15,} {after - before:>+10,}")

    new_lines = stats["new_lines"]
    missing_lines = stats["missing_lines"]
    print(f" {'Word overlap':<30} {stats['word_overlap']:>15.1%}")
    print(f" {'Sequence similarity':<30} {stats['seq_similarity']:>15.1%}")
    print(f" {'Lines only in fresh PDF':<30} {len(new_lines):>15}")
    print(f" {'Lines only in existing MD':<30} {len(missing_lines):>15}")

    # Sorted so the samples are stable between runs (set order is not).
    if new_lines:
        print("\n Sample lines ONLY in fresh extraction (first 3):")
        for line in sorted(new_lines)[:_SAMPLE_LINES]:
            print(f" + {line[:100]}")
    if missing_lines:
        print("\n Sample lines ONLY in existing MD (first 3):")
        for line in sorted(missing_lines)[:_SAMPLE_LINES]:
            print(f" - {line[:100]}")
    print()


def _print_summary(summary: list) -> None:
    """Print the closing one-row-per-document summary table."""
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f" {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}")
    print(f" {'-' * 67}")
    for s in summary:
        print(
            f" {s['label']:<25} {s['existing_words']:>10,} "
            f"{s['extracted_words']:>10,} {s['word_overlap']:>10.1%} "
            f"{s['seq_similarity']:>12.1%}"
        )


def main():
    """Compare every pair in PAIRS and print a report plus a summary table.

    A pair is skipped, with a ``SKIP`` message, when either of its files does
    not exist; skipped pairs do not appear in the summary.
    """
    print(f"{'=' * 70}")
    print("COMPARISON: Existing MD vs Fresh PDF Extraction")
    print(f"{'=' * 70}\n")

    summary = []
    for existing_name, extracted_name, label in PAIRS:
        existing_path = DOCS_DIR / existing_name
        extracted_path = EXTRACTED_DIR / extracted_name

        if not existing_path.exists():
            print(f"SKIP: {existing_name} not found")
            continue
        if not extracted_path.exists():
            print(f"SKIP: {extracted_name} not found")
            continue

        stats = compare_texts(
            existing_path.read_text(encoding="utf-8"),
            extracted_path.read_text(encoding="utf-8"),
        )
        _print_report(label, existing_name, extracted_name, stats)

        summary.append({
            "label": label,
            "existing_words": stats["existing_words"],
            "extracted_words": stats["extracted_words"],
            "word_overlap": stats["word_overlap"],
            "seq_similarity": stats["seq_similarity"],
        })

    _print_summary(summary)


if __name__ == "__main__":
    main()