Flatten cases directory structure and unify paths

- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:33:27 +00:00
parent 4d674bf475
commit 22e819363e
17 changed files with 1203 additions and 62 deletions
--- a/scripts/compare_extractions.py
+++ b/scripts/compare_extractions.py
@@ -0,0 +1,126 @@
+"""Compare existing MD files with freshly extracted text from PDFs."""
+
+import difflib
+from pathlib import Path
+
+# Directory holding the existing Markdown files for case 1130-25; the fresh
+# PDF extractions are read from its "extracted" subdirectory.
+DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25") / "documents"
+EXTRACTED_DIR = DOCS_DIR.joinpath("extracted")
+
+# Each entry: (existing MD file name, extracted MD file name, report label).
+PAIRS = [
+    (
+        "2025-08-14-כתב-ערר-קובר.md",
+        "מרק קובר-כתב ערר.md",
+        "Appeal - Kuber",
+    ),
+    (
+        "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
+        "תשובה לערר מטעם המשיבים.md",
+        "Response - Livman",
+    ),
+    (
+        "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
+        "תשובת הועדה המרחבית לערר.md",
+        "Response - Committee",
+    ),
+    # NOTE(review): the existing file name reads "appeal" while the extracted
+    # file and the label read "response" - confirm this pairing is intended.
+    (
+        "2025-10-22-כתב-ערר-מטמון.md",
+        "תשובת המשיב-יצחק מטמון.md",
+        "Response - Matmon",
+    ),
+]
+
+
+def normalize(text: str) -> str:
+    """Return *text* with every line trimmed and blank lines dropped.
+
+    Only whitespace is affected: each line loses its leading and trailing
+    whitespace, and lines that end up empty are discarded. Markdown markup
+    characters are left untouched. The surviving lines are joined with a
+    single newline.
+    """
+    kept = []
+    for raw_line in text.strip().split("\n"):
+        trimmed = raw_line.strip()
+        if trimmed:
+            kept.append(trimmed)
+    return "\n".join(kept)
+
+
+def word_overlap(a: str, b: str) -> float:
+    """Return the share of distinct words that *a* and *b* have in common.
+
+    Both strings are split on whitespace into sets of unique words. The size
+    of their intersection is divided by the size of the larger set, giving a
+    value between 0.0 and 1.0. When either string contains no words the
+    result is 0.0.
+    """
+    vocab_a, vocab_b = set(a.split()), set(b.split())
+    if vocab_a and vocab_b:
+        shared = vocab_a.intersection(vocab_b)
+        larger = max(len(vocab_a), len(vocab_b))
+        return len(shared) / larger
+    return 0.0
+
+
+# Length of the normalized-text prefix handed to SequenceMatcher; keeps the
+# character-level comparison affordable.
+_SEQ_SAMPLE_CHARS = 5000
+
+
+def _compare_pair(existing_text: str, extracted_text: str) -> dict:
+    """Compute the comparison statistics for one existing/extracted pair."""
+    existing_norm = normalize(existing_text)
+    extracted_norm = normalize(extracted_text)
+
+    # An empty document has zero lines; "".split("\n") would yield [""].
+    existing_lines = existing_norm.split("\n") if existing_norm else []
+    extracted_lines = extracted_norm.split("\n") if extracted_norm else []
+    existing_unique = set(existing_lines)
+    extracted_unique = set(extracted_lines)
+
+    # autojunk=False: for sequences of 200+ items the default heuristic stops
+    # using any item that makes up more than 1% of the sequence as a match
+    # anchor. In character-level text that is most letters, so the ratio
+    # would be distorted.
+    matcher = difflib.SequenceMatcher(
+        None,
+        existing_norm[:_SEQ_SAMPLE_CHARS],
+        extracted_norm[:_SEQ_SAMPLE_CHARS],
+        autojunk=False,
+    )
+
+    return {
+        "existing_chars": len(existing_text),
+        "extracted_chars": len(extracted_text),
+        "existing_words": len(existing_text.split()),
+        "extracted_words": len(extracted_text.split()),
+        # Real line counts (duplicates included), not unique-line counts.
+        "existing_line_count": len(existing_lines),
+        "extracted_line_count": len(extracted_lines),
+        "word_overlap": word_overlap(existing_norm, extracted_norm),
+        "seq_similarity": matcher.ratio(),
+        "new_lines": extracted_unique - existing_unique,
+        "missing_lines": existing_unique - extracted_unique,
+    }
+
+
+def _print_pair_report(label: str, existing_name: str, extracted_name: str, stats: dict) -> None:
+    """Print the detailed metric table and sample differences for one pair."""
+    existing_chars = stats["existing_chars"]
+    extracted_chars = stats["extracted_chars"]
+    existing_words = stats["existing_words"]
+    extracted_words = stats["extracted_words"]
+    existing_line_count = stats["existing_line_count"]
+    extracted_line_count = stats["extracted_line_count"]
+    overlap = stats["word_overlap"]
+    seq_ratio = stats["seq_similarity"]
+    new_lines = stats["new_lines"]
+    missing_lines = stats["missing_lines"]
+
+    print("=" * 70)
+    print(f"  {label}")
+    print(f"  Existing: {existing_name}")
+    print(f"  Extracted: {extracted_name}")
+    print("=" * 70)
+    print(f"  {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}")
+    print(f"  {'-' * 70}")
+    print(f"  {'Characters':<30} {existing_chars:>15,} {extracted_chars:>15,} {extracted_chars - existing_chars:>+10,}")
+    print(f"  {'Words':<30} {existing_words:>15,} {extracted_words:>15,} {extracted_words - existing_words:>+10,}")
+    print(f"  {'Lines':<30} {existing_line_count:>15,} {extracted_line_count:>15,} {extracted_line_count - existing_line_count:>+10,}")
+    print(f"  {'Word overlap':<30} {overlap:>15.1%}")
+    print(f"  {'Sequence similarity':<30} {seq_ratio:>15.1%}")
+    print(f"  {'Lines only in fresh PDF':<30} {len(new_lines):>15}")
+    print(f"  {'Lines only in existing MD':<30} {len(missing_lines):>15}")
+
+    # Sorted so the samples shown are stable between runs.
+    if new_lines:
+        print("\n  Sample lines ONLY in fresh extraction (first 3):")
+        for line in sorted(new_lines)[:3]:
+            print(f"    + {line[:100]}")
+    if missing_lines:
+        print("\n  Sample lines ONLY in existing MD (first 3):")
+        for line in sorted(missing_lines)[:3]:
+            print(f"    - {line[:100]}")
+
+    print()
+
+
+def _print_summary(summary: list) -> None:
+    """Print the one-row-per-document summary table."""
+    print(f"\n{'=' * 70}")
+    print("SUMMARY")
+    print("=" * 70)
+    print(f"  {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}")
+    print(f"  {'-' * 67}")
+    for s in summary:
+        print(f"  {s['label']:<25} {s['existing_words']:>10,} {s['extracted_words']:>10,} {s['word_overlap']:>10.1%} {s['seq_similarity']:>12.1%}")
+
+
+def main():
+    """Compare every pair in PAIRS and print per-document and summary reports.
+
+    For each (existing, extracted, label) entry the two files are read as
+    UTF-8 from DOCS_DIR and EXTRACTED_DIR. A pair is skipped, with a SKIP
+    message, when either file does not exist. All output goes to stdout;
+    nothing is returned.
+    """
+    print("=" * 70)
+    print("COMPARISON: Existing MD vs Fresh PDF Extraction")
+    print(f"{'=' * 70}\n")
+
+    summary = []
+
+    for existing_name, extracted_name, label in PAIRS:
+        existing_path = DOCS_DIR / existing_name
+        extracted_path = EXTRACTED_DIR / extracted_name
+
+        if not existing_path.exists():
+            print(f"SKIP: {existing_name} not found")
+            continue
+        if not extracted_path.exists():
+            print(f"SKIP: {extracted_name} not found")
+            continue
+
+        stats = _compare_pair(
+            existing_path.read_text(encoding="utf-8"),
+            extracted_path.read_text(encoding="utf-8"),
+        )
+        _print_pair_report(label, existing_name, extracted_name, stats)
+
+        summary.append({
+            "label": label,
+            "existing_words": stats["existing_words"],
+            "extracted_words": stats["extracted_words"],
+            "word_overlap": stats["word_overlap"],
+            "seq_similarity": stats["seq_similarity"],
+        })
+
+    _print_summary(summary)
+
+
+if __name__ == "__main__":
+    main()