# legal-ai/scripts/compare_extractions.py

"""Compare existing MD files with freshly extracted text from PDFs."""

import difflib
from pathlib import Path

# Directory holding the existing Markdown documents for this case.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Sub-directory holding the Markdown freshly extracted from the PDFs.
EXTRACTED_DIR = DOCS_DIR.joinpath("extracted")

# Each entry is (existing MD file name, extracted MD file name, report label).
# The first name is resolved against DOCS_DIR, the second against EXTRACTED_DIR.
PAIRS = [
    (
        "2025-08-14-כתב-ערר-קובר.md",
        "מרק קובר-כתב ערר.md",
        "Appeal - Kuber",
    ),
    (
        "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
        "תשובה לערר מטעם המשיבים.md",
        "Response - Livman",
    ),
    (
        "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
        "תשובת הועדה המרחבית לערר.md",
        "Response - Committee",
    ),
    # NOTE(review): the existing file name uses the same "כתב-ערר" wording as
    # the entry labelled "Appeal" above, yet this pair is labelled "Response"
    # -- confirm the existing file is named correctly.
    (
        "2025-10-22-כתב-ערר-מטמון.md",
        "תשובת המשיב-יצחק מטמון.md",
        "Response - Matmon",
    ),
]


def normalize(text: str) -> str:
    """Return *text* with every line trimmed and blank lines removed.

    The input is split on the newline character, surrounding whitespace is
    stripped from each piece, pieces that end up empty are discarded, and
    the survivors are re-joined with single newlines. Markdown markup
    itself is left untouched.
    """
    trimmed = map(str.strip, text.split("\n"))
    # filter(None, ...) drops the empty strings left behind by blank lines.
    return "\n".join(filter(None, trimmed))


def word_overlap(a: str, b: str) -> float:
    """Return the share of distinct words common to *a* and *b*.

    Both strings are split on whitespace into sets of distinct words. The
    result is the number of shared words divided by the size of the larger
    set, or 0.0 when either string contains no words.
    """
    vocab_a, vocab_b = set(a.split()), set(b.split())
    if not (vocab_a and vocab_b):
        return 0.0
    larger = max(len(vocab_a), len(vocab_b))
    return len(vocab_a.intersection(vocab_b)) / larger


# Number of leading characters of each normalized text handed to
# SequenceMatcher; bounds its (worst-case quadratic) running time.
_SEQ_SAMPLE_CHARS = 5000


def _compare_texts(existing_text: str, extracted_text: str) -> dict:
    """Compute size and similarity metrics for one document pair.

    Character and word counts are taken from the raw texts; line counts,
    word overlap, sequence similarity and the line differences are taken
    from the normalized texts.

    Returns a dict with the keys ``existing_chars``, ``extracted_chars``,
    ``existing_words``, ``extracted_words``, ``existing_line_count``,
    ``extracted_line_count``, ``word_overlap``, ``seq_similarity``,
    ``new_lines`` (lines only in the extraction) and ``missing_lines``
    (lines only in the existing file).
    """
    existing_norm = normalize(existing_text)
    extracted_norm = normalize(extracted_text)

    # "".split("\n") yields [""], which would report one phantom line for an
    # empty document, so empty texts are mapped to an empty list explicitly.
    existing_lines = existing_norm.split("\n") if existing_norm else []
    extracted_lines = extracted_norm.split("\n") if extracted_norm else []
    existing_set = set(existing_lines)
    extracted_set = set(extracted_lines)

    # autojunk must be off: with sequences of 200+ items the default heuristic
    # treats every character that makes up more than 1% of the text as junk,
    # which for prose is nearly all of them and makes the ratio unreliable.
    # Only a prefix of each text is compared to keep the run time bounded.
    matcher = difflib.SequenceMatcher(
        None,
        existing_norm[:_SEQ_SAMPLE_CHARS],
        extracted_norm[:_SEQ_SAMPLE_CHARS],
        autojunk=False,
    )

    return {
        "existing_chars": len(existing_text),
        "extracted_chars": len(extracted_text),
        "existing_words": len(existing_text.split()),
        "extracted_words": len(extracted_text.split()),
        # Real line counts (duplicates included), not the size of the sets.
        "existing_line_count": len(existing_lines),
        "extracted_line_count": len(extracted_lines),
        "word_overlap": word_overlap(existing_norm, extracted_norm),
        "seq_similarity": matcher.ratio(),
        "new_lines": extracted_set - existing_set,
        "missing_lines": existing_set - extracted_set,
    }


def _print_pair_report(label: str, existing_name: str, extracted_name: str, metrics: dict) -> None:
    """Print the per-document metrics table and up to three sample differences."""
    rule = "=" * 70
    print(rule)
    print(f"  {label}")
    print(f"  Existing: {existing_name}")
    print(f"  Extracted: {extracted_name}")
    print(rule)
    print(f"  {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}")
    print(f"  {'-' * 70}")

    count_rows = (
        ("Characters", metrics["existing_chars"], metrics["extracted_chars"]),
        ("Words", metrics["existing_words"], metrics["extracted_words"]),
        ("Lines", metrics["existing_line_count"], metrics["extracted_line_count"]),
    )
    for title, existing_count, extracted_count in count_rows:
        diff = extracted_count - existing_count
        print(f"  {title:<30} {existing_count:>15,} {extracted_count:>15,} {diff:>+10,}")

    new_lines = metrics["new_lines"]
    missing_lines = metrics["missing_lines"]
    print(f"  {'Word overlap':<30} {metrics['word_overlap']:>15.1%}")
    print(f"  {'Sequence similarity':<30} {metrics['seq_similarity']:>15.1%}")
    print(f"  {'Lines only in fresh PDF':<30} {len(new_lines):>15}")
    print(f"  {'Lines only in existing MD':<30} {len(missing_lines):>15}")

    # Samples are taken in sorted order so the output is deterministic
    # (set iteration order is not); each sample is cut to 100 characters.
    if new_lines:
        print("\n  Sample lines ONLY in fresh extraction (first 3):")
        for line in sorted(new_lines)[:3]:
            print(f"    + {line[:100]}")
    if missing_lines:
        print("\n  Sample lines ONLY in existing MD (first 3):")
        for line in sorted(missing_lines)[:3]:
            print(f"    - {line[:100]}")

    print()


def _print_summary(summary: list) -> None:
    """Print the one-row-per-document summary table."""
    rule = "=" * 70
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f"  {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}")
    print(f"  {'-' * 67}")
    for s in summary:
        print(f"  {s['label']:<25} {s['existing_words']:>10,} {s['extracted_words']:>10,} {s['word_overlap']:>10.1%} {s['seq_similarity']:>12.1%}")


def main():
    """Compare every pair in PAIRS and print a per-document and summary report.

    For each (existing, extracted, label) entry the two files are read as
    UTF-8 from DOCS_DIR and EXTRACTED_DIR. A pair whose existing or
    extracted file is missing is reported with a SKIP line and left out of
    the summary.
    """
    rule = "=" * 70
    print(rule)
    print("COMPARISON: Existing MD vs Fresh PDF Extraction")
    print(f"{rule}\n")

    summary = []

    for existing_name, extracted_name, label in PAIRS:
        existing_path = DOCS_DIR / existing_name
        extracted_path = EXTRACTED_DIR / extracted_name

        if not existing_path.exists():
            print(f"SKIP: {existing_name} not found")
            continue
        if not extracted_path.exists():
            print(f"SKIP: {extracted_name} not found")
            continue

        metrics = _compare_texts(
            existing_path.read_text(encoding="utf-8"),
            extracted_path.read_text(encoding="utf-8"),
        )
        _print_pair_report(label, existing_name, extracted_name, metrics)

        summary.append({
            "label": label,
            "existing_words": metrics["existing_words"],
            "extracted_words": metrics["extracted_words"],
            "word_overlap": metrics["word_overlap"],
            "seq_similarity": metrics["seq_similarity"],
        })

    _print_summary(summary)


if __name__ == "__main__":
    main()