- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
127 lines
4.8 KiB
Python
127 lines
4.8 KiB
Python
"""Compare existing MD files with freshly extracted text from PDFs."""
|
|
|
|
import difflib
|
|
from pathlib import Path
|
|
|
|
# Directory holding the existing Markdown documents for case 1130-25.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Subdirectory of DOCS_DIR holding the Markdown freshly extracted from PDFs.
EXTRACTED_DIR = DOCS_DIR / "extracted"

# Map: existing MD -> extracted MD
# Each entry is a 3-tuple:
#   (file name under DOCS_DIR, file name under EXTRACTED_DIR, report label)
PAIRS = [
    ("2025-08-14-כתב-ערר-קובר.md", "מרק קובר-כתב ערר.md", "Appeal - Kuber"),
    ("2025-09-01-כתב-תשובה-ליבמן-לערר.md", "תשובה לערר מטעם המשיבים.md", "Response - Livman"),
    ("2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md", "תשובת הועדה המרחבית לערר.md", "Response - Committee"),
    # NOTE(review): the existing file name below says "כתב-ערר" (appeal) while
    # the extracted file name and label say "response" — confirm this pairing.
    ("2025-10-22-כתב-ערר-מטמון.md", "תשובת המשיב-יצחק מטמון.md", "Response - Matmon"),
]
|
|
|
|
|
|
def normalize(text: str) -> str:
    """Return *text* with every line trimmed and blank lines dropped.

    The text is stripped as a whole, split on ``"\\n"``, and each piece is
    stripped of surrounding whitespace. Pieces that end up empty are
    discarded and the rest are re-joined with single newlines. Markdown
    markup inside the lines is left untouched.
    """
    trimmed = map(str.strip, text.strip().split("\n"))
    # filter(None, ...) keeps only the non-empty (truthy) trimmed lines.
    return "\n".join(filter(None, trimmed))
|
|
|
|
|
|
def word_overlap(a: str, b: str) -> float:
    """Return the share of distinct words common to *a* and *b*.

    Both strings are split on whitespace into sets of distinct words. The
    result is the number of shared words divided by the size of the larger
    set, or ``0.0`` when either string contains no words.
    """
    vocab_a, vocab_b = set(a.split()), set(b.split())
    if not (vocab_a and vocab_b):
        return 0.0
    shared = len(vocab_a.intersection(vocab_b))
    larger = max(len(vocab_a), len(vocab_b))
    return shared / larger
|
|
|
|
|
|
def main():
    """Print a comparison report for every pair listed in ``PAIRS``.

    For each (existing MD, extracted MD, label) entry the two files are read
    as UTF-8, normalized with ``normalize`` and compared. Per pair, the report
    shows character/word/line counts, the word-overlap ratio, a
    ``difflib.SequenceMatcher`` ratio over the first 5000 normalized
    characters, and up to three sample lines unique to each side. Pairs whose
    files are missing are reported with a ``SKIP`` line and left out of the
    final summary table.
    """
    rule = "=" * 70
    print(rule)
    print("COMPARISON: Existing MD vs Fresh PDF Extraction")
    print(f"{rule}\n")

    summary = []

    for existing_name, extracted_name, label in PAIRS:
        existing_path = DOCS_DIR / existing_name
        extracted_path = EXTRACTED_DIR / extracted_name

        if not existing_path.exists():
            print(f"SKIP: {existing_name} not found")
            continue
        if not extracted_path.exists():
            print(f"SKIP: {extracted_name} not found")
            continue

        existing_text = existing_path.read_text(encoding="utf-8")
        extracted_text = extracted_path.read_text(encoding="utf-8")

        existing_norm = normalize(existing_text)
        extracted_norm = normalize(extracted_text)

        # Stats (computed on the raw, un-normalized text)
        existing_chars = len(existing_text)
        extracted_chars = len(extracted_text)
        existing_words = len(existing_text.split())
        extracted_words = len(extracted_text.split())

        # Similarity
        overlap = word_overlap(existing_norm, extracted_norm)

        # Sequence matcher ratio (slower but more accurate).
        # Use first 5000 chars for speed. autojunk is disabled because, for
        # sequences of 200+ items, the default heuristic treats frequently
        # occurring items as junk — at character level that is most of the
        # alphabet, which distorts the ratio.
        sm = difflib.SequenceMatcher(
            None, existing_norm[:5000], extracted_norm[:5000], autojunk=False
        )
        seq_ratio = sm.ratio()

        # "".split("\n") yields [""], which would count an empty document as
        # one (empty) line and inject a phantom line into the diff sets, so
        # empty normalized text is treated as zero lines.
        existing_line_list = existing_norm.split("\n") if existing_norm else []
        extracted_line_list = extracted_norm.split("\n") if extracted_norm else []

        # Find lines in extracted but not in existing (new content), and the
        # reverse. Sets are used only for the difference, not for counting.
        existing_lines = set(existing_line_list)
        extracted_lines = set(extracted_line_list)
        new_lines = extracted_lines - existing_lines
        missing_lines = existing_lines - extracted_lines

        # Real line counts: repeated lines must not collapse into one.
        existing_line_count = len(existing_line_list)
        extracted_line_count = len(extracted_line_list)

        print(rule)
        print(f" {label}")
        print(f" Existing: {existing_name}")
        print(f" Extracted: {extracted_name}")
        print(rule)
        print(f" {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}")
        print(f" {'-' * 70}")
        print(f" {'Characters':<30} {existing_chars:>15,} {extracted_chars:>15,} {extracted_chars - existing_chars:>+10,}")
        print(f" {'Words':<30} {existing_words:>15,} {extracted_words:>15,} {extracted_words - existing_words:>+10,}")
        print(f" {'Lines':<30} {existing_line_count:>15,} {extracted_line_count:>15,} {extracted_line_count - existing_line_count:>+10,}")
        print(f" {'Word overlap':<30} {overlap:>15.1%}")
        print(f" {'Sequence similarity':<30} {seq_ratio:>15.1%}")
        print(f" {'Lines only in fresh PDF':<30} {len(new_lines):>15}")
        print(f" {'Lines only in existing MD':<30} {len(missing_lines):>15}")

        # Show sample differences (sorted so the sample is deterministic)
        if new_lines:
            print("\n Sample lines ONLY in fresh extraction (first 3):")
            for line in sorted(new_lines)[:3]:
                print(f" + {line[:100]}")
        if missing_lines:
            print("\n Sample lines ONLY in existing MD (first 3):")
            for line in sorted(missing_lines)[:3]:
                print(f" - {line[:100]}")

        print()

        summary.append({
            "label": label,
            "existing_words": existing_words,
            "extracted_words": extracted_words,
            "word_overlap": overlap,
            "seq_similarity": seq_ratio,
        })

    # Summary table (only pairs that were actually compared)
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f" {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}")
    print(f" {'-' * 67}")
    for s in summary:
        print(f" {s['label']:<25} {s['existing_words']:>10,} {s['extracted_words']:>10,} {s['word_overlap']:>10.1%} {s['seq_similarity']:>12.1%}")
|
|
|
|
|
|
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
|