Flatten cases directory structure and unify paths
- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
126
scripts/compare_extractions.py
Normal file
126
scripts/compare_extractions.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Compare existing MD files with freshly extracted text from PDFs."""
|
||||
|
||||
import difflib
|
||||
from pathlib import Path
|
||||
|
||||
# Folder with the existing Markdown documents of the case being checked.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Fresh PDF extractions are read from a sub-folder of DOCS_DIR.
EXTRACTED_DIR = DOCS_DIR.joinpath("extracted")

# One entry per document to compare:
#   (existing MD file name, extracted MD file name, label used in the report)
# NOTE(review): in the last entry the existing file name and the label/extracted
# name do not obviously describe the same document type - confirm the pairing.
PAIRS = [
    (
        "2025-08-14-כתב-ערר-קובר.md",
        "מרק קובר-כתב ערר.md",
        "Appeal - Kuber",
    ),
    (
        "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
        "תשובה לערר מטעם המשיבים.md",
        "Response - Livman",
    ),
    (
        "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
        "תשובת הועדה המרחבית לערר.md",
        "Response - Committee",
    ),
    (
        "2025-10-22-כתב-ערר-מטמון.md",
        "תשובת המשיב-יצחק מטמון.md",
        "Response - Matmon",
    ),
]
|
||||
|
||||
|
||||
def normalize(text: str) -> str:
    """Return *text* with every line trimmed and blank lines dropped.

    The input is split on newline characters, each line loses its leading
    and trailing whitespace, and lines that end up empty are discarded.
    Only whitespace is affected; any markup inside a line is kept as-is.
    """
    kept = []
    for raw_line in text.strip().split("\n"):
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)
|
||||
|
||||
|
||||
def word_overlap(a: str, b: str) -> float:
    """Return the share of distinct words that *a* and *b* have in common.

    Words are whitespace-separated tokens, de-duplicated per input. The
    number of shared words is divided by the size of the larger of the two
    vocabularies. If either input contains no words the result is 0.0.
    """
    vocab_a, vocab_b = set(a.split()), set(b.split())
    if not (vocab_a and vocab_b):
        return 0.0
    larger_size = max(len(vocab_a), len(vocab_b))
    return len(vocab_a.intersection(vocab_b)) / larger_size
|
||||
|
||||
|
||||
def main():
    """Print a comparison report for every document pair listed in PAIRS.

    For each (existing, extracted, label) entry the two files are read from
    DOCS_DIR and EXTRACTED_DIR as UTF-8. A pair is skipped with a "SKIP"
    message when either file does not exist. For the remaining pairs the
    function prints character/word/line counts, the word overlap, a
    character-level sequence similarity, and up to three sample lines that
    occur on only one side. A summary table over all compared pairs is
    printed at the end. Nothing is returned.
    """
    rule = "=" * 70

    print(rule)
    print("COMPARISON: Existing MD vs Fresh PDF Extraction")
    print(f"{rule}\n")

    summary = []

    for existing_name, extracted_name, label in PAIRS:
        existing_path = DOCS_DIR / existing_name
        extracted_path = EXTRACTED_DIR / extracted_name

        if not existing_path.exists():
            print(f"SKIP: {existing_name} not found")
            continue
        if not extracted_path.exists():
            print(f"SKIP: {extracted_name} not found")
            continue

        existing_text = existing_path.read_text(encoding="utf-8")
        extracted_text = extracted_path.read_text(encoding="utf-8")

        existing_norm = normalize(existing_text)
        extracted_norm = normalize(extracted_text)

        # Size statistics are taken from the raw (un-normalized) texts.
        existing_chars = len(existing_text)
        extracted_chars = len(extracted_text)
        existing_words = len(existing_text.split())
        extracted_words = len(extracted_text.split())

        overlap = word_overlap(existing_norm, extracted_norm)

        # Character-level similarity, limited to the first 5000 characters
        # of each side for speed. autojunk must be off: with 200+ elements
        # the default heuristic treats every character that makes up more
        # than 1% of the sequence as junk, which covers most letters and
        # spaces in ordinary text and distorts the ratio.
        matcher = difflib.SequenceMatcher(
            None,
            existing_norm[:5000],
            extracted_norm[:5000],
            autojunk=False,
        )
        seq_ratio = matcher.ratio()

        # Sets of distinct normalized lines; the "Lines" row below therefore
        # reports distinct lines, and the differences ignore ordering and
        # repetition.
        existing_lines = set(existing_norm.split("\n"))
        extracted_lines = set(extracted_norm.split("\n"))
        new_lines = extracted_lines - existing_lines
        missing_lines = existing_lines - extracted_lines

        print(rule)
        print(f" {label}")
        print(f" Existing: {existing_name}")
        print(f" Extracted: {extracted_name}")
        print(rule)
        print(f" {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}")
        print(f" {'-' * 70}")
        print(
            f" {'Characters':<30} {existing_chars:>15,} {extracted_chars:>15,}"
            f" {extracted_chars - existing_chars:>+10,}"
        )
        print(
            f" {'Words':<30} {existing_words:>15,} {extracted_words:>15,}"
            f" {extracted_words - existing_words:>+10,}"
        )
        print(
            f" {'Lines':<30} {len(existing_lines):>15,} {len(extracted_lines):>15,}"
            f" {len(extracted_lines) - len(existing_lines):>+10,}"
        )
        print(f" {'Word overlap':<30} {overlap:>15.1%}")
        print(f" {'Sequence similarity':<30} {seq_ratio:>15.1%}")
        print(f" {'Lines only in fresh PDF':<30} {len(new_lines):>15}")
        print(f" {'Lines only in existing MD':<30} {len(missing_lines):>15}")

        # Show a few sample differences; sorting makes the choice of
        # samples deterministic even though the lines come from sets.
        if new_lines:
            print("\n Sample lines ONLY in fresh extraction (first 3):")
            for line in sorted(new_lines)[:3]:
                print(f" + {line[:100]}")
        if missing_lines:
            print("\n Sample lines ONLY in existing MD (first 3):")
            for line in sorted(missing_lines)[:3]:
                print(f" - {line[:100]}")

        print()

        summary.append({
            "label": label,
            "existing_words": existing_words,
            "extracted_words": extracted_words,
            "word_overlap": overlap,
            "seq_similarity": seq_ratio,
        })

    # Summary table over all pairs that were actually compared.
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f" {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}")
    print(f" {'-' * 67}")
    for s in summary:
        print(
            f" {s['label']:<25} {s['existing_words']:>10,} {s['extracted_words']:>10,}"
            f" {s['word_overlap']:>10.1%} {s['seq_similarity']:>12.1%}"
        )
|
||||
|
||||
|
||||
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user