Files
legal-ai/scripts/.archive/proofread_training_corpus.py
Chaim 5c9a5d702a Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh,
notify.py, bidi_table.py

Archived (17): one-time migration/seeding scripts whose functionality
is now in MCP server or web API. Moved to scripts/.archive/

Deleted (5): zero-value scripts (duplicates, hardcoded single-case,
debug scripts)

Added scripts/SCRIPTS.md — registry of all scripts with purpose,
status, and what superseded them. CLAUDE.md updated with rule:
any script change requires SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 16:30:19 +00:00

383 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Proofread training corpus: strip Nevo additions from DOCX/PDF, output clean Markdown.
Nevo DOCX additions:
Front: ספרות / חקיקה שאוזכרה / מיני-רציו / topic tags / Nevo summary paragraphs
Back: 5129371512937154678313 / "בעניין עריכה ושינויים" link / "54678313-..." / "נוסח מסמך זה כפוף"
Nevo PDF additions:
"עמוד X מתוך Y" header on every page
PDF text extraction uses Google Cloud Vision OCR — PyMuPDF fragments Hebrew RTL
text unusably (words split mid-word, reading order broken). OCR gives clean output.
"""
from __future__ import annotations
import io
import os
import re
import sys
import time
from pathlib import Path
import fitz
from docx import Document
# Load GOOGLE_CLOUD_VISION_API_KEY from ~/.env if not already set.
# This runs at import time; a value already present in the environment wins.
if not os.environ.get("GOOGLE_CLOUD_VISION_API_KEY"):
    env_path = Path.home() / ".env"
    if env_path.exists():
        # Decode explicitly as UTF-8 so parsing does not depend on the locale.
        for line in env_path.read_text(encoding="utf-8").splitlines():
            # Tolerate indentation and a shell-style "export " prefix.
            line = line.strip()
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            if line.startswith("GOOGLE_CLOUD_VISION_API_KEY="):
                # Keep everything after the first "=", minus surrounding quotes.
                os.environ["GOOGLE_CLOUD_VISION_API_KEY"] = (
                    line.split("=", 1)[1].strip().strip('"').strip("'")
                )
                break
from google.cloud import vision # noqa: E402
# Folder scanned by main() for source .docx/.pdf files (hard-coded absolute path).
TRAINING_DIR = Path("/home/chaim/legal-ai/data/training")
# Destination folder for the cleaned Markdown files written by main().
OUTPUT_DIR = TRAINING_DIR / "proofread"
# Created by main(); nothing else in the visible code reads or writes it.
RAW_DIR = TRAINING_DIR / "raw"
# ── Nevo pattern detection ────────────────────────────────────────
# Paragraph prefixes that identify Nevo's front-matter block. find_decision_start()
# checks the first 10 paragraphs against these with str.startswith.
NEVO_PREAMBLE_HEADERS = (
"ספרות:",
"חקיקה שאוזכרה:",
"מיני-רציו:",
)
# Strong decision-opening patterns — highly distinctive first words of real decision
# body. These rarely appear inside Nevo's own summary block, so first match wins.
# Anchored with "^": callers match against the stripped paragraph text.
DECISION_OPENING = re.compile(
r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)"
)
# Section headers that definitively mark decision body start.
# Compared by exact equality with the stripped paragraph text.
DECISION_SECTION_HEADERS = {
"רקע",
"פתח דבר",
"תמצית טענות הצדדים",
"העובדות",
"הרקע העובדתי",
"מבוא",
}
# Nevo postamble markers — everything from first match onwards is stripped.
# Matched as substrings (``marker in text``), not as whole paragraphs.
NEVO_POSTAMBLE_MARKERS = (
"5129371512937154678313",
"בעניין עריכה ושינויים במסמכי פסיקה",
"נוסח מסמך זה כפוף לשינויי ניסוח ועריכה",
)
# Nevo inline watermark codes — appear as prefixes embedded in real paragraphs
# (e.g. "5129371ניתנה פה אחד" or "054678313האם ההיתר..."). These must be
# stripped from paragraph content, not used as postamble boundaries.
# The pattern is anchored at the start and also consumes any digits that
# immediately follow the code (the trailing ``\d*``).
NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*")
# Nevo PDF page header: "עמוד X מתוך Y" or "עמוד X בן Y" (Hebrew variants)
# Optional newlines are allowed between the tokens; surrounding whitespace is
# consumed as part of the match.
PDF_PAGE_HEADER_RE = re.compile(
r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*"
)
# Short orphan lines starting with "עמוד" — OCR artifacts from merged footer text
# (e.g. "עמודירבי", "עמוד :", "עמודי", "עמוד ר"). Conservative: up to 12 chars.
PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$")
# "עמוד" followed by number (with optional garbled Nevo URL line after)
# NOTE(review): not referenced by any function in the visible code; re.UNICODE is
# already the default for str patterns.
PDF_PAGE_BLOCK_RE = re.compile(
r"(?m)^\s*עמוד\s*\n\s*\d+[·.]?\s*\n[^\n]*\n", re.UNICODE
)
# Standalone "עמוד N" at line start
PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.]?\s*$")
# Nevo watermark URL (and common OCR-garbled variants)
# Case-insensitive; matches anywhere in the text, not only on whole lines.
NEVO_URL_RE = re.compile(
r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)",
re.IGNORECASE,
)
def find_decision_start(paragraphs: list[str]) -> int:
    """Return the index of the first real decision paragraph, skipping Nevo preamble.

    Strategy:
      1. If none of the first 10 paragraphs starts with a Nevo preamble header,
         return 0 (nothing to strip).
      2. Otherwise scan from the first Nevo header onwards for the first paragraph
         that equals one of DECISION_SECTION_HEADERS or matches DECISION_OPENING.
      3. Fallback: find the first paragraph containing "קבעה כלהלן" (or
         "קבעה את הדברים הבאים") and return the first of the following 14
         paragraphs that is longer than 80 characters and is not a "*" bullet.
      4. Last resort: drop the first 10 paragraphs (or all but the last one when
         the document is shorter).

    Args:
        paragraphs: Non-empty paragraph texts in document order.

    Returns:
        Inclusive start index of the decision body within ``paragraphs``.
    """
    first_header = next(
        (
            i
            for i, p in enumerate(paragraphs[:10])
            if any(p.startswith(h) for h in NEVO_PREAMBLE_HEADERS)
        ),
        None,
    )
    if first_header is None:
        return 0

    # Scan for strong decision-opening markers. Start at the first Nevo header:
    # a match in the title block *before* the headers would otherwise return an
    # index that leaves the entire preamble in the output.
    for i in range(first_header, len(paragraphs)):
        stripped = paragraphs[i].strip()
        if stripped in DECISION_SECTION_HEADERS:
            return i
        if DECISION_OPENING.match(stripped):
            return i

    # Fallback: find the summary lead-in and take the first long paragraph after
    # the bullet block that follows it.
    for i in range(first_header, len(paragraphs)):
        p = paragraphs[i]
        if "קבעה כלהלן" in p or "קבעה את הדברים הבאים" in p:
            for j in range(i + 1, min(i + 15, len(paragraphs))):
                candidate = paragraphs[j]
                if len(candidate) > 80 and not candidate.strip().startswith("*"):
                    return j
            # Only the first lead-in is considered.
            break

    # Last resort: strip only the first 10 paragraphs of preamble.
    return min(10, len(paragraphs) - 1)
def find_decision_end(paragraphs: list[str]) -> int:
    """Return the exclusive end index of the decision body.

    This is the position of the first paragraph that contains any entry of
    NEVO_POSTAMBLE_MARKERS, or ``len(paragraphs)`` when no paragraph does.
    """
    return next(
        (
            position
            for position, text in enumerate(paragraphs)
            if any(marker in text for marker in NEVO_POSTAMBLE_MARKERS)
        ),
        len(paragraphs),
    )
# ── DOCX proofreading ─────────────────────────────────────────────
def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]:
    """Remove Nevo inline watermark codes from paragraph prefixes.

    Args:
        paragraphs: Paragraph texts to clean.

    Returns:
        A new list with the leading watermark code removed from each paragraph
        and surrounding whitespace trimmed. Paragraphs that are empty after
        cleaning (pure-code paragraphs) are dropped.
    """
    out: list[str] = []
    for p in paragraphs:
        # NEVO_INLINE_CODE_RE is anchored at the start of the string, so trim the
        # paragraph first; otherwise leading whitespace hides the code from it.
        stripped = NEVO_INLINE_CODE_RE.sub("", p.strip()).strip()
        if stripped:
            out.append(stripped)
    return out
def proofread_docx(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo DOCX.

    Args:
        path: Location of the .docx file.

    Returns:
        ``(markdown, stats)`` where ``markdown`` is the cleaned paragraphs joined
        by blank lines and ``stats`` holds the paragraph counts: total,
        stripped from the front, stripped from the back, and kept.
    """
    doc = Document(str(path))
    # Only body paragraphs with visible text are considered.
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    start = find_decision_start(paragraphs)
    # Look for the postamble only after the decision start. A marker sitting
    # inside the stripped preamble would otherwise give end < start, i.e. an
    # empty document and a "postamble_stripped" count covering the whole body.
    end = start + find_decision_end(paragraphs[start:])
    clean = _strip_inline_nevo_codes(paragraphs[start:end])
    md = "\n\n".join(clean)
    return md, {
        "total_paragraphs": len(paragraphs),
        "preamble_stripped": start,
        "postamble_stripped": len(paragraphs) - end,
        "clean_paragraphs": len(clean),
    }
# ── PDF proofreading (Google Vision OCR) ──────────────────────────
# Module-wide Vision client, built on first use by _get_vision_client().
_vision_client: vision.ImageAnnotatorClient | None = None


def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the shared Vision client, constructing it on the first call.

    Raises:
        RuntimeError: if GOOGLE_CLOUD_VISION_API_KEY is unset or empty when the
            client has to be created.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client

    key = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY")
    if not key:
        raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set")

    options = {"api_key": key}
    _vision_client = vision.ImageAnnotatorClient(client_options=options)
    return _vision_client
# Hebrew abbreviation quote fixes — Google Vision renders ״ as 'יי'
# Maps the OCR spelling (key) to the abbreviation with a straight double quote
# restored (value). _fix_hebrew_quotes() applies these to OCR text.
# NOTE(review): keys are matched as plain substrings (the compiled pattern has no
# word boundaries), so an ordinary word that contains a key — e.g. "דיירים"
# contains "דייר" — would presumably be rewritten as well; confirm on real output.
_HEBREW_ABBREV_FIXES: dict[str, str] = {
"עוהייד": 'עוה"ד',
"עוייד": 'עו"ד',
"הנייל": 'הנ"ל',
"מצייב": 'מצ"ב',
"ביהמייש": 'ביהמ"ש',
"תייז": 'ת"ז',
"עייי": 'ע"י',
"אחייכ": 'אח"כ',
"סייק": 'ס"ק',
"דייר": 'ד"ר',
"חווייד": 'חוו"ד',
"מייר": 'מ"ר',
"יחייד": 'יח"ד',
"בייכ": 'ב"כ',
"בייה": 'ב"ה',
"שייח": 'ש"ח',
"יוייר": 'יו"ר',
"בליימ": 'בל"מ',
"תבייע": 'תב"ע',
"תמייא": 'תמ"א',
"סייה": 'ס"ה',
"שייפ": 'ש"פ',
"שצייפ": 'שצ"פ',
"שבייצ": 'שב"צ',
"עסיים": 'עס"ם',
"הייה": 'ה"ה',
"פסייד": 'פס"ד',
"תיידא": 'תיד"א',
"בגייץ": 'בג"ץ',
"עתיים": 'עת"ם',
"עעיים": 'עע"ם',
# Hebrew calendar day prefixes (כ"א .. כ"ט etc.)
"כייא": 'כ"א', "כייב": 'כ"ב', "כייג": 'כ"ג', "כייד": 'כ"ד',
"כייה": 'כ"ה', "כייו": 'כ"ו', "כייז": 'כ"ז', "כייח": 'כ"ח', "כייט": 'כ"ט',
"לייא": 'ל"א',
"יייא": 'י"א', "יייב": 'י"ב', "יייג": 'י"ג', "יייד": 'י"ד',
"טייו": 'ט"ו', "טייז": 'ט"ז', "יייז": 'י"ז', "יייח": 'י"ח', "יייט": 'י"ט',
# Hebrew calendar years (תשפ"ה, תשפ"ד...)
"תשפייא": 'תשפ"א', "תשפייב": 'תשפ"ב', "תשפייג": 'תשפ"ג',
"תשפייד": 'תשפ"ד', "תשפייה": 'תשפ"ה', "תשפייו": 'תשפ"ו',
"תשפיין": 'תשפ"ן',
}
# Single alternation of all keys, each escaped. Keys are sorted longest-first so
# that at any position a longer key is tried before a shorter one.
_ABBREV_PATTERN = re.compile(
"|".join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
)
def _fix_hebrew_quotes(text: str) -> str:
    """Replace every OCR-mangled abbreviation in *text* with its quoted form.

    Each match of _ABBREV_PATTERN is looked up in _HEBREW_ABBREV_FIXES and
    substituted; all other text is returned unchanged.
    """

    def _corrected(match):
        # The matched text is always one of the table's keys.
        return _HEBREW_ABBREV_FIXES[match.group()]

    return _ABBREV_PATTERN.sub(_corrected, text)
def _ocr_page_image(image_bytes: bytes, page_num: int) -> str:
    """OCR one rendered page image and return its text with abbreviations fixed.

    Args:
        image_bytes: Encoded image content passed to Vision as-is.
        page_num: Page number, used only in the error message.

    Raises:
        RuntimeError: if the Vision response carries an error message.
    """
    client = _get_vision_client()
    request = {
        "image": vision.Image(content=image_bytes),
        # Hint the recogniser that the page is Hebrew.
        "image_context": vision.ImageContext(language_hints=["he"]),
    }
    response = client.document_text_detection(**request)

    if response.error.message:
        raise RuntimeError(f"Vision error page {page_num}: {response.error.message}")

    if response.full_text_annotation:
        recognised = response.full_text_annotation.text
    else:
        recognised = ""
    return _fix_hebrew_quotes(recognised)
# Whole-line pattern (anchored with ^...$) for lines that count as footer junk.
# _clean_page_text() applies it to stripped lines, walking back from the page end.
_FOOTER_JUNK_RE = re.compile(
r"^("
r"\s*|" # blank
r"[-·*.\"\'׳״]+|" # stray punctuation
r"\d{1,3}[\s\-·*.\"\'׳״]*|" # page number with any stray char
r"עמוד[\s\d\-·*.\"\'׳״]*|" # "עמוד" / "עמוד N" w/ trailing noise
r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}" # garbled latin (nevo URL variants)
r")$"
)
def _clean_page_text(text: str) -> str:
    """Remove Nevo page headers, footers and watermarks from one page of OCR text.

    The Nevo footer that OCR leaves at the end of a page typically reads:
        עמוד
        N              (sometimes missing, or with a stray mark such as "N*")
        nevo.co.il     (often garbled, e.g. "new coal", "neto coal")
        -              (optional stray dash)

    Cleaning runs in three passes:
      1. replace every "עמוד X מתוך Y" header with a newline;
      2. drop trailing lines for as long as they match _FOOTER_JUNK_RE;
      3. delete remaining Nevo URLs, "עמוד N" lines and short "עמוד…" orphans.

    Returns:
        The cleaned page text with leading/trailing whitespace removed.
    """
    # Pass 1: page headers, wherever they occur.
    rows = PDF_PAGE_HEADER_RE.sub("\n", text).split("\n")

    # Pass 2: find how many leading rows survive once the junk tail is cut off.
    keep = len(rows)
    while keep > 0 and _FOOTER_JUNK_RE.match(rows[keep - 1].strip()):
        keep -= 1
    cleaned = "\n".join(rows[:keep])

    # Pass 3: leftovers in the middle of the page, applied in this order.
    for leftover in (NEVO_URL_RE, PDF_PAGE_NUM_LINE_RE, PDF_PAGE_ORPHAN_RE):
        cleaned = leftover.sub("", cleaned)
    return cleaned.strip()
def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Each page is rendered at 300 dpi, OCR'd and cleaned with _clean_page_text();
    the non-empty pages are joined by blank lines and the text is cut at the
    earliest Nevo postamble marker.

    Args:
        path: Location of the .pdf file.

    Returns:
        ``(body, stats)`` where ``stats`` has the number of pages processed and
        the length of ``body`` in characters.

    Raises:
        RuntimeError: propagated from OCR (missing API key or Vision error).
    """
    doc = fitz.open(str(path))
    pages: list[str] = []
    try:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            text = _ocr_page_image(img_bytes, i + 1)
            pages.append(_clean_page_text(text))
            # Small delay between API calls to be safe
            time.sleep(0.1)
    finally:
        # Release the document even when OCR fails part-way through.
        doc.close()

    body = "\n\n".join(p for p in pages if p)
    # Strip trailing blanks first: whitespace-only lines then become empty lines
    # and are folded by the blank-line collapse that follows.
    body = re.sub(r"[ \t]+\n", "\n", body)
    body = re.sub(r"\n{3,}", "\n\n", body)

    # Cut at whichever marker occurs earliest in the text, not at whichever
    # marker happens to be listed first in NEVO_POSTAMBLE_MARKERS.
    hits = [idx for idx in (body.find(m) for m in NEVO_POSTAMBLE_MARKERS) if idx != -1]
    if hits:
        body = body[: min(hits)].rstrip()

    return body, {
        "pages": len(pages),
        "chars": len(body),
    }
# ── Orchestration ─────────────────────────────────────────────────
# File names (exact match on Path.name) that main() never processes, even though
# they live in TRAINING_DIR. "README.md" would also be rejected by main()'s
# .docx/.pdf suffix filter.
SKIP_FILES = {
"הכנת שאלות מחקר.docx",
"סוכן_מנתח_ומחקר_משפטי_Paperclip_מדריך.docx",
"README.md",
}
def output_filename(src: Path) -> str:
    """Return the Markdown file name for *src*.

    The source file's stem is kept exactly as it is (spaces and Hebrew
    included); only the extension is replaced by ``.md``.
    """
    return src.stem + ".md"
def main(argv: list[str]) -> int:
    """Proofread the DOCX/PDF files directly under TRAINING_DIR into OUTPUT_DIR.

    Directories, dot-files, names in SKIP_FILES and files with any extension
    other than .docx/.pdf are ignored. Processing is best-effort: a failure on
    one file is reported and the remaining files are still processed.

    Note: the output name is the source stem plus ".md", so a .docx and a .pdf
    sharing a stem write to the same output file.

    Args:
        argv: Full argument vector (``sys.argv``). Entries after the first are
            exact file names; when present, only those files are processed.

    Returns:
        0 when every selected file was converted; 1 when at least one file
        failed or a requested file name matched nothing.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    RAW_DIR.mkdir(exist_ok=True)

    # Filter files
    only = set(argv[1:])
    files: list[Path] = []
    for p in sorted(TRAINING_DIR.iterdir()):
        if p.is_dir() or p.name.startswith("."):
            continue
        if p.name in SKIP_FILES:
            continue
        if p.suffix.lower() not in (".docx", ".pdf"):
            continue
        if only and p.name not in only:
            continue
        files.append(p)

    # Report requested names that selected nothing instead of ignoring them.
    missing = sorted(only - {p.name for p in files})
    for name in missing:
        print(f"{name}: not found among processable files in {TRAINING_DIR}")

    print(f"Processing {len(files)} files...\n")
    failed = len(missing)
    for path in files:
        try:
            if path.suffix.lower() == ".docx":
                md, stats = proofread_docx(path)
            else:
                md, stats = proofread_pdf(path)
            out_path = OUTPUT_DIR / output_filename(path)
            out_path.write_text(md, encoding="utf-8")
            print(f"{path.name}")
            print(f"{out_path.name} ({len(md):,} chars) {stats}")
        except Exception as e:
            # Deliberately broad: one bad file must not stop the batch. The
            # failure is still counted so the exit status reflects it.
            failed += 1
            print(f"{path.name}: {e}")
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))