Add training corpus UI with Nevo proofreading pipeline

- New proofreader service strips Nevo editorial additions (front matter,
  postamble, page headers, watermarks, inline codes) from DOCX/PDF/MD
- PDF pages use Google Vision OCR for clean Hebrew RTL extraction
- New training page at #/training with drag-and-drop upload, automatic
  metadata extraction (decision number, date, categories), reviewable
  preview, and style pattern report grouped by type
- API endpoints: /api/training/{analyze,upload,corpus,patterns,
  analyze-style,analyze-style/status}
- Fix claude_session.query to pipe prompt via stdin, avoiding ARG_MAX
  overflow when analyzing 900K+ char corpus
- CLI scripts for batch proofreading and corpus upload

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-11 11:04:58 +00:00
parent ecda95d610
commit 32f18de049
6 changed files with 1960 additions and 3 deletions

View File

@@ -24,6 +24,9 @@ LONG_TIMEOUT = 300 # For complex tasks like block writing
def query(prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1) -> str:
"""Send a prompt to Claude Code headless and return the text response.
Passes the prompt via stdin (not argv) to avoid the OS ARG_MAX limit —
prompts can be 500K+ chars when analyzing a full style corpus.
Args:
prompt: The prompt to send.
timeout: Max seconds to wait.
@@ -36,14 +39,18 @@ def query(prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1) -> st
RuntimeError: If claude CLI is not available or fails.
"""
cmd = [
"claude", "-p", prompt,
"claude", "-p",
"--output-format", "json",
"--max-turns", str(max_turns),
]
try:
result = subprocess.run(
cmd, capture_output=True, text=True, timeout=timeout,
cmd,
input=prompt,
capture_output=True,
text=True,
timeout=timeout,
)
except FileNotFoundError:
raise RuntimeError("Claude CLI not found. Install Claude Code or add 'claude' to PATH.")

View File

@@ -0,0 +1,404 @@
"""Nevo proofreading service for training corpus.
Strips Nevo editorial additions (front matter, back matter, page headers,
watermarks, inline watermark codes) from legal decision DOCX/PDF/MD files.
Also extracts metadata (decision number, date, subject categories) via
heuristics on cleaned text.
Used by:
* CLI script: scripts/proofread_training_corpus.py
* Web API: /api/training/analyze
"""
from __future__ import annotations
import asyncio
import re
import time
from datetime import date as date_type
from pathlib import Path
from typing import Any
import fitz
from docx import Document
from google.cloud import vision
from legal_mcp import config
# ── Nevo pattern detection ────────────────────────────────────────
# Paragraph prefixes that identify a Nevo-added preamble. _find_decision_start
# only looks for a decision opening when one of the first 10 paragraphs starts
# with any of these.
NEVO_PREAMBLE_HEADERS = (
"ספרות:",
"חקיקה שאוזכרה:",
"מיני-רציו:",
)
# Phrases matched at the very start of a stripped paragraph; a match marks the
# first paragraph of the decision itself.
DECISION_OPENING = re.compile(
r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)"
)
# Section titles compared for exact equality with a stripped paragraph; a hit
# is also treated as the start of the decision.
DECISION_SECTION_HEADERS = {
"רקע",
"פתח דבר",
"תמצית טענות הצדדים",
"העובדות",
"הרקע העובדתי",
"מבוא",
}
# Substrings that mark the beginning of trailing Nevo material. The DOCX and
# PDF paths both drop everything from a matching position onward.
NEVO_POSTAMBLE_MARKERS = (
"5129371512937154678313",
"בעניין עריכה ושינויים במסמכי פסיקה",
"נוסח מסמך זה כפוף לשינויי ניסוח ועריכה",
)
# Numeric code at the start of a paragraph: optional leading 0, one of two
# known digit prefixes, then any further digits.
NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*")
# "עמוד N מתוך M" / "עמוד N בן M" page header, tolerating line breaks between
# the individual pieces.
PDF_PAGE_HEADER_RE = re.compile(
r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*"
)
# A whole line that starts with "עמוד" followed by at most 12 more characters.
PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$")
# A "עמוד <number>" line, optionally followed by one of "·", "." or "*".
PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.*]?\s*$")
# The Nevo domain plus several look-alike spellings, matched case-insensitively
# (presumably OCR misreadings of the watermark URL — confirm against samples).
NEVO_URL_RE = re.compile(
r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)",
re.IGNORECASE,
)
# Full-line match for text treated as page-footer junk. Alternatives, in order:
# blank/whitespace, a run of punctuation, a 1-3 digit number with trailing
# punctuation, "עמוד" followed by digits/punctuation, or a short Latin fragment
# (a letter followed by up to 30 Latin/digit/punctuation characters).
_FOOTER_JUNK_RE = re.compile(
r"^("
r"\s*|"
r"[-·*.\"\'׳״]+|"
r"\d{1,3}[\s\-·*.\"\'׳״]*|"
r"עמוד[\s\d\-·*.\"\'׳״]*|"
r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}"
r")$"
)
# Hebrew abbreviation quote fixes — Google Vision renders ״ as 'יי'
# Maps the OCR spelling (with "יי" where the quote mark should be) to the
# intended abbreviation containing an ASCII double quote.
_HEBREW_ABBREV_FIXES: dict[str, str] = {
"עוהייד": 'עוה"ד', "עוייד": 'עו"ד', "הנייל": 'הנ"ל', "מצייב": 'מצ"ב',
"ביהמייש": 'ביהמ"ש', "תייז": 'ת"ז', "עייי": 'ע"י', "אחייכ": 'אח"כ',
"סייק": 'ס"ק', "דייר": 'ד"ר', "חווייד": 'חוו"ד', "מייר": 'מ"ר',
"יחייד": 'יח"ד', "בייכ": 'ב"כ', "בייה": 'ב"ה', "שייח": 'ש"ח',
"יוייר": 'יו"ר', "בליימ": 'בל"מ', "תבייע": 'תב"ע', "תמייא": 'תמ"א',
"סייה": 'ס"ה', "שייפ": 'ש"פ', "שצייפ": 'שצ"פ', "שבייצ": 'שב"צ',
"עסיים": 'עס"ם', "הייה": 'ה"ה', "פסייד": 'פס"ד', "תיידא": 'תיד"א',
"בגייץ": 'בג"ץ', "עתיים": 'עת"ם', "עעיים": 'עע"ם',
"כייא": 'כ"א', "כייב": 'כ"ב', "כייג": 'כ"ג', "כייד": 'כ"ד',
"כייה": 'כ"ה', "כייו": 'כ"ו', "כייז": 'כ"ז', "כייח": 'כ"ח', "כייט": 'כ"ט',
"לייא": 'ל"א',
"יייא": 'י"א', "יייב": 'י"ב', "יייג": 'י"ג', "יייד": 'י"ד',
"טייו": 'ט"ו', "טייז": 'ט"ז', "יייז": 'י"ז', "יייח": 'י"ח', "יייט": 'י"ט',
"תשפייא": 'תשפ"א', "תשפייב": 'תשפ"ב', "תשפייג": 'תשפ"ג',
"תשפייד": 'תשפ"ד', "תשפייה": 'תשפ"ה', "תשפייו": 'תשפ"ו',
"תשפיין": 'תשפ"ן',
}
# Single alternation over all keys. Keys are sorted longest-first so that, at
# any position, a longer key is tried before a shorter key that is its prefix.
_ABBREV_PATTERN = re.compile(
"|".join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
)
def _fix_hebrew_quotes(text: str) -> str:
    """Restore the quote mark in known Hebrew abbreviations.

    Every substring matched by ``_ABBREV_PATTERN`` is replaced with the value
    stored under that exact substring in ``_HEBREW_ABBREV_FIXES``; the rest of
    the text is returned unchanged.
    """

    def _restore(hit: re.Match) -> str:
        # The pattern is built from the table's keys, so the lookup always hits.
        return _HEBREW_ABBREV_FIXES[hit.group(0)]

    return _ABBREV_PATTERN.sub(_restore, text)
# ── Google Vision OCR ────────────────────────────────────────────
_vision_client: vision.ImageAnnotatorClient | None = None


def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the module-level Vision client, building it on first use.

    Raises:
        RuntimeError: If ``config.GOOGLE_CLOUD_VISION_API_KEY`` is empty.
    """
    global _vision_client
    # Fast path: a client was already created by an earlier call.
    if _vision_client is not None:
        return _vision_client

    api_key = config.GOOGLE_CLOUD_VISION_API_KEY
    if not api_key:
        raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set")

    _vision_client = vision.ImageAnnotatorClient(
        client_options={"api_key": api_key},
    )
    return _vision_client
def _ocr_page_image(image_bytes: bytes, page_num: int) -> str:
    """Run Vision document text detection on one page image.

    Args:
        image_bytes: Encoded image content sent to the Vision API.
        page_num: Page number, used only in the error message.

    Returns:
        The detected text after ``_fix_hebrew_quotes``; an empty string when
        the response carries no full-text annotation.

    Raises:
        RuntimeError: If the response reports an error message, or if the
            Vision client cannot be created.
    """
    annotator = _get_vision_client()
    request_image = vision.Image(content=image_bytes)
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    response = annotator.document_text_detection(
        image=request_image,
        image_context=hebrew_hint,
    )

    failure = response.error.message
    if failure:
        raise RuntimeError(f"Vision error page {page_num}: {failure}")

    annotation = response.full_text_annotation
    raw_text = annotation.text if annotation else ""
    return _fix_hebrew_quotes(raw_text)
# ── DOCX proofreading ────────────────────────────────────────────
def _find_decision_start(paragraphs: list[str]) -> int:
    """Return the index of the first paragraph that belongs to the decision.

    Stages, in order:
      1. If none of the first 10 paragraphs starts with a Nevo preamble
         header, nothing is skipped and 0 is returned.
      2. Otherwise the first paragraph that (stripped) equals a known section
         header or matches ``DECISION_OPENING`` wins.
      3. Otherwise, after the first paragraph containing one of two lead-in
         phrases, the first of the following paragraphs (up to index + 14)
         that is longer than 80 characters and does not start with "*" wins.
      4. Otherwise fall back to index 10, clamped to the last paragraph.
    """
    # str.startswith accepts the whole tuple of headers in one call.
    if not any(p.startswith(NEVO_PREAMBLE_HEADERS) for p in paragraphs[:10]):
        return 0

    for index, raw in enumerate(paragraphs):
        candidate = raw.strip()
        if candidate in DECISION_SECTION_HEADERS:
            return index
        if DECISION_OPENING.match(candidate):
            return index

    # Only the first lead-in paragraph is considered, as an anchor.
    anchor = next(
        (
            index
            for index, raw in enumerate(paragraphs)
            if "קבעה כלהלן" in raw or "קבעה את הדברים הבאים" in raw
        ),
        None,
    )
    if anchor is not None:
        window_end = min(anchor + 15, len(paragraphs))
        for index in range(anchor + 1, window_end):
            body = paragraphs[index]
            if len(body) > 80 and not body.strip().startswith("*"):
                return index

    return min(10, len(paragraphs) - 1)
def _find_decision_end(paragraphs: list[str]) -> int:
    """Return the exclusive end index of the decision body.

    That is the index of the first paragraph containing any Nevo postamble
    marker, or ``len(paragraphs)`` when no paragraph contains one.
    """
    return next(
        (
            index
            for index, paragraph in enumerate(paragraphs)
            if any(marker in paragraph for marker in NEVO_POSTAMBLE_MARKERS)
        ),
        len(paragraphs),
    )
def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]:
    """Remove a leading Nevo numeric code from each paragraph.

    Each paragraph has the code (if present at its start) removed and is then
    stripped of surrounding whitespace; paragraphs left empty are dropped.
    """
    cleaned = (NEVO_INLINE_CODE_RE.sub("", paragraph).strip() for paragraph in paragraphs)
    return [paragraph for paragraph in cleaned if paragraph]
def proofread_docx(path: Path) -> tuple[str, dict]:
    """Extract the decision body from a Nevo DOCX file.

    Non-blank paragraphs are read from the document, the preamble and
    postamble are cut off, and leading inline codes are removed from what is
    left.

    Returns:
        ``(markdown, stats)`` where ``markdown`` is the remaining paragraphs
        joined by blank lines and ``stats`` records the paragraph counts.
    """
    document = Document(str(path))
    non_empty = [para.text for para in document.paragraphs if para.text.strip()]

    first = _find_decision_start(non_empty)
    stop = _find_decision_end(non_empty)
    body = _strip_inline_nevo_codes(non_empty[first:stop])

    stats = {
        "source_type": "docx",
        "total_paragraphs": len(non_empty),
        "preamble_stripped": first,
        "postamble_stripped": len(non_empty) - stop,
        "clean_paragraphs": len(body),
    }
    return "\n\n".join(body), stats
# ── PDF proofreading ─────────────────────────────────────────────
def _clean_page_text(text: str) -> str:
    """Remove page headers, footer junk and Nevo URLs from one OCR'd page.

    Steps, in order: replace page headers with a newline, drop trailing lines
    that match the footer-junk pattern, then delete Nevo URLs, page-number
    lines and orphaned "עמוד" lines. The result is stripped.
    """
    without_headers = PDF_PAGE_HEADER_RE.sub("\n", text)

    rows = without_headers.split("\n")
    # Peel junk off the bottom only; stop at the first real line.
    while rows and _FOOTER_JUNK_RE.match(rows[-1].strip()):
        del rows[-1]

    cleaned = "\n".join(rows)
    for pattern in (NEVO_URL_RE, PDF_PAGE_NUM_LINE_RE, PDF_PAGE_ORPHAN_RE):
        cleaned = pattern.sub("", cleaned)
    return cleaned.strip()
async def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Each page is rendered to a 300 dpi PNG, OCR'd in a worker thread and
    cleaned with ``_clean_page_text``. The non-empty pages are joined by blank
    lines, whitespace is normalised, and the text is cut at the earliest Nevo
    postamble marker.

    Args:
        path: Location of the PDF file.

    Returns:
        ``(body, stats)`` where ``stats`` holds the source type, the number of
        pages processed and the length of ``body`` in characters.

    Raises:
        RuntimeError: Propagated from ``_ocr_page_image`` when OCR fails.
    """
    pages: list[str] = []
    doc = fitz.open(str(path))
    try:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            # The OCR call blocks, so it runs off the event loop.
            text = await asyncio.to_thread(_ocr_page_image, img_bytes, page_num)
            pages.append(_clean_page_text(text))
            # Short pause between pages (presumably to throttle Vision requests).
            await asyncio.sleep(0.1)
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()

    body = "\n\n".join(p for p in pages if p)
    body = re.sub(r"\n{3,}", "\n\n", body)
    body = re.sub(r"[ \t]+\n", "\n", body)

    # Cut at whichever marker appears earliest in the text, not whichever is
    # listed first; otherwise trailer text before a later-listed marker would
    # survive. This matches the DOCX path, which cuts at the earliest marker
    # paragraph.
    cut_points = [
        pos for pos in (body.find(marker) for marker in NEVO_POSTAMBLE_MARKERS) if pos != -1
    ]
    if cut_points:
        body = body[: min(cut_points)].rstrip()

    return body, {
        "source_type": "pdf",
        "pages": len(pages),
        "chars": len(body),
    }
# ── MD/TXT passthrough ───────────────────────────────────────────
def proofread_md(path: Path) -> tuple[str, dict]:
    """Return the UTF-8 text of a .md/.txt file unchanged, with basic stats.

    Returns:
        ``(text, stats)`` where ``stats`` holds the source type ``"md"`` and
        the character count of ``text``.
    """
    with path.open(encoding="utf-8") as handle:
        content = handle.read()
    stats = {"source_type": "md", "chars": len(content)}
    return content, stats
async def proofread(path: Path) -> tuple[str, dict]:
    """Dispatch to the proofreader matching the file's extension.

    The suffix is compared case-insensitively. ``.docx`` and ``.md``/``.txt``
    are handled synchronously; ``.pdf`` is awaited.

    Returns:
        ``(clean_text, stats)`` from the selected proofreader.

    Raises:
        ValueError: If the suffix is not one of the supported types.
    """
    extension = path.suffix.lower()
    if extension == ".pdf":
        return await proofread_pdf(path)
    if extension == ".docx":
        return proofread_docx(path)
    if extension in {".md", ".txt"}:
        return proofread_md(path)
    raise ValueError(f"Unsupported file type: {extension}")
# ── Metadata extraction ──────────────────────────────────────────
# Filename prefixes, tried in this order:
#   ARAR-YY-NNNN, "ערר NNNN-YY", and "ערר NNNN -" (no year available).
FILENAME_NUMBER_PATTERNS = [
    re.compile(r"^ARAR-(\d{2})-(\d{3,4})"),
    re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"),
    re.compile(r"^ערר\s+(\d{3,4})\s*-"),
]
# Two case numbers joined by "+", searched anywhere in the stem.
LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})")


def decision_number_from_filename(stem: str) -> str | None:
    """Derive a ``NUMBER/YY`` decision number from a filename stem.

    The three prefix patterns are tried in order against the start of the
    stem; when the year is not present in the name it is rendered as ``??``.
    If none matches, a ``NNN+NNN`` pair anywhere in the stem yields
    ``NNN+NNN/??``. Returns ``None`` when nothing matches.
    """
    # One renderer per prefix pattern, paired by position.
    renderers = (
        lambda groups: f"{groups[1]}/{groups[0]}",
        lambda groups: f"{groups[0]}/{groups[1]}",
        lambda groups: f"{groups[0]}/??",
    )
    for pattern, render in zip(FILENAME_NUMBER_PATTERNS, renderers):
        found = pattern.match(stem)
        if found:
            return render(found.groups())

    legacy = LEGACY_MULTI_PATTERN.search(stem)
    if legacy:
        first, second = legacy.groups()
        return f"{first}+{second}/??"
    return None
# Month names, with and without the "ב" prefix, mapped to month numbers.
HEBREW_MONTHS = {
    "ינואר": 1, "בינואר": 1,
    "פברואר": 2, "בפברואר": 2,
    "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3,
    "אפריל": 4, "באפריל": 4,
    "מאי": 5, "במאי": 5,
    "יוני": 6, "ביוני": 6,
    "יולי": 7, "ביולי": 7,
    "אוגוסט": 8, "באוגוסט": 8,
    "ספטמבר": 9, "בספטמבר": 9,
    "אוקטובר": 10, "באוקטובר": 10,
    "נובמבר": 11, "בנובמבר": 11,
    "דצמבר": 12, "בדצמבר": 12,
}
# "<day> <month name> <4-digit year>", with optional "," or "." before the year.
DATE_RE = re.compile(
    r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})"
)
# "ניתנ/ניתנה/ניתנו" followed by whitespace and an optional qualifier phrase.
NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?")


def decision_date_from_text(text: str) -> str | None:
    """Find the decision date near the end of the text.

    Only the last 2500 characters are inspected. A date is first looked for
    after the first ``NITNA_RE`` match in that tail; if there is no such match
    or no date follows it, the first date anywhere in the tail is used.

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or ``None`` when no date is
        found or the day/month/year do not form a valid calendar date.
    """
    tail = text[-2500:]

    anchor = NITNA_RE.search(tail)
    scope = tail if anchor is None else tail[anchor.start():]
    found = DATE_RE.search(scope) or DATE_RE.search(tail)
    if found is None:
        return None

    day_text, month_name, year_text = found.groups()
    day = int(day_text)
    year = int(year_text)
    month = HEBREW_MONTHS.get(month_name)
    if not month:
        return None

    try:
        parsed = date_type(year, month, day)
    except ValueError:
        # e.g. day 31 in a 30-day month
        return None
    return parsed.isoformat()
def finalize_decision_number(number: str | None, date_iso: str | None) -> str:
    """Fill in the missing parts of a decision number from the decision date.

    * No number: ``??/YY`` when a date is known, otherwise an empty string.
    * Number ending in ``/??``: the placeholder becomes ``/YY`` when a date is
      known, otherwise it is removed.
    * Anything else is returned as is.

    ``YY`` is characters 2-3 of the ISO date string.
    """
    if number and not number.endswith("/??"):
        return number

    year = date_iso[2:4] if date_iso else None
    if not number:
        return f"??/{year}" if year is not None else ""

    replacement = f"/{year}" if year is not None else ""
    return number.replace("/??", replacement)
def categorize(text: str) -> list[str]:
    """Guess subject categories from keyword hits in the decision text.

    Some categories look at the whole text, some only at the first 2000
    characters (the "opening"), and some combine an occurrence count over the
    whole text with a hit in the opening. Whenever a permit-related category
    is found, the general "בנייה" category is appended as well.

    Returns:
        Category labels in detection order; ``["בנייה"]`` when nothing matched.
    """
    head = text[:2000]
    labels: list[str] = []

    def seen(pattern: str, where: str) -> bool:
        return re.search(pattern, where) is not None

    def count(pattern: str) -> int:
        return len(re.findall(pattern, text))

    if seen(r'תמ[״"\']?א\s*38|תמא\s*38', text):
        labels.append('תמ"א 38')

    levy = r"היטל(?:י)?\s+השבחה"
    if count(levy) >= 3 or seen(levy, head):
        labels.append("היטל השבחה")

    section_197 = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197"
    if count(section_197) >= 2 or seen(section_197, head):
        labels.append("פיצויים 197")

    if "שימוש חורג" in head or text.count("שימוש חורג") >= 3:
        labels.append("שימוש חורג")

    # Unlike the categories above, this one needs BOTH repetition and an
    # occurrence in the opening.
    relief = r"\bהקלה\b|\bהקלות\b"
    if seen(relief, head) and count(relief) >= 3:
        labels.append("הקלה")

    if seen(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", text):
        labels.append("חלוקה")

    plan = (
        r"הפקדת\s+ה?תכנית|אישור\s+ה?תכנית|המלצה\s+להפקיד|"
        r"להפקיד\s+את\s+ה?תכנית|לדון\s+בתכנית|דנה\s+בתכנית|"
        r"החלטה\s+לאשר\s+ה?תכנית"
    )
    if seen(plan, head):
        labels.append("תכנית")

    if seen(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה", head):
        labels.append("היתר")

    permit_related = any(tag in labels for tag in ("היתר", "הקלה", 'תמ"א 38'))
    if permit_related and "בנייה" not in labels:
        labels.append("בנייה")

    return labels if labels else ["בנייה"]
async def analyze_file(path: Path) -> dict[str, Any]:
    """Proofread a file and collect its metadata for review.

    The file is cleaned with ``proofread``; the decision number comes from the
    filename stem (completed with the year of the detected date), and the date
    and categories come from the cleaned text.

    Returns:
        A dict with the filename, the full clean text, a 500-character
        preview, the decision number, the ISO decision date (empty string when
        not found), the subject categories, the proofreading stats and the
        character count of the clean text.
    """
    clean_text, stats = await proofread(path)

    raw_number = decision_number_from_filename(path.stem)
    decision_date = decision_date_from_text(clean_text)
    decision_number = finalize_decision_number(raw_number, decision_date)
    categories = categorize(clean_text)

    report: dict[str, Any] = {
        "filename": path.name,
        "clean_text": clean_text,
        "preview": clean_text[:500],
        "decision_number": decision_number,
        "decision_date": decision_date or "",
        "subject_categories": categories,
        "stats": stats,
        "chars": len(clean_text),
    }
    return report