From 32f18de049a2536727fcfdee8099d98fd28cbe83 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 11 Apr 2026 11:04:58 +0000 Subject: [PATCH] Add training corpus UI with Nevo proofreading pipeline - New proofreader service strips Nevo editorial additions (front matter, postamble, page headers, watermarks, inline codes) from DOCX/PDF/MD - PDF pages use Google Vision OCR for clean Hebrew RTL extraction - New training page at #/training with drag-and-drop upload, automatic metadata extraction (decision number, date, categories), reviewable preview, and style pattern report grouped by type - API endpoints: /api/training/{analyze,upload,corpus,patterns, analyze-style,analyze-style/status} - Fix claude_session.query to pipe prompt via stdin, avoiding ARG_MAX overflow when analyzing 900K+ char corpus - CLI scripts for batch proofreading and corpus upload Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/legal_mcp/services/claude_session.py | 11 +- .../src/legal_mcp/services/proofreader.py | 404 +++++++++++++ scripts/batch_upload_training.py | 349 +++++++++++ scripts/proofread_training_corpus.py | 382 ++++++++++++ web/app.py | 257 +++++++- web/static/index.html | 560 ++++++++++++++++++ 6 files changed, 1960 insertions(+), 3 deletions(-) create mode 100644 mcp-server/src/legal_mcp/services/proofreader.py create mode 100644 scripts/batch_upload_training.py create mode 100644 scripts/proofread_training_corpus.py diff --git a/mcp-server/src/legal_mcp/services/claude_session.py b/mcp-server/src/legal_mcp/services/claude_session.py index 01b3390..99368cb 100644 --- a/mcp-server/src/legal_mcp/services/claude_session.py +++ b/mcp-server/src/legal_mcp/services/claude_session.py @@ -24,6 +24,9 @@ LONG_TIMEOUT = 300 # For complex tasks like block writing def query(prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1) -> str: """Send a prompt to Claude Code headless and return the text response. 
+ Passes the prompt via stdin (not argv) to avoid the OS ARG_MAX limit — + prompts can be 500K+ chars when analyzing a full style corpus. + Args: prompt: The prompt to send. timeout: Max seconds to wait. @@ -36,14 +39,18 @@ def query(prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1) -> st RuntimeError: If claude CLI is not available or fails. """ cmd = [ - "claude", "-p", prompt, + "claude", "-p", "--output-format", "json", "--max-turns", str(max_turns), ] try: result = subprocess.run( - cmd, capture_output=True, text=True, timeout=timeout, + cmd, + input=prompt, + capture_output=True, + text=True, + timeout=timeout, ) except FileNotFoundError: raise RuntimeError("Claude CLI not found. Install Claude Code or add 'claude' to PATH.") diff --git a/mcp-server/src/legal_mcp/services/proofreader.py b/mcp-server/src/legal_mcp/services/proofreader.py new file mode 100644 index 0000000..e1db884 --- /dev/null +++ b/mcp-server/src/legal_mcp/services/proofreader.py @@ -0,0 +1,404 @@ +"""Nevo proofreading service for training corpus. + +Strips Nevo editorial additions (front matter, back matter, page headers, +watermarks, inline watermark codes) from legal decision DOCX/PDF/MD files. + +Also extracts metadata (decision number, date, subject categories) via +heuristics on cleaned text. 
+ +Used by: + * CLI script: scripts/proofread_training_corpus.py + * Web API: /api/training/analyze +""" + +from __future__ import annotations + +import asyncio +import re +import time +from datetime import date as date_type +from pathlib import Path +from typing import Any + +import fitz +from docx import Document +from google.cloud import vision + +from legal_mcp import config + +# ── Nevo pattern detection ──────────────────────────────────────── + +NEVO_PREAMBLE_HEADERS = ( + "ספרות:", + "חקיקה שאוזכרה:", + "מיני-רציו:", +) + +DECISION_OPENING = re.compile( + r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)" +) + +DECISION_SECTION_HEADERS = { + "רקע", + "פתח דבר", + "תמצית טענות הצדדים", + "העובדות", + "הרקע העובדתי", + "מבוא", +} + +NEVO_POSTAMBLE_MARKERS = ( + "5129371512937154678313", + "בעניין עריכה ושינויים במסמכי פסיקה", + "נוסח מסמך זה כפוף לשינויי ניסוח ועריכה", +) + +NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*") + +PDF_PAGE_HEADER_RE = re.compile( + r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*" +) +PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$") +PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.*]?\s*$") +NEVO_URL_RE = re.compile( + r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)", + re.IGNORECASE, +) + +_FOOTER_JUNK_RE = re.compile( + r"^(" + r"\s*|" + r"[-·*.\"\'׳״]+|" + r"\d{1,3}[\s\-·*.\"\'׳״]*|" + r"עמוד[\s\d\-·*.\"\'׳״]*|" + r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}" + r")$" +) + +# Hebrew abbreviation quote fixes — Google Vision renders ״ as 'יי' +_HEBREW_ABBREV_FIXES: dict[str, str] = { + "עוהייד": 'עוה"ד', "עוייד": 'עו"ד', "הנייל": 'הנ"ל', "מצייב": 'מצ"ב', + "ביהמייש": 'ביהמ"ש', "תייז": 'ת"ז', "עייי": 'ע"י', "אחייכ": 'אח"כ', + "סייק": 'ס"ק', "דייר": 'ד"ר', "חווייד": 'חוו"ד', "מייר": 'מ"ר', + "יחייד": 'יח"ד', "בייכ": 'ב"כ', "בייה": 'ב"ה', "שייח": 'ש"ח', + "יוייר": 'יו"ר', "בליימ": 'בל"מ', "תבייע": 'תב"ע', "תמייא": 'תמ"א', + 
"סייה": 'ס"ה', "שייפ": 'ש"פ', "שצייפ": 'שצ"פ', "שבייצ": 'שב"צ', + "עסיים": 'עס"ם', "הייה": 'ה"ה', "פסייד": 'פס"ד', "תיידא": 'תיד"א', + "בגייץ": 'בג"ץ', "עתיים": 'עת"ם', "עעיים": 'עע"ם', + "כייא": 'כ"א', "כייב": 'כ"ב', "כייג": 'כ"ג', "כייד": 'כ"ד', + "כייה": 'כ"ה', "כייו": 'כ"ו', "כייז": 'כ"ז', "כייח": 'כ"ח', "כייט": 'כ"ט', + "לייא": 'ל"א', + "יייא": 'י"א', "יייב": 'י"ב', "יייג": 'י"ג', "יייד": 'י"ד', + "טייו": 'ט"ו', "טייז": 'ט"ז', "יייז": 'י"ז', "יייח": 'י"ח', "יייט": 'י"ט', + "תשפייא": 'תשפ"א', "תשפייב": 'תשפ"ב', "תשפייג": 'תשפ"ג', + "תשפייד": 'תשפ"ד', "תשפייה": 'תשפ"ה', "תשפייו": 'תשפ"ו', + "תשפיין": 'תשפ"ן', +} +_ABBREV_PATTERN = re.compile( + "|".join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True)) +) + + +def _fix_hebrew_quotes(text: str) -> str: + return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text) + + +# ── Google Vision OCR ──────────────────────────────────────────── + +_vision_client: vision.ImageAnnotatorClient | None = None + + +def _get_vision_client() -> vision.ImageAnnotatorClient: + global _vision_client + if _vision_client is None: + if not config.GOOGLE_CLOUD_VISION_API_KEY: + raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set") + _vision_client = vision.ImageAnnotatorClient( + client_options={"api_key": config.GOOGLE_CLOUD_VISION_API_KEY} + ) + return _vision_client + + +def _ocr_page_image(image_bytes: bytes, page_num: int) -> str: + client = _get_vision_client() + image = vision.Image(content=image_bytes) + response = client.document_text_detection( + image=image, + image_context=vision.ImageContext(language_hints=["he"]), + ) + if response.error.message: + raise RuntimeError(f"Vision error page {page_num}: {response.error.message}") + text = response.full_text_annotation.text if response.full_text_annotation else "" + return _fix_hebrew_quotes(text) + + +# ── DOCX proofreading ──────────────────────────────────────────── + + +def _find_decision_start(paragraphs: list[str]) -> int: + 
"""Find first real decision paragraph, skipping Nevo preamble.""" + has_nevo_preamble = any( + any(p.startswith(h) for h in NEVO_PREAMBLE_HEADERS) for p in paragraphs[:10] + ) + if not has_nevo_preamble: + return 0 + + for i, p in enumerate(paragraphs): + stripped = p.strip() + if stripped in DECISION_SECTION_HEADERS: + return i + if DECISION_OPENING.match(stripped): + return i + + for i, p in enumerate(paragraphs): + if "קבעה כלהלן" in p or "קבעה את הדברים הבאים" in p: + for j in range(i + 1, min(i + 15, len(paragraphs))): + if len(paragraphs[j]) > 80 and not paragraphs[j].strip().startswith("*"): + return j + break + + return min(10, len(paragraphs) - 1) + + +def _find_decision_end(paragraphs: list[str]) -> int: + """First paragraph that is a Nevo postamble marker (exclusive end).""" + for i, p in enumerate(paragraphs): + for marker in NEVO_POSTAMBLE_MARKERS: + if marker in p: + return i + return len(paragraphs) + + +def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]: + out: list[str] = [] + for p in paragraphs: + stripped = NEVO_INLINE_CODE_RE.sub("", p).strip() + if stripped: + out.append(stripped) + return out + + +def proofread_docx(path: Path) -> tuple[str, dict]: + """Extract clean decision text from Nevo DOCX. 
Returns (markdown, stats).""" + doc = Document(str(path)) + paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] + + start = _find_decision_start(paragraphs) + end = _find_decision_end(paragraphs) + + clean = _strip_inline_nevo_codes(paragraphs[start:end]) + md = "\n\n".join(clean) + + return md, { + "source_type": "docx", + "total_paragraphs": len(paragraphs), + "preamble_stripped": start, + "postamble_stripped": len(paragraphs) - end, + "clean_paragraphs": len(clean), + } + + +# ── PDF proofreading ───────────────────────────────────────────── + + +def _clean_page_text(text: str) -> str: + text = PDF_PAGE_HEADER_RE.sub("\n", text) + + lines = text.split("\n") + while lines and _FOOTER_JUNK_RE.match(lines[-1].strip()): + lines.pop() + text = "\n".join(lines) + + text = NEVO_URL_RE.sub("", text) + text = PDF_PAGE_NUM_LINE_RE.sub("", text) + text = PDF_PAGE_ORPHAN_RE.sub("", text) + + return text.strip() + + +async def proofread_pdf(path: Path) -> tuple[str, dict]: + """Extract clean decision text from Nevo PDF via Google Vision OCR.""" + doc = fitz.open(str(path)) + pages: list[str] = [] + for i, page in enumerate(doc): + pix = page.get_pixmap(dpi=300) + img_bytes = pix.tobytes("png") + text = await asyncio.to_thread(_ocr_page_image, img_bytes, i + 1) + pages.append(_clean_page_text(text)) + await asyncio.sleep(0.1) + doc.close() + + body = "\n\n".join(p for p in pages if p) + body = re.sub(r"\n{3,}", "\n\n", body) + body = re.sub(r"[ \t]+\n", "\n", body) + + for marker in NEVO_POSTAMBLE_MARKERS: + idx = body.find(marker) + if idx != -1: + body = body[:idx].rstrip() + break + + return body, { + "source_type": "pdf", + "pages": len(pages), + "chars": len(body), + } + + +# ── MD/TXT passthrough ─────────────────────────────────────────── + + +def proofread_md(path: Path) -> tuple[str, dict]: + """Plain text passthrough for already-clean .md/.txt files.""" + text = path.read_text(encoding="utf-8") + return text, {"source_type": "md", "chars": len(text)} + + 
+async def proofread(path: Path) -> tuple[str, dict]: + """Proofread a file based on its extension. Returns (clean_text, stats).""" + suffix = path.suffix.lower() + if suffix == ".docx": + return proofread_docx(path) + if suffix == ".pdf": + return await proofread_pdf(path) + if suffix in (".md", ".txt"): + return proofread_md(path) + raise ValueError(f"Unsupported file type: {suffix}") + + +# ── Metadata extraction ────────────────────────────────────────── + +FILENAME_NUMBER_PATTERNS = [ + re.compile(r"^ARAR-(\d{2})-(\d{3,4})"), + re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"), + re.compile(r"^ערר\s+(\d{3,4})\s*-"), +] +LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})") + + +def decision_number_from_filename(stem: str) -> str | None: + """Extract NUMBER/YY from a filename stem.""" + m = FILENAME_NUMBER_PATTERNS[0].match(stem) + if m: + return f"{m.group(2)}/{m.group(1)}" + m = FILENAME_NUMBER_PATTERNS[1].match(stem) + if m: + return f"{m.group(1)}/{m.group(2)}" + m = FILENAME_NUMBER_PATTERNS[2].match(stem) + if m: + return f"{m.group(1)}/??" + m = LEGACY_MULTI_PATTERN.search(stem) + if m: + return f"{m.group(1)}+{m.group(2)}/??" 
+ return None + + +HEBREW_MONTHS = { + "ינואר": 1, "בינואר": 1, "פברואר": 2, "בפברואר": 2, + "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3, + "אפריל": 4, "באפריל": 4, "מאי": 5, "במאי": 5, + "יוני": 6, "ביוני": 6, "יולי": 7, "ביולי": 7, + "אוגוסט": 8, "באוגוסט": 8, "ספטמבר": 9, "בספטמבר": 9, + "אוקטובר": 10, "באוקטובר": 10, "נובמבר": 11, "בנובמבר": 11, + "דצמבר": 12, "בדצמבר": 12, +} +DATE_RE = re.compile( + r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})" +) +NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?") + + +def decision_date_from_text(text: str) -> str | None: + tail = text[-2500:] if len(text) > 2500 else text + nitna_match = NITNA_RE.search(tail) + search_text = tail[nitna_match.start():] if nitna_match else tail + m = DATE_RE.search(search_text) + if not m: + m = DATE_RE.search(tail) + if not m: + return None + day = int(m.group(1)) + month = HEBREW_MONTHS.get(m.group(2)) + year = int(m.group(3)) + if not month: + return None + try: + return date_type(year, month, day).isoformat() + except ValueError: + return None + + +def finalize_decision_number(number: str | None, date_iso: str | None) -> str: + if not number: + return f"??/{date_iso[2:4]}" if date_iso else "" + if number.endswith("/??"): + return number.replace("/??", f"/{date_iso[2:4]}") if date_iso else number.replace("/??", "") + return number + + +def categorize(text: str) -> list[str]: + """Heuristic subject category detection based on opening + repetition.""" + opening = text[:2000] + t = text + + cats: list[str] = [] + + if re.search(r'תמ[״"\']?א\s*38|תמא\s*38', t): + cats.append('תמ"א 38') + + if len(re.findall(r"היטל(?:י)?\s+השבחה", t)) >= 3 or re.search(r"היטל(?:י)?\s+השבחה", opening): + cats.append("היטל השבחה") + + p197_re = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197" + if len(re.findall(p197_re, t)) >= 2 or re.search(p197_re, opening): + cats.append("פיצויים 197") + + if t.count("שימוש 
חורג") >= 3 or "שימוש חורג" in opening: + cats.append("שימוש חורג") + + if len(re.findall(r"\bהקלה\b|\bהקלות\b", t)) >= 3 and re.search(r"\bהקלה\b|\bהקלות\b", opening): + cats.append("הקלה") + + if re.search(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", t): + cats.append("חלוקה") + + if re.search( + r"הפקדת\s+ה?תכנית|אישור\s+ה?תכנית|המלצה\s+להפקיד|" + r"להפקיד\s+את\s+ה?תכנית|לדון\s+בתכנית|דנה\s+בתכנית|" + r"החלטה\s+לאשר\s+ה?תכנית", + opening, + ): + cats.append("תכנית") + + if re.search(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה", opening): + cats.append("היתר") + + has_permit_subject = "היתר" in cats or "הקלה" in cats or 'תמ"א 38' in cats + if has_permit_subject and "בנייה" not in cats: + cats.append("בנייה") + + return cats or ["בנייה"] + + +async def analyze_file(path: Path) -> dict[str, Any]: + """Proofread a file and extract metadata for review. + + Returns a dict suitable for UI preview with: clean text, metadata, + stats, and a short text preview for visual verification. + """ + clean_text, stats = await proofread(path) + num_raw = decision_number_from_filename(path.stem) + d_iso = decision_date_from_text(clean_text) + number = finalize_decision_number(num_raw, d_iso) + cats = categorize(clean_text) + + return { + "filename": path.name, + "clean_text": clean_text, + "preview": clean_text[:500], + "decision_number": number, + "decision_date": d_iso or "", + "subject_categories": cats, + "stats": stats, + "chars": len(clean_text), + } diff --git a/scripts/batch_upload_training.py b/scripts/batch_upload_training.py new file mode 100644 index 0000000..6013196 --- /dev/null +++ b/scripts/batch_upload_training.py @@ -0,0 +1,349 @@ +"""Batch upload proofread training corpus to style DB. 
+ +Two-phase workflow: + --preview Extract metadata from all .md files, print review table, don't upload + --upload Actually upload all files (with optional --only FILE to run one) + +Metadata extraction: + * decision_number: from filename (ARAR-YY-NNNN / ערר NNNN-YY) or decision date year + * decision_date: from the signature line near the end of the text, e.g. "ניתנה ... 27 בנובמבר 2025" + * categories: keyword heuristics on body text +""" + +from __future__ import annotations + +import argparse +import asyncio +import os +import re +import sys +from pathlib import Path + +PROOFREAD_DIR = Path("/home/chaim/legal-ai/data/training/proofread") + +# Manual metadata overrides for files where auto-extraction can't determine values. +METADATA_OVERRIDES: dict[str, dict] = { + "ARAR-25-1067 - יחיעם יפה ואח׳.md": { + "decision_date": "2025-11-27", # no "ניתנה" signature in file; user-provided + }, +} + +# Files to skip — already in style_corpus from legacy ingestion +# (verified by exact character-count match with existing DB rows). +SKIP_FILES = { + "תמא 38-בית הכרם-1126+1141-החלטה.md", # → corpus: 1126/1141 + "היתר בניה-בית שמש-1180+1181-החלטה.md", # → corpus: 1180/1181 + "היתר בניה-הראל-1043+1054-החלטה.md", # → corpus: 1043/1054 + "היתר בניה-הראל-1071+1077-החלטה.md", # → corpus: 1071/1077 +} + +# Load env vars needed by mcp-server +ENV_FILE = Path.home() / ".env" +if ENV_FILE.exists(): + for line in ENV_FILE.read_text().splitlines(): + if "=" in line and not line.startswith("#"): + k, v = line.split("=", 1) + os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'")) + +# Make mcp-server package importable +sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src") + + +# ── Decision number extraction ─────────────────────────────────── + +FILENAME_NUMBER_PATTERNS = [ + # ARAR-YY-NNNN[-X] - title.md + re.compile(r"^ARAR-(\d{2})-(\d{3,4})"), + # ערר NNNN-YY title.md or ערר NNNN-YY title + re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"), + # ערר NNNN - title (no year in filename — needs date lookup) + 
re.compile(r"^ערר\s+(\d{3,4})\s*-"), +] + +LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})") + + +def decision_number_from_filename(stem: str) -> tuple[str | None, str | None]: + """Return (number, year_short) or (multi_number, None) or (None, None). + + year_short is YY (last 2 digits) if extractable from filename. + For legacy files with 'NNNN+NNNN' or no year, returns partial info + that must be completed from decision date. + """ + # ARAR-YY-NNNN + m = FILENAME_NUMBER_PATTERNS[0].match(stem) + if m: + year, num = m.group(1), m.group(2) + return f"{num}/{year}", year + + # ערר NNNN-YY + m = FILENAME_NUMBER_PATTERNS[1].match(stem) + if m: + num, year = m.group(1), m.group(2) + return f"{num}/{year}", year + + # ערר NNNN - title (no year) + m = FILENAME_NUMBER_PATTERNS[2].match(stem) + if m: + num = m.group(1) + return f"{num}/??", None + + # Legacy: "NNNN+NNNN" merged decisions + m = LEGACY_MULTI_PATTERN.search(stem) + if m: + return f"{m.group(1)}+{m.group(2)}/??", None + + return None, None + + +# ── Decision date extraction ───────────────────────────────────── + +HEBREW_MONTHS = { + "ינואר": 1, "בינואר": 1, + "פברואר": 2, "בפברואר": 2, + "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3, + "אפריל": 4, "באפריל": 4, + "מאי": 5, "במאי": 5, + "יוני": 6, "ביוני": 6, + "יולי": 7, "ביולי": 7, + "אוגוסט": 8, "באוגוסט": 8, + "ספטמבר": 9, "בספטמבר": 9, + "אוקטובר": 10, "באוקטובר": 10, + "נובמבר": 11, "בנובמבר": 11, + "דצמבר": 12, "בדצמבר": 12, +} + +# Matches "27 בנובמבר, 2025" or "27 נובמבר 2025": day, month name (with or without the ב prefix), an optional comma or period, then a 4-digit year +DATE_RE = re.compile( + r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})" +) + +NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?") + + +def decision_date_from_text(text: str) -> str | None: + """Extract decision date in YYYY-MM-DD format from 'ניתנה... DATE' section. + + Searches the last 2500 chars, where the signing block lives. 
+ """ + tail = text[-2500:] if len(text) > 2500 else text + + # Prefer dates near "ניתנה" marker + nitna_match = NITNA_RE.search(tail) + search_text = tail[nitna_match.start():] if nitna_match else tail + + m = DATE_RE.search(search_text) + if not m: + # Fall back: search whole tail + m = DATE_RE.search(tail) + if not m: + return None + + day = int(m.group(1)) + month = HEBREW_MONTHS.get(m.group(2)) + year = int(m.group(3)) + if not month: + return None + try: + from datetime import date + return date(year, month, day).isoformat() + except ValueError: + return None + + +# ── Subject category extraction ────────────────────────────────── + +# Categories as defined in the tool signature. +ALL_CATEGORIES = [ + "בנייה", "שימוש חורג", "תכנית", "היתר", "הקלה", + "חלוקה", 'תמ"א 38', "היטל השבחה", "פיצויים 197", +] + + +def categorize(text: str) -> list[str]: + """Heuristic category detection based on subject matter, not incidental mentions. + + Strategy: the real subject is established in the opening 2000 chars + (first decision-opening paragraph). Secondary signal is repetition count + — casual mentions in law citations don't repeat. 
+ """ + opening = text[:2000] # subject is stated up front + t = text + + cats: list[str] = [] + + # תמ"א 38 — very specific marker, single mention is fine + if re.search(r'תמ[״"\']?א\s*38|תמא\s*38', t): + cats.append('תמ"א 38') + + # היטל השבחה — require real engagement: must appear in opening OR 3+ times + hsbacha_count = len(re.findall(r"היטל(?:י)?\s+השבחה", t)) + if hsbacha_count >= 3 or re.search(r"היטל(?:י)?\s+השבחה", opening): + cats.append("היטל השבחה") + + # פיצויים 197 — require multiple mentions OR in opening + p197_re = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197" + p197_count = len(re.findall(p197_re, t)) + if p197_count >= 2 or re.search(p197_re, opening): + cats.append("פיצויים 197") + + # שימוש חורג — must appear in opening OR 3+ times (avoids law-quote false positives) + shimush_count = t.count("שימוש חורג") + if shimush_count >= 3 or "שימוש חורג" in opening: + cats.append("שימוש חורג") + + # הקלה — real subject if 3+ mentions AND appears in opening + hakala_count = len(re.findall(r"\bהקלה\b|\bהקלות\b", t)) + if hakala_count >= 3 and re.search(r"\bהקלה\b|\bהקלות\b", opening): + cats.append("הקלה") + + # חלוקה — "איחוד וחלוקה" or "חלוקה חדשה" (specific phrases) + if re.search(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", t): + cats.append("חלוקה") + + # תכנית — plan-level appeal (primary subject). Allow ה/ב/ל prefixes on תכנית. 
tochnit_opening = bool(re.search( + r"הפקדת\s+ה?תכנית|" + r"אישור\s+ה?תכנית|" + r"המלצה\s+להפקיד|" + r"להפקיד\s+את\s+ה?תכנית|" + r"לדון\s+בתכנית|" + r"דנה\s+בתכנית|" + r"החלטה\s+לאשר\s+ה?תכנית", + opening, + )) + if tochnit_opening: + cats.append("תכנית") + + # היתר — "בקשה להיתר" or "היתר בניה" as subject in opening + if re.search(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה", opening): + cats.append("היתר") + + # בנייה — default/fallback for building-permit cases + # (not for plan-level תכנית-only cases) + has_permit_subject = "היתר" in cats or "הקלה" in cats or 'תמ"א 38' in cats + if has_permit_subject and "בנייה" not in cats: + cats.append("בנייה") + + # If nothing matched, default to בנייה + return cats or ["בנייה"] + + +# ── Year fallback from date ────────────────────────────────────── + + +def finalize_decision_number(number: str | None, date_iso: str | None) -> str: + """If filename number is missing year, fill it from decision date.""" + if not number: + if date_iso: + # No case number in the filename: keep "??" and take YY from the Gregorian year of the ISO date (YYYY-MM-DD); no Hebrew-calendar conversion is involved + return f"??/{date_iso[2:4]}" + return "" + if number.endswith("/??"): + if date_iso: + yy = date_iso[2:4] + return number.replace("/??", f"/{yy}") + return number.replace("/??", "") + return number + + +# ── Main metadata extraction ───────────────────────────────────── + + +def extract_metadata(path: Path) -> dict: + text = path.read_text(encoding="utf-8") + num_from_name, _ = decision_number_from_filename(path.stem) + date_iso = decision_date_from_text(text) + decision_number = finalize_decision_number(num_from_name, date_iso) + cats = categorize(text) + meta = { + "file": path.name, + "decision_number": decision_number, + "decision_date": date_iso or "??", + "categories": cats, + "chars": len(text), + } + # Apply manual overrides + if path.name in METADATA_OVERRIDES: + meta.update(METADATA_OVERRIDES[path.name]) + return meta + + +def print_preview(results: list[dict]) -> None: + """Print review table of metadata for all files.""" + 
print(f"\n{'#':<3} {'FILE':<55} {'NUMBER':<15} {'DATE':<12} {'CATEGORIES'}") + print("-" * 130) + for i, r in enumerate(results, 1): + file_short = r["file"] if len(r["file"]) <= 53 else r["file"][:50] + "..." + cats = ", ".join(r["categories"]) + print(f"{i:<3} {file_short:<55} {r['decision_number']:<15} {r['decision_date']:<12} {cats}") + print() + # Highlight issues + issues = [r for r in results if r["decision_date"] == "??" or not r["decision_number"] or "??" in r["decision_number"]] + if issues: + print(f"⚠️ {len(issues)} files with incomplete metadata:") + for r in issues: + print(f" - {r['file']} → number={r['decision_number']!r} date={r['decision_date']!r}") + + +# ── Upload ─────────────────────────────────────────────────────── + + +async def upload_one(meta: dict) -> dict: + from legal_mcp.tools.documents import document_upload_training + + path = PROOFREAD_DIR / meta["file"] + result = await document_upload_training( + file_path=str(path), + decision_number=meta["decision_number"], + decision_date=meta["decision_date"] if meta["decision_date"] != "??" 
else "", + subject_categories=meta["categories"], + title=path.stem, + ) + return {"file": meta["file"], "result": result} + + +async def upload_all(results: list[dict]) -> None: + for i, meta in enumerate(results, 1): + try: + r = await upload_one(meta) + print(f"[{i}/{len(results)}] ✓ {meta['file']}") + print(f" {r['result'][:200]}") + except Exception as e: + print(f"[{i}/{len(results)}] ✗ {meta['file']}: {e}") + + +# ── CLI ────────────────────────────────────────────────────────── + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--preview", action="store_true", help="Show metadata table without uploading") + ap.add_argument("--upload", action="store_true", help="Upload all files to style corpus") + ap.add_argument("--only", help="Only process this specific filename") + args = ap.parse_args() + + files = sorted(PROOFREAD_DIR.glob("*.md")) + files = [f for f in files if f.name not in SKIP_FILES] + if args.only: + files = [f for f in files if f.name == args.only] + if not files: + print(f"File not found: {args.only}") + return 1 + + results = [extract_metadata(f) for f in files] + + if args.preview or not args.upload: + print_preview(results) + if not args.upload: + return 0 + + if args.upload: + print(f"\n>>> Uploading {len(results)} files to style corpus...\n") + asyncio.run(upload_all(results)) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/proofread_training_corpus.py b/scripts/proofread_training_corpus.py new file mode 100644 index 0000000..96dcbe4 --- /dev/null +++ b/scripts/proofread_training_corpus.py @@ -0,0 +1,382 @@ +"""Proofread training corpus: strip Nevo additions from DOCX/PDF, output clean Markdown. + +Nevo DOCX additions: + Front: ספרות / חקיקה שאוזכרה / מיני-רציו / topic tags / Nevo summary paragraphs + Back: 5129371512937154678313 / "בעניין עריכה ושינויים" link / "54678313-..." 
/ "נוסח מסמך זה כפוף" + +Nevo PDF additions: + "עמוד X מתוך Y" header on every page + +PDF text extraction uses Google Cloud Vision OCR — PyMuPDF fragments Hebrew RTL +text unusably (words split mid-word, reading order broken). OCR gives clean output. +""" + +from __future__ import annotations + +import io +import os +import re +import sys +import time +from pathlib import Path + +import fitz +from docx import Document + +# Load GOOGLE_CLOUD_VISION_API_KEY from ~/.env if not already set +if not os.environ.get("GOOGLE_CLOUD_VISION_API_KEY"): + env_path = Path.home() / ".env" + if env_path.exists(): + for line in env_path.read_text().splitlines(): + if line.startswith("GOOGLE_CLOUD_VISION_API_KEY="): + os.environ["GOOGLE_CLOUD_VISION_API_KEY"] = line.split("=", 1)[1].strip().strip('"').strip("'") + break + +from google.cloud import vision # noqa: E402 + +TRAINING_DIR = Path("/home/chaim/legal-ai/data/training") +OUTPUT_DIR = TRAINING_DIR / "proofread" +RAW_DIR = TRAINING_DIR / "raw" + +# ── Nevo pattern detection ──────────────────────────────────────── + +NEVO_PREAMBLE_HEADERS = ( + "ספרות:", + "חקיקה שאוזכרה:", + "מיני-רציו:", +) + +# Strong decision-opening patterns — highly distinctive first words of real decision +# body. These rarely appear inside Nevo's own summary block, so first match wins. +DECISION_OPENING = re.compile( + r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)" +) + +# Section headers that definitively mark decision body start. +DECISION_SECTION_HEADERS = { + "רקע", + "פתח דבר", + "תמצית טענות הצדדים", + "העובדות", + "הרקע העובדתי", + "מבוא", +} + +# Nevo postamble markers — everything from first match onwards is stripped. +NEVO_POSTAMBLE_MARKERS = ( + "5129371512937154678313", + "בעניין עריכה ושינויים במסמכי פסיקה", + "נוסח מסמך זה כפוף לשינויי ניסוח ועריכה", +) + +# Nevo inline watermark codes — appear as prefixes embedded in real paragraphs +# (e.g. "5129371ניתנה פה אחד" or "054678313האם ההיתר..."). 
These must be +# stripped from paragraph content, not used as postamble boundaries. +NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*") + +# Nevo PDF page header: "עמוד X מתוך Y" or "עמוד X בן Y" (Hebrew variants) +PDF_PAGE_HEADER_RE = re.compile( + r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*" +) +# Short orphan lines starting with "עמוד" — OCR artifacts from merged footer text +# (e.g. "עמודירבי", "עמוד :", "עמודי", "עמוד ר"). Conservative: up to 12 chars. +PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$") +# "עמוד" followed by number (with optional garbled Nevo URL line after) +PDF_PAGE_BLOCK_RE = re.compile( + r"(?m)^\s*עמוד\s*\n\s*\d+[·.]?\s*\n[^\n]*\n", re.UNICODE +) +# Standalone "עמוד N" at line start +PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.]?\s*$") +# Nevo watermark URL (and common OCR-garbled variants) +NEVO_URL_RE = re.compile( + r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)", + re.IGNORECASE, +) + + +def find_decision_start(paragraphs: list[str]) -> int: + """Find index of first real decision paragraph, skipping Nevo preamble. + + Strategy: + 1. If no Nevo headers present → start at 0. + 2. Otherwise, scan past Nevo headers; look for first paragraph matching + DECISION_OPENING regex or DECISION_SECTION_HEADERS. + 3. Fallback: first paragraph after "ועדת הערר ... קבעה כלהלן:" bullet block + that doesn't look like summary (heuristic: longer, has proper sentence). + """ + has_nevo_preamble = any( + any(p.startswith(h) for h in NEVO_PREAMBLE_HEADERS) for p in paragraphs[:10] + ) + if not has_nevo_preamble: + return 0 + + # Scan for strong decision-opening markers + for i, p in enumerate(paragraphs): + stripped = p.strip() + if stripped in DECISION_SECTION_HEADERS: + return i + if DECISION_OPENING.match(stripped): + return i + + # Fallback: find "ועדת הערר ... 
קבעה כלהלן" and take first long para after bullets + for i, p in enumerate(paragraphs): + if "קבעה כלהלן" in p or "קבעה את הדברים הבאים" in p: + # Skip summary paragraphs (Nevo typically has 3-8 of these) + for j in range(i + 1, min(i + 15, len(paragraphs))): + if len(paragraphs[j]) > 80 and not paragraphs[j].strip().startswith("*"): + # Check if this looks like real decision content + return j + break + + # Last resort: strip only the first 10 paragraphs of preamble + return min(10, len(paragraphs) - 1) + + +def find_decision_end(paragraphs: list[str]) -> int: + """Find exclusive end index: first paragraph that is a Nevo postamble marker.""" + for i, p in enumerate(paragraphs): + for marker in NEVO_POSTAMBLE_MARKERS: + if marker in p: + return i + return len(paragraphs) + + +# ── DOCX proofreading ───────────────────────────────────────────── + + +def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]: + """Remove Nevo inline watermark codes from paragraph prefixes; drop pure-code paras.""" + out: list[str] = [] + for p in paragraphs: + stripped = NEVO_INLINE_CODE_RE.sub("", p).strip() + if stripped: + out.append(stripped) + return out + + +def proofread_docx(path: Path) -> tuple[str, dict]: + """Extract clean decision text from Nevo DOCX. 
# Google Vision renders the Hebrew gershayim (״) inside abbreviations as two
# yods ('יי'). This table maps each known OCR spelling back to the intended
# abbreviation written with an ASCII double quote.
_HEBREW_ABBREV_FIXES: dict[str, str] = {
    "עוהייד": 'עוה"ד',
    "עוייד": 'עו"ד',
    "הנייל": 'הנ"ל',
    "מצייב": 'מצ"ב',
    "ביהמייש": 'ביהמ"ש',
    "תייז": 'ת"ז',
    "עייי": 'ע"י',
    "אחייכ": 'אח"כ',
    "סייק": 'ס"ק',
    "דייר": 'ד"ר',
    "חווייד": 'חוו"ד',
    "מייר": 'מ"ר',
    "יחייד": 'יח"ד',
    "בייכ": 'ב"כ',
    "בייה": 'ב"ה',
    "שייח": 'ש"ח',
    "יוייר": 'יו"ר',
    "בליימ": 'בל"מ',
    "תבייע": 'תב"ע',
    "תמייא": 'תמ"א',
    "סייה": 'ס"ה',
    "שייפ": 'ש"פ',
    "שצייפ": 'שצ"פ',
    "שבייצ": 'שב"צ',
    "עסיים": 'עס"ם',
    "הייה": 'ה"ה',
    "פסייד": 'פס"ד',
    "תיידא": 'תיד"א',
    "בגייץ": 'בג"ץ',
    "עתיים": 'עת"ם',
    "עעיים": 'עע"ם',
    # Hebrew calendar day numerals
    "כייא": 'כ"א',
    "כייב": 'כ"ב',
    "כייג": 'כ"ג',
    "כייד": 'כ"ד',
    "כייה": 'כ"ה',
    "כייו": 'כ"ו',
    "כייז": 'כ"ז',
    "כייח": 'כ"ח',
    "כייט": 'כ"ט',
    "לייא": 'ל"א',
    "יייא": 'י"א',
    "יייב": 'י"ב',
    "יייג": 'י"ג',
    "יייד": 'י"ד',
    "טייו": 'ט"ו',
    "טייז": 'ט"ז',
    "יייז": 'י"ז',
    "יייח": 'י"ח',
    "יייט": 'י"ט',
    # Hebrew calendar years
    "תשפייא": 'תשפ"א',
    "תשפייב": 'תשפ"ב',
    "תשפייג": 'תשפ"ג',
    "תשפייד": 'תשפ"ד',
    "תשפייה": 'תשפ"ה',
    "תשפייו": 'תשפ"ו',
    "תשפיין": 'תשפ"ן',
}

# One alternation over all OCR spellings. Longer keys come first so that a
# longer spelling is always tried before a shorter one at the same position.
_ABBREV_PATTERN = re.compile(
    "|".join(map(re.escape, sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True)))
)


def _fix_hebrew_quotes(text: str) -> str:
    """Replace OCR-garbled Hebrew abbreviations in *text* with their quoted form.

    Every occurrence of a key of ``_HEBREW_ABBREV_FIXES`` is substituted by
    its mapped value; text containing no key is returned unchanged.

    NOTE(review): matching is purely substring-based (no word boundaries), so
    a key embedded in a longer ordinary word is rewritten too, e.g. the
    'דייר' inside 'הדיירים'. Confirm whether that is acceptable for the corpus.
    """

    def _restore(match: re.Match) -> str:
        return _HEBREW_ABBREV_FIXES[match[0]]

    return _ABBREV_PATTERN.sub(_restore, text)
def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Each page is rendered to a 300-DPI PNG, OCR'd with ``_ocr_page_image``
    and cleaned with ``_clean_page_text``. The non-empty page texts are joined
    with blank lines, whitespace is normalised, and everything from the
    earliest occurrence of any ``NEVO_POSTAMBLE_MARKERS`` entry onward is
    discarded.

    Args:
        path: Location of the PDF file to process.

    Returns:
        A ``(body, stats)`` tuple. ``stats`` holds ``"pages"`` (number of
        pages processed) and ``"chars"`` (length of the final body).

    Raises:
        RuntimeError: Propagated from ``_ocr_page_image`` when the Vision API
            is not configured or reports an error for a page.
    """
    doc = fitz.open(str(path))
    pages: list[str] = []
    try:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            text = _ocr_page_image(pix.tobytes("png"), page_num)
            pages.append(_clean_page_text(text))
            # Small delay between API calls to be safe
            time.sleep(0.1)
    finally:
        # Release the PDF handle even when rendering or OCR raises mid-file.
        doc.close()

    body = "\n\n".join(p for p in pages if p)
    body = re.sub(r"\n{3,}", "\n\n", body)  # collapse runs of blank lines
    body = re.sub(r"[ \t]+\n", "\n", body)  # drop trailing spaces/tabs

    # Cut at the earliest marker position in the text, not at whichever
    # marker happens to be listed first; this matches find_decision_end,
    # which stops at the first paragraph containing any marker.
    positions = (body.find(marker) for marker in NEVO_POSTAMBLE_MARKERS)
    cut = min((pos for pos in positions if pos != -1), default=None)
    if cut is not None:
        body = body[:cut].rstrip()

    return body, {
        "pages": len(pages),
        "chars": len(body),
    }
p.suffix.lower() not in (".docx", ".pdf"): + continue + if only and p.name not in only: + continue + files.append(p) + + print(f"Processing {len(files)} files...\n") + + for path in files: + try: + if path.suffix.lower() == ".docx": + md, stats = proofread_docx(path) + else: + md, stats = proofread_pdf(path) + + out_path = OUTPUT_DIR / output_filename(path) + out_path.write_text(md, encoding="utf-8") + print(f"✓ {path.name}") + print(f" → {out_path.name} ({len(md):,} chars) {stats}") + except Exception as e: + print(f"✗ {path.name}: {e}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/web/app.py b/web/app.py index 76d15a6..46c9820 100644 --- a/web/app.py +++ b/web/app.py @@ -28,7 +28,7 @@ from pydantic import BaseModel import asyncpg from legal_mcp import config -from legal_mcp.services import chunker, db, embeddings, extractor, processor +from legal_mcp.services import chunker, db, embeddings, extractor, processor, proofreader from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools # Import integration clients (same directory) @@ -163,6 +163,261 @@ async def classify_file(req: ClassifyRequest): return {"task_id": task_id} +# ── Training Corpus: Analyze & Upload ───────────────────────────── + + +@app.post("/api/training/analyze") +async def training_analyze(filename: str = Form(...)): + """Proofread an uploaded file and extract metadata for review. + + Input: filename in UPLOAD_DIR (from /api/upload). + Output: clean text preview + extracted metadata (number, date, categories). 
class TrainingUploadRequest(BaseModel):
    """Reviewed metadata for one pending upload that should join the corpus."""

    filename: str  # name in UPLOAD_DIR
    decision_number: str = ""
    decision_date: str = ""  # YYYY-MM-DD
    subject_categories: list[str] = []
    title: str = ""


# Strong references to in-flight background jobs. The event loop keeps only
# weak references to tasks, so a task nobody else references may be
# garbage-collected before it finishes.
_training_upload_tasks: set[asyncio.Task] = set()


@app.post("/api/training/upload")
async def training_upload(req: TrainingUploadRequest):
    """Queue a pending upload for proofreading and insertion into the corpus.

    The heavy work (proofread, store, chunk, embed) runs in a background task
    (``_process_proofread_training``); this handler only validates the request
    and returns a task id whose state is tracked in ``_progress``.

    Args:
        req: File name inside ``UPLOAD_DIR`` plus the reviewed metadata.

    Returns:
        ``{"task_id": <uuid string>}``.

    Raises:
        HTTPException: 404 when the file does not exist directly inside
            ``UPLOAD_DIR``; 409 when ``decision_number`` is non-empty and a
            corpus row with that number already exists.
    """
    source = UPLOAD_DIR / req.filename
    # The parent check rejects names that resolve outside the upload folder.
    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    # Check for duplicate by decision_number
    if req.decision_number:
        pool = await db.get_pool()
        async with pool.acquire() as conn:
            exists = await conn.fetchval(
                "SELECT 1 FROM style_corpus WHERE decision_number = $1 LIMIT 1",
                req.decision_number,
            )
        if exists:
            raise HTTPException(
                409,
                f"החלטה {req.decision_number} כבר קיימת בקורפוס",
            )

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    # Keep the task alive until it completes, then drop the reference.
    task = asyncio.create_task(_process_proofread_training(task_id, source, req))
    _training_upload_tasks.add(task)
    task.add_done_callback(_training_upload_tasks.discard)
    return {"task_id": task_id}
Proofread (strip Nevo additions) + _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "proofreading"} + clean_text, stats = await proofreader.proofread(source) + + # 2. Save proofread .md to training dir (alongside original) + _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "saving"} + training_dir = config.TRAINING_DIR + proofread_dir = training_dir / "proofread" + training_dir.mkdir(parents=True, exist_ok=True) + proofread_dir.mkdir(exist_ok=True) + + # Copy original to training dir + original_name = re.sub(r"^\d+_", "", source.name) + orig_dest = training_dir / original_name + shutil.copy2(str(source), str(orig_dest)) + + # Save cleaned version + proofread_name = Path(original_name).stem + ".md" + proofread_dest = proofread_dir / proofread_name + proofread_dest.write_text(clean_text, encoding="utf-8") + + # 3. Parse date + d_date = None + if req.decision_date: + d_date = date_type.fromisoformat(req.decision_date) + + # 4. Add to style corpus + _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "corpus"} + corpus_id = await db.add_to_style_corpus( + document_id=None, + decision_number=req.decision_number, + decision_date=d_date, + subject_categories=req.subject_categories, + full_text=clean_text, + ) + + # 5. 
@app.get("/api/training/patterns")
async def training_patterns():
    """List all extracted style patterns, grouped by type.

    Rows are read from ``style_patterns`` ordered by type and descending
    frequency. An ``examples`` value that arrives as a string is decoded as
    JSON; a value that is not valid JSON is logged and replaced by an empty
    list so that one bad row does not break the whole report.

    Returns:
        ``{"total": <row count>, "by_type": {pattern_type: [pattern, ...]}}``
        where each pattern has ``pattern_text``, ``frequency``, ``context``
        and ``examples``.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT pattern_type, pattern_text, frequency, context, examples "
            "FROM style_patterns "
            "ORDER BY pattern_type, frequency DESC"
        )

    grouped: dict[str, list] = {}
    for row in rows:
        examples = row["examples"]
        if isinstance(examples, str):
            try:
                examples = json.loads(examples)
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated here; any other failure is
                # a real bug and should surface instead of being swallowed.
                logger.warning(
                    "Malformed examples JSON for style pattern %r",
                    row["pattern_text"],
                )
                examples = []
        grouped.setdefault(row["pattern_type"], []).append({
            "pattern_text": row["pattern_text"],
            "frequency": row["frequency"],
            "context": row["context"] or "",
            "examples": examples or [],
        })
    return {"total": len(rows), "by_type": grouped}
json.loads(r["subject_categories"]) + if isinstance(r["subject_categories"], str) + else r["subject_categories"] or [] + ), + "chars": r["chars"], + "created_at": r["created_at"].isoformat() if r["created_at"] else "", + } + for r in rows + ] + + @app.get("/api/progress/{task_id}") async def progress_stream(task_id: str): """SSE stream of processing progress.""" diff --git a/web/static/index.html b/web/static/index.html index a49cc19..84a5681 100644 --- a/web/static/index.html +++ b/web/static/index.html @@ -283,6 +283,120 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255, } .skill-install-result.error { background: #ffebee; border-color: #ffcdd2; } +/* ── Training Corpus Upload ───────────────────────────── */ +.training-review { + border: 1px solid #e5e5e5; border-radius: 8px; padding: 14px 16px; + margin-bottom: 12px; background: #fafafa; +} +.training-review .review-header { + display: flex; align-items: center; gap: 10px; + padding-bottom: 10px; margin-bottom: 12px; + border-bottom: 1px solid #eee; +} +.training-review .review-header strong { font-size: 0.95em; color: #1a1a2e; flex: 1; } +.training-review .review-meta { font-size: 0.78em; color: #888; } +.training-review .btn-icon { + background: transparent; border: none; color: #aaa; cursor: pointer; + font-size: 1.1em; padding: 4px 8px; border-radius: 4px; +} +.training-review .btn-icon:hover { background: #ffebee; color: #c62828; } +.training-review .review-fields { + display: grid; grid-template-columns: 1fr 160px; gap: 14px; margin-bottom: 12px; +} +.training-review .review-fields label { + display: flex; flex-direction: column; gap: 4px; + font-size: 0.8em; color: #666; font-weight: 500; +} +.training-review .review-fields input { + padding: 7px 10px; border: 1px solid #ddd; border-radius: 6px; + font-size: 0.88em; font-family: inherit; +} +.training-review .review-fields input:focus { + outline: none; border-color: #e94560; +} +.training-review .review-cats { 
margin-bottom: 10px; } +.training-review .review-cats-label { + font-size: 0.8em; color: #666; font-weight: 500; margin-bottom: 6px; +} +.training-review .review-cats-list { display: flex; flex-wrap: wrap; gap: 6px; } +.cat-chip { + display: inline-flex; align-items: center; gap: 5px; + padding: 4px 10px; border: 1px solid #ddd; border-radius: 14px; + font-size: 0.78em; cursor: pointer; background: #fff; + transition: background 0.12s; +} +.cat-chip:hover { background: #f0f0f0; } +.cat-chip input[type="checkbox"] { margin: 0; cursor: pointer; } +.cat-chip:has(input:checked) { background: #ffe4ea; border-color: #e94560; color: #c62828; } + +.review-preview { + margin-top: 6px; border: 1px solid #eee; border-radius: 6px; + background: #fff; padding: 8px 12px; +} +.review-preview summary { + cursor: pointer; font-size: 0.78em; color: #888; font-weight: 500; +} +.review-preview pre { + margin-top: 10px; font-size: 0.78em; color: #333; direction: rtl; + white-space: pre-wrap; font-family: inherit; line-height: 1.5; + max-height: 250px; overflow-y: auto; +} + +.training-task { + padding: 10px 14px; margin-bottom: 6px; border-radius: 6px; + background: #f7f7f7; font-size: 0.85em; + display: flex; align-items: center; gap: 10px; +} +.training-task:last-child { margin-bottom: 0; } + +.corpus-table { width: 100%; border-collapse: collapse; font-size: 0.82em; } +.corpus-table th, .corpus-table td { + text-align: right; padding: 8px 10px; border-bottom: 1px solid #eee; +} +.corpus-table th { + background: #f7f7f7; font-weight: 600; color: #555; + font-size: 0.78em; text-transform: uppercase; +} +.corpus-table tr:hover td { background: #fafafa; } +.cat-tag { + display: inline-block; padding: 2px 8px; margin: 0 2px; + background: #e3f2fd; color: #1565c0; border-radius: 10px; + font-size: 0.72em; font-weight: 500; +} + +/* Pattern groups */ +.pattern-group { + border: 1px solid #eee; border-radius: 8px; margin-bottom: 10px; + background: #fff; +} +.pattern-group[open] { 
background: #fafafa; } +.pattern-group summary { + padding: 12px 16px; cursor: pointer; font-size: 0.9em; + display: flex; align-items: center; gap: 10px; list-style: none; +} +.pattern-group summary::-webkit-details-marker { display: none; } +.pattern-group summary::before { + content: '▸'; transition: transform 0.15s; font-size: 0.9em; color: #888; +} +.pattern-group[open] summary::before { transform: rotate(90deg); } +.pattern-count { + margin-right: auto; background: #e3f2fd; color: #1565c0; + padding: 2px 10px; border-radius: 10px; font-size: 0.76em; font-weight: 500; +} +.pattern-list { + padding: 4px 16px 14px 16px; display: flex; flex-direction: column; gap: 8px; +} +.pattern-item { + padding: 10px 14px; background: #fff; border: 1px solid #eee; + border-radius: 6px; font-size: 0.84em; +} +.pattern-text { color: #1a1a2e; font-weight: 500; } +.pattern-context { color: #666; font-size: 0.88em; margin-top: 4px; } +.pattern-meta { + color: #999; font-size: 0.78em; margin-top: 6px; + display: flex; gap: 10px; +} + @media (max-width: 800px) { .main { padding: 16px; } header { padding: 14px 16px; } @@ -302,6 +416,7 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255, תיקים + תיק חדש העלאה + אימון סגנון Skills @@ -552,6 +667,75 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
+ + +
+ + +
+
+

+ העלה החלטות קודמות של דפנה כדי ללמד את המערכת את סגנון הכתיבה שלה. + הקבצים יעברו הגהה אוטומטית (הסרת תוספות נבו, כותרות, סימני מים) + וחילוץ מטא-דאטה (מספר החלטה, תאריך, קטגוריות) לסקירה לפני ההעלאה. +

+
+
📚
+

גרור קבצי החלטה לכאן או לחץ לבחירה

+

PDF, DOCX, MD — עד 50MB. ניתן להעלות מספר קבצים בבת אחת.

+ +
+
+
+ + + + + +
+
+ קורפוס הסגנון + +
+
+
טוען...
+
+
+ +
+
+ דוח סגנון — דפוסים שחולצו + + + + +
+
+
טוען...
+
+
+
/**
 * Wire the training drop zone and its hidden file input.
 *
 * Safe to call on every page entry: the `_wired` flag on the zone element
 * makes repeated calls a no-op, so listeners are attached only once.
 * Dropped or picked files are handed to `handleTrainingFiles`.
 */
function setupTrainingDropZone() {
  const zone = document.getElementById('trainingDropZone');
  const input = document.getElementById('trainingFileInput');
  if (zone._wired) return;
  zone._wired = true;

  zone.addEventListener('click', () => input.click());
  zone.addEventListener('dragover', (e) => { e.preventDefault(); zone.classList.add('dragging'); });
  zone.addEventListener('dragleave', () => zone.classList.remove('dragging'));
  zone.addEventListener('drop', (e) => {
    e.preventDefault();
    zone.classList.remove('dragging');
    handleTrainingFiles(e.dataTransfer.files);
  });
  input.addEventListener('change', () => {
    // Snapshot first: clearing `value` also empties the live FileList.
    const picked = Array.from(input.files || []);
    // Reset so that choosing the same file again (e.g. after removing it
    // from the review list) still fires a `change` event.
    input.value = '';
    handleTrainingFiles(picked);
  });
}
function handleTrainingFiles(fileList) { + const files = Array.from(fileList || []); + if (!files.length) return; + + const card = document.getElementById('trainingAnalysisCard'); + const status = document.getElementById('trainingAnalysisStatus'); + card.style.display = ''; + status.textContent = `מעלה ומנתח ${files.length} קבצים...`; + + for (const file of files) { + try { + // 1. Upload to pending dir + status.textContent = `מעלה: ${file.name}...`; + const fd = new FormData(); + fd.append('file', file); + const upRes = await fetch(API + '/upload', { method: 'POST', body: fd }); + if (!upRes.ok) throw new Error(`Upload failed: ${await upRes.text()}`); + const uploadInfo = await upRes.json(); + + // 2. Analyze (proofread + extract metadata) + status.textContent = `מנתח: ${file.name}...`; + const analyzeFd = new FormData(); + analyzeFd.append('filename', uploadInfo.filename); + const anRes = await fetch(API + '/training/analyze', { method: 'POST', body: analyzeFd }); + if (!anRes.ok) throw new Error(`Analyze failed: ${await anRes.text()}`); + const analysis = await anRes.json(); + + _trainingReviews.push({ + ...analysis, + _pendingName: uploadInfo.filename, + _originalName: file.name, + _status: 'ready', + }); + } catch (e) { + toast(`שגיאה בעיבוד ${file.name}: ${e.message}`, 'error'); + } + } + + status.textContent = ''; + renderTrainingReview(); +} + +function renderTrainingReview() { + const list = document.getElementById('trainingReviewList'); + if (!_trainingReviews.length) { + list.innerHTML = '
אין קבצים לסקירה
'; + document.getElementById('trainingAnalysisCard').style.display = 'none'; + return; + } + + list.innerHTML = _trainingReviews.map((r, i) => renderReviewRow(r, i)).join(''); +} + +function renderReviewRow(r, idx) { + const catsHtml = ALL_CATEGORIES.map(c => { + const checked = r.subject_categories.includes(c) ? 'checked' : ''; + return ``; + }).join(''); + + return ` +
+
+ ${esc(r._originalName)} + ${r.chars.toLocaleString('he-IL')} תווים · ${esc(r.stats.source_type)} + +
+
+ + +
+
+
קטגוריות:
+
${catsHtml}
+
+
+ תצוגה מקדימה של טקסט מנוקה (500 תווים ראשונים) +
${esc(r.preview)}
+
+
+ `; +} + +function toggleCat(idx, cat) { + const r = _trainingReviews[idx]; + const i = r.subject_categories.indexOf(cat); + if (i >= 0) r.subject_categories.splice(i, 1); + else r.subject_categories.push(cat); +} + +function removeTrainingReview(idx) { + const r = _trainingReviews[idx]; + // Clean up the uploaded pending file + if (r._pendingName) { + fetch(API + '/uploads/' + encodeURIComponent(r._pendingName), { method: 'DELETE' }) + .catch(() => {}); + } + _trainingReviews.splice(idx, 1); + renderTrainingReview(); +} + +function cancelTrainingReview() { + // Delete all pending uploads + for (const r of _trainingReviews) { + if (r._pendingName) { + fetch(API + '/uploads/' + encodeURIComponent(r._pendingName), { method: 'DELETE' }) + .catch(() => {}); + } + } + _trainingReviews = []; + renderTrainingReview(); +} + +async function uploadAllTraining() { + const btn = document.getElementById('trainingUploadBtn'); + btn.disabled = true; + const tasksCard = document.getElementById('trainingTasksCard'); + const tasksList = document.getElementById('trainingTasksList'); + tasksCard.style.display = ''; + tasksList.innerHTML = ''; + + for (let i = 0; i < _trainingReviews.length; i++) { + const r = _trainingReviews[i]; + const row = document.createElement('div'); + row.className = 'training-task'; + row.innerHTML = ` ${esc(r._originalName)} — ${esc(r.decision_number || '—')}`; + tasksList.appendChild(row); + + try { + const res = await fetch(API + '/training/upload', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + filename: r._pendingName, + decision_number: r.decision_number, + decision_date: r.decision_date, + subject_categories: r.subject_categories, + title: r._originalName.replace(/\.[^.]+$/, ''), + }), + }); + if (!res.ok) { + const err = await res.text(); + throw new Error(err); + } + const { task_id } = await res.json(); + const result = await pollTrainingProgress(task_id, row, r._originalName); + row.innerHTML = ` 
${esc(r._originalName)} — ${result.chars.toLocaleString('he-IL')} תווים, ${result.chunks} קטעים`; + } catch (e) { + row.innerHTML = ` ${esc(r._originalName)} — ${esc(e.message.substring(0, 200))}`; + } + } + + _trainingReviews = []; + renderTrainingReview(); + btn.disabled = false; + loadCorpusList(); + toast('ההעלאה הושלמה', 'success'); +} + +const TRAINING_STEP_LABELS = { + queued: 'בתור', + proofreading: 'הגהה', + saving: 'שמירה', + corpus: 'קליטה לקורפוס', + chunking: 'פיצול לקטעים', + embedding: 'יצירת embeddings', + completed: 'הושלם', + failed: 'נכשל', +}; + +function pollTrainingProgress(taskId, row, name) { + return new Promise((resolve, reject) => { + const es = new EventSource(API + '/progress/' + taskId); + es.onmessage = (e) => { + const data = JSON.parse(e.data); + const label = TRAINING_STEP_LABELS[data.step] || TRAINING_STEP_LABELS[data.status] || data.status; + row.innerHTML = ` ${esc(name)} — ${esc(label)}...`; + if (data.status === 'completed') { + es.close(); + resolve(data.result); + } else if (data.status === 'failed') { + es.close(); + reject(new Error(data.error || 'Processing failed')); + } + }; + es.onerror = () => { + es.close(); + reject(new Error('connection lost')); + }; + }); +} + +// ── Style Analysis (patterns) ──────────────────────────── + +const PATTERN_TYPE_LABELS = { + opening_formula: 'נוסחאות פתיחה', + closing_formula: 'נוסחאות סיום', + transition: 'ביטויי מעבר', + characteristic_phrase: 'ביטויים אופייניים', + argument_flow: 'זרימת טיעון', + analysis_structure: 'מבנה ניתוח', + evidence_handling: 'טיפול בראיות', + citation_style: 'סגנון ציטוט', +}; + +async function loadStylePatterns() { + const container = document.getElementById('patternsList'); + const count = document.getElementById('patternsCount'); + try { + const res = await fetch(API + '/training/patterns'); + const data = await res.json(); + count.textContent = `${data.total} דפוסים`; + if (!data.total) { + container.innerHTML = '
אין דפוסים עדיין. לחץ "נתח קורפוס" כדי לחלץ דפוסים מההחלטות הקיימות.
'; + return; + } + const typeOrder = [ + 'opening_formula', 'transition', 'characteristic_phrase', + 'argument_flow', 'analysis_structure', 'evidence_handling', + 'citation_style', 'closing_formula', + ]; + const types = typeOrder.filter(t => data.by_type[t]); + Object.keys(data.by_type).forEach(t => { if (!types.includes(t)) types.push(t); }); + + container.innerHTML = types.map(type => ` +
+ + ${esc(PATTERN_TYPE_LABELS[type] || type)} + ${data.by_type[type].length} + +
+ ${data.by_type[type].map(p => ` +
+
${esc(p.pattern_text)}
+ ${p.context ? `
${esc(p.context)}
` : ''} +
+ תדירות: ${p.frequency} + ${p.examples && p.examples.length ? `· ${p.examples.length} דוגמאות` : ''} +
+
+ `).join('')} +
+
+ `).join(''); + } catch (e) { + container.innerHTML = `
שגיאה בטעינה: ${esc(e.message)}
`; + } +} + +async function runStyleAnalysis() { + const btn = document.getElementById('analyzeStyleBtn'); + btn.disabled = true; + try { + const res = await fetch(API + '/training/analyze-style', { method: 'POST' }); + if (res.status === 409) { + toast('ניתוח כבר רץ ברקע', 'warn'); + } else if (!res.ok) { + throw new Error(await res.text()); + } else { + toast('ניתוח סגנון התחיל — 2-5 דקות', 'success'); + } + pollStyleAnalysisStatus(); + } catch (e) { + toast('שגיאה: ' + e.message, 'error'); + btn.disabled = false; + } +} + +async function pollStyleAnalysisStatus() { + const btn = document.getElementById('analyzeStyleBtn'); + try { + const res = await fetch(API + '/training/analyze-style/status'); + const state = await res.json(); + if (state.running) { + btn.disabled = true; + btn.innerHTML = ` מנתח... ${state.elapsed || 0}s`; + setTimeout(pollStyleAnalysisStatus, 3000); + } else { + btn.disabled = false; + btn.textContent = 'נתח קורפוס'; + if (state.error) { + toast('ניתוח נכשל: ' + state.error.substring(0, 150), 'error'); + } else if (state.result) { + toast('הניתוח הושלם — הדפוסים עודכנו', 'success'); + loadStylePatterns(); + } + } + } catch (e) { + btn.disabled = false; + } +} + +async function loadCorpusList() { + const container = document.getElementById('corpusList'); + const count = document.getElementById('corpusCount'); + try { + const res = await fetch(API + '/training/corpus'); + const rows = await res.json(); + count.textContent = `${rows.length} החלטות`; + if (!rows.length) { + container.innerHTML = '
הקורפוס ריק
'; + return; + } + container.innerHTML = ` + + + + + + ${rows.map(r => ` + + + + + + + + `).join('')} + +
מספרתאריךקטגוריותתוויםנוצר
${esc(r.decision_number || '—')}${esc(r.decision_date || '—')}${(r.subject_categories || []).map(c => `${esc(c)}`).join('')}${r.chars.toLocaleString('he-IL')}${esc(r.created_at ? r.created_at.substring(0, 10) : '—')}
+ `; + } catch (e) { + container.innerHTML = `
שגיאה בטעינה: ${esc(e.message)}
`; + } +}