Add training corpus UI with Nevo proofreading pipeline

- New proofreader service strips Nevo editorial additions (front matter, postamble, page headers, watermarks, inline codes) from DOCX/PDF/MD - PDF pages use Google Vision OCR for clean Hebrew RTL extraction - New training page at #/training with drag-and-drop upload, automatic metadata extraction (decision number, date, categories), reviewable preview, and style pattern report grouped by type - API endpoints: /api/training/{analyze,upload,corpus,patterns, analyze-style,analyze-style/status} - Fix claude_session.query to pipe prompt via stdin, avoiding ARG_MAX overflow when analyzing 900K+ char corpus - CLI scripts for batch proofreading and corpus upload Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 11:04:58 +00:00
parent ecda95d610
commit 32f18de049
6 changed files with 1960 additions and 3 deletions
--- a/web/app.py
+++ b/web/app.py
@@ -28,7 +28,7 @@ from pydantic import BaseModel
 import asyncpg

 from legal_mcp import config
-from legal_mcp.services import chunker, db, embeddings, extractor, processor
+from legal_mcp.services import chunker, db, embeddings, extractor, processor, proofreader
 from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools

 # Import integration clients (same directory)
@@ -163,6 +163,261 @@ async def classify_file(req: ClassifyRequest):
    return {"task_id": task_id}


+# ── Training Corpus: Analyze & Upload ─────────────────────────────
+
+
+@app.post("/api/training/analyze")
+async def training_analyze(filename: str = Form(...)):
+    """Proofread an uploaded file and extract metadata for review.
+
+    Input: filename in UPLOAD_DIR (from /api/upload).
+    Output: clean text preview + extracted metadata (number, date, categories).
+    """
+    source = UPLOAD_DIR / filename
+    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
+        raise HTTPException(404, "File not found in uploads")
+
+    try:
+        result = await proofreader.analyze_file(source)
+    except Exception as e:
+        logger.exception("Proofread failed for %s", filename)
+        raise HTTPException(500, f"Proofreading failed: {e}")
+
+    return result
+
+
+class TrainingUploadRequest(BaseModel):
+    filename: str                      # name in UPLOAD_DIR
+    decision_number: str = ""
+    decision_date: str = ""            # YYYY-MM-DD
+    subject_categories: list[str] = []
+    title: str = ""
+
+
+@app.post("/api/training/upload")
+async def training_upload(req: TrainingUploadRequest):
+    """Upload a proofread file to the style corpus.
+
+    Runs proofreading again to guarantee clean text (not raw file content),
+    then inserts into style_corpus + chunks + embeddings.
+    """
+    source = UPLOAD_DIR / req.filename
+    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
+        raise HTTPException(404, "File not found in uploads")
+
+    # Check for duplicate by decision_number
+    if req.decision_number:
+        pool = await db.get_pool()
+        async with pool.acquire() as conn:
+            exists = await conn.fetchval(
+                "SELECT 1 FROM style_corpus WHERE decision_number = $1 LIMIT 1",
+                req.decision_number,
+            )
+        if exists:
+            raise HTTPException(
+                409,
+                f"החלטה {req.decision_number} כבר קיימת בקורפוס",
+            )
+
+    task_id = str(uuid4())
+    _progress[task_id] = {"status": "queued", "filename": req.filename}
+    asyncio.create_task(_process_proofread_training(task_id, source, req))
+    return {"task_id": task_id}
+
+
+async def _process_proofread_training(
+    task_id: str, source: Path, req: TrainingUploadRequest
+):
+    """Background task: proofread → store in corpus → chunk → embed."""
+    from datetime import date as date_type
+
+    try:
+        title = req.title or source.stem.split("_", 1)[-1]
+
+        # 1. Proofread (strip Nevo additions)
+        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "proofreading"}
+        clean_text, stats = await proofreader.proofread(source)
+
+        # 2. Save proofread .md to training dir (alongside original)
+        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "saving"}
+        training_dir = config.TRAINING_DIR
+        proofread_dir = training_dir / "proofread"
+        training_dir.mkdir(parents=True, exist_ok=True)
+        proofread_dir.mkdir(exist_ok=True)
+
+        # Copy original to training dir
+        original_name = re.sub(r"^\d+_", "", source.name)
+        orig_dest = training_dir / original_name
+        shutil.copy2(str(source), str(orig_dest))
+
+        # Save cleaned version
+        proofread_name = Path(original_name).stem + ".md"
+        proofread_dest = proofread_dir / proofread_name
+        proofread_dest.write_text(clean_text, encoding="utf-8")
+
+        # 3. Parse date
+        d_date = None
+        if req.decision_date:
+            d_date = date_type.fromisoformat(req.decision_date)
+
+        # 4. Add to style corpus
+        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "corpus"}
+        corpus_id = await db.add_to_style_corpus(
+            document_id=None,
+            decision_number=req.decision_number,
+            decision_date=d_date,
+            subject_categories=req.subject_categories,
+            full_text=clean_text,
+        )
+
+        # 5. Chunk + embed
+        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
+        chunks = chunker.chunk_document(clean_text)
+        chunk_count = 0
+        if chunks:
+            doc = await db.create_document(
+                case_id=None,
+                doc_type="decision",
+                title=f"[קורפוס] {title}",
+                file_path=str(orig_dest),
+                page_count=stats.get("pages", 0),
+            )
+            doc_id = UUID(doc["id"])
+            await db.update_document(
+                doc_id, extracted_text=clean_text, extraction_status="completed"
+            )
+
+            _progress[task_id] = {
+                "status": "processing", "filename": req.filename, "step": "embedding",
+            }
+            texts = [c.content for c in chunks]
+            embs = await embeddings.embed_texts(texts, input_type="document")
+            chunk_dicts = [
+                {
+                    "content": c.content,
+                    "section_type": c.section_type,
+                    "embedding": emb,
+                    "page_number": c.page_number,
+                    "chunk_index": c.chunk_index,
+                }
+                for c, emb in zip(chunks, embs)
+            ]
+            await db.store_chunks(doc_id, None, chunk_dicts)
+            chunk_count = len(chunks)
+
+        # 6. Cleanup upload
+        source.unlink(missing_ok=True)
+
+        _progress[task_id] = {
+            "status": "completed",
+            "filename": req.filename,
+            "result": {
+                "corpus_id": str(corpus_id),
+                "title": title,
+                "chars": len(clean_text),
+                "chunks": chunk_count,
+                "proofread_stats": stats,
+            },
+        }
+    except Exception as e:
+        logger.exception("Training upload failed for %s", req.filename)
+        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}
+
+
+@app.get("/api/training/patterns")
+async def training_patterns():
+    """List all extracted style patterns, grouped by type."""
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(
+            "SELECT pattern_type, pattern_text, frequency, context, examples "
+            "FROM style_patterns "
+            "ORDER BY pattern_type, frequency DESC"
+        )
+
+    grouped: dict[str, list] = {}
+    for r in rows:
+        pt = r["pattern_type"]
+        examples = r["examples"]
+        if isinstance(examples, str):
+            try:
+                examples = json.loads(examples)
+            except Exception:
+                examples = []
+        grouped.setdefault(pt, []).append({
+            "pattern_text": r["pattern_text"],
+            "frequency": r["frequency"],
+            "context": r["context"] or "",
+            "examples": examples or [],
+        })
+    return {"total": len(rows), "by_type": grouped}
+
+
+_style_analysis_state = {"running": False, "started_at": None, "result": None, "error": None}
+
+
+@app.post("/api/training/analyze-style")
+async def training_analyze_style():
+    """Kick off style analysis over the corpus. Returns immediately."""
+    if _style_analysis_state["running"]:
+        raise HTTPException(409, "ניתוח סגנון כבר רץ")
+
+    _style_analysis_state.update(
+        {"running": True, "started_at": time.time(), "result": None, "error": None}
+    )
+
+    async def _run():
+        from legal_mcp.services.style_analyzer import analyze_corpus
+        try:
+            result = await analyze_corpus()
+            _style_analysis_state["result"] = result
+        except Exception as e:
+            logger.exception("Style analysis failed")
+            _style_analysis_state["error"] = str(e)
+        finally:
+            _style_analysis_state["running"] = False
+
+    asyncio.create_task(_run())
+    return {"status": "started"}
+
+
+@app.get("/api/training/analyze-style/status")
+async def training_analyze_style_status():
+    """Poll status of the running style analysis."""
+    state = dict(_style_analysis_state)
+    if state["started_at"]:
+        state["elapsed"] = int(time.time() - state["started_at"])
+    return state
+
+
+@app.get("/api/training/corpus")
+async def training_corpus_list():
+    """List all decisions currently in the style corpus."""
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(
+            "SELECT id, decision_number, decision_date, subject_categories, "
+            "       length(full_text) as chars, created_at "
+            "FROM style_corpus "
+            "ORDER BY created_at DESC"
+        )
+    return [
+        {
+            "id": str(r["id"]),
+            "decision_number": r["decision_number"] or "",
+            "decision_date": str(r["decision_date"]) if r["decision_date"] else "",
+            "subject_categories": (
+                json.loads(r["subject_categories"])
+                if isinstance(r["subject_categories"], str)
+                else r["subject_categories"] or []
+            ),
+            "chars": r["chars"],
+            "created_at": r["created_at"].isoformat() if r["created_at"] else "",
+        }
+        for r in rows
+    ]
+
+
@app.get("/api/progress/{task_id}")
 async def progress_stream(task_id: str):
    """SSE stream of processing progress."""