Add training corpus UI with Nevo proofreading pipeline
- New proofreader service strips Nevo editorial additions (front matter,
postamble, page headers, watermarks, inline codes) from DOCX/PDF/MD
- PDF pages use Google Vision OCR for clean Hebrew RTL extraction
- New training page at #/training with drag-and-drop upload, automatic
metadata extraction (decision number, date, categories), reviewable
preview, and style pattern report grouped by type
- API endpoints: /api/training/{analyze,upload,corpus,patterns,
analyze-style,analyze-style/status}
- Fix claude_session.query to pipe prompt via stdin, avoiding ARG_MAX
overflow when analyzing 900K+ char corpus
- CLI scripts for batch proofreading and corpus upload
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
257
web/app.py
257
web/app.py
@@ -28,7 +28,7 @@ from pydantic import BaseModel
|
||||
import asyncpg
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor, processor
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor, processor, proofreader
|
||||
from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools
|
||||
|
||||
# Import integration clients (same directory)
|
||||
@@ -163,6 +163,261 @@ async def classify_file(req: ClassifyRequest):
|
||||
return {"task_id": task_id}
|
||||
|
||||
|
||||
# ── Training Corpus: Analyze & Upload ─────────────────────────────
|
||||
|
||||
|
||||
@app.post("/api/training/analyze")
|
||||
async def training_analyze(filename: str = Form(...)):
|
||||
"""Proofread an uploaded file and extract metadata for review.
|
||||
|
||||
Input: filename in UPLOAD_DIR (from /api/upload).
|
||||
Output: clean text preview + extracted metadata (number, date, categories).
|
||||
"""
|
||||
source = UPLOAD_DIR / filename
|
||||
if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
|
||||
raise HTTPException(404, "File not found in uploads")
|
||||
|
||||
try:
|
||||
result = await proofreader.analyze_file(source)
|
||||
except Exception as e:
|
||||
logger.exception("Proofread failed for %s", filename)
|
||||
raise HTTPException(500, f"Proofreading failed: {e}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class TrainingUploadRequest(BaseModel):
|
||||
filename: str # name in UPLOAD_DIR
|
||||
decision_number: str = ""
|
||||
decision_date: str = "" # YYYY-MM-DD
|
||||
subject_categories: list[str] = []
|
||||
title: str = ""
|
||||
|
||||
|
||||
@app.post("/api/training/upload")
|
||||
async def training_upload(req: TrainingUploadRequest):
|
||||
"""Upload a proofread file to the style corpus.
|
||||
|
||||
Runs proofreading again to guarantee clean text (not raw file content),
|
||||
then inserts into style_corpus + chunks + embeddings.
|
||||
"""
|
||||
source = UPLOAD_DIR / req.filename
|
||||
if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
|
||||
raise HTTPException(404, "File not found in uploads")
|
||||
|
||||
# Check for duplicate by decision_number
|
||||
if req.decision_number:
|
||||
pool = await db.get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
exists = await conn.fetchval(
|
||||
"SELECT 1 FROM style_corpus WHERE decision_number = $1 LIMIT 1",
|
||||
req.decision_number,
|
||||
)
|
||||
if exists:
|
||||
raise HTTPException(
|
||||
409,
|
||||
f"החלטה {req.decision_number} כבר קיימת בקורפוס",
|
||||
)
|
||||
|
||||
task_id = str(uuid4())
|
||||
_progress[task_id] = {"status": "queued", "filename": req.filename}
|
||||
asyncio.create_task(_process_proofread_training(task_id, source, req))
|
||||
return {"task_id": task_id}
|
||||
|
||||
|
||||
async def _process_proofread_training(
|
||||
task_id: str, source: Path, req: TrainingUploadRequest
|
||||
):
|
||||
"""Background task: proofread → store in corpus → chunk → embed."""
|
||||
from datetime import date as date_type
|
||||
|
||||
try:
|
||||
title = req.title or source.stem.split("_", 1)[-1]
|
||||
|
||||
# 1. Proofread (strip Nevo additions)
|
||||
_progress[task_id] = {"status": "processing", "filename": req.filename, "step": "proofreading"}
|
||||
clean_text, stats = await proofreader.proofread(source)
|
||||
|
||||
# 2. Save proofread .md to training dir (alongside original)
|
||||
_progress[task_id] = {"status": "processing", "filename": req.filename, "step": "saving"}
|
||||
training_dir = config.TRAINING_DIR
|
||||
proofread_dir = training_dir / "proofread"
|
||||
training_dir.mkdir(parents=True, exist_ok=True)
|
||||
proofread_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Copy original to training dir
|
||||
original_name = re.sub(r"^\d+_", "", source.name)
|
||||
orig_dest = training_dir / original_name
|
||||
shutil.copy2(str(source), str(orig_dest))
|
||||
|
||||
# Save cleaned version
|
||||
proofread_name = Path(original_name).stem + ".md"
|
||||
proofread_dest = proofread_dir / proofread_name
|
||||
proofread_dest.write_text(clean_text, encoding="utf-8")
|
||||
|
||||
# 3. Parse date
|
||||
d_date = None
|
||||
if req.decision_date:
|
||||
d_date = date_type.fromisoformat(req.decision_date)
|
||||
|
||||
# 4. Add to style corpus
|
||||
_progress[task_id] = {"status": "processing", "filename": req.filename, "step": "corpus"}
|
||||
corpus_id = await db.add_to_style_corpus(
|
||||
document_id=None,
|
||||
decision_number=req.decision_number,
|
||||
decision_date=d_date,
|
||||
subject_categories=req.subject_categories,
|
||||
full_text=clean_text,
|
||||
)
|
||||
|
||||
# 5. Chunk + embed
|
||||
_progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
|
||||
chunks = chunker.chunk_document(clean_text)
|
||||
chunk_count = 0
|
||||
if chunks:
|
||||
doc = await db.create_document(
|
||||
case_id=None,
|
||||
doc_type="decision",
|
||||
title=f"[קורפוס] {title}",
|
||||
file_path=str(orig_dest),
|
||||
page_count=stats.get("pages", 0),
|
||||
)
|
||||
doc_id = UUID(doc["id"])
|
||||
await db.update_document(
|
||||
doc_id, extracted_text=clean_text, extraction_status="completed"
|
||||
)
|
||||
|
||||
_progress[task_id] = {
|
||||
"status": "processing", "filename": req.filename, "step": "embedding",
|
||||
}
|
||||
texts = [c.content for c in chunks]
|
||||
embs = await embeddings.embed_texts(texts, input_type="document")
|
||||
chunk_dicts = [
|
||||
{
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"embedding": emb,
|
||||
"page_number": c.page_number,
|
||||
"chunk_index": c.chunk_index,
|
||||
}
|
||||
for c, emb in zip(chunks, embs)
|
||||
]
|
||||
await db.store_chunks(doc_id, None, chunk_dicts)
|
||||
chunk_count = len(chunks)
|
||||
|
||||
# 6. Cleanup upload
|
||||
source.unlink(missing_ok=True)
|
||||
|
||||
_progress[task_id] = {
|
||||
"status": "completed",
|
||||
"filename": req.filename,
|
||||
"result": {
|
||||
"corpus_id": str(corpus_id),
|
||||
"title": title,
|
||||
"chars": len(clean_text),
|
||||
"chunks": chunk_count,
|
||||
"proofread_stats": stats,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
logger.exception("Training upload failed for %s", req.filename)
|
||||
_progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}
|
||||
|
||||
|
||||
@app.get("/api/training/patterns")
|
||||
async def training_patterns():
|
||||
"""List all extracted style patterns, grouped by type."""
|
||||
pool = await db.get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"SELECT pattern_type, pattern_text, frequency, context, examples "
|
||||
"FROM style_patterns "
|
||||
"ORDER BY pattern_type, frequency DESC"
|
||||
)
|
||||
|
||||
grouped: dict[str, list] = {}
|
||||
for r in rows:
|
||||
pt = r["pattern_type"]
|
||||
examples = r["examples"]
|
||||
if isinstance(examples, str):
|
||||
try:
|
||||
examples = json.loads(examples)
|
||||
except Exception:
|
||||
examples = []
|
||||
grouped.setdefault(pt, []).append({
|
||||
"pattern_text": r["pattern_text"],
|
||||
"frequency": r["frequency"],
|
||||
"context": r["context"] or "",
|
||||
"examples": examples or [],
|
||||
})
|
||||
return {"total": len(rows), "by_type": grouped}
|
||||
|
||||
|
||||
_style_analysis_state = {"running": False, "started_at": None, "result": None, "error": None}
|
||||
|
||||
|
||||
@app.post("/api/training/analyze-style")
|
||||
async def training_analyze_style():
|
||||
"""Kick off style analysis over the corpus. Returns immediately."""
|
||||
if _style_analysis_state["running"]:
|
||||
raise HTTPException(409, "ניתוח סגנון כבר רץ")
|
||||
|
||||
_style_analysis_state.update(
|
||||
{"running": True, "started_at": time.time(), "result": None, "error": None}
|
||||
)
|
||||
|
||||
async def _run():
|
||||
from legal_mcp.services.style_analyzer import analyze_corpus
|
||||
try:
|
||||
result = await analyze_corpus()
|
||||
_style_analysis_state["result"] = result
|
||||
except Exception as e:
|
||||
logger.exception("Style analysis failed")
|
||||
_style_analysis_state["error"] = str(e)
|
||||
finally:
|
||||
_style_analysis_state["running"] = False
|
||||
|
||||
asyncio.create_task(_run())
|
||||
return {"status": "started"}
|
||||
|
||||
|
||||
@app.get("/api/training/analyze-style/status")
|
||||
async def training_analyze_style_status():
|
||||
"""Poll status of the running style analysis."""
|
||||
state = dict(_style_analysis_state)
|
||||
if state["started_at"]:
|
||||
state["elapsed"] = int(time.time() - state["started_at"])
|
||||
return state
|
||||
|
||||
|
||||
@app.get("/api/training/corpus")
|
||||
async def training_corpus_list():
|
||||
"""List all decisions currently in the style corpus."""
|
||||
pool = await db.get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"SELECT id, decision_number, decision_date, subject_categories, "
|
||||
" length(full_text) as chars, created_at "
|
||||
"FROM style_corpus "
|
||||
"ORDER BY created_at DESC"
|
||||
)
|
||||
return [
|
||||
{
|
||||
"id": str(r["id"]),
|
||||
"decision_number": r["decision_number"] or "",
|
||||
"decision_date": str(r["decision_date"]) if r["decision_date"] else "",
|
||||
"subject_categories": (
|
||||
json.loads(r["subject_categories"])
|
||||
if isinstance(r["subject_categories"], str)
|
||||
else r["subject_categories"] or []
|
||||
),
|
||||
"chars": r["chars"],
|
||||
"created_at": r["created_at"].isoformat() if r["created_at"] else "",
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
|
||||
|
||||
@app.get("/api/progress/{task_id}")
|
||||
async def progress_stream(task_id: str):
|
||||
"""SSE stream of processing progress."""
|
||||
|
||||
Reference in New Issue
Block a user