feat: external precedent library with auto halacha extraction

Adds a third corpus of legal authority distinct from style_corpus (Daphna's prior decisions for voice) and case_precedents (chair-attached quotes per case). The new corpus holds chair-uploaded court rulings and other appeals committee decisions, with binding rules (הלכות) extracted automatically and queued for chair approval. Pipeline (web/app.py + services/precedent_library.py): file → extract → chunk → Voyage embed → halacha_extractor → store + publish progress over the existing Redis SSE channel. Schema V7 (services/db.py): extends case_law with source_kind + extraction status fields under a CHECK constraint pinning practice_area to the three appeals committee domains (rishuy_uvniya, betterment_levy, compensation_197). New precedent_chunks (vector(1024)) and halachot tables (vector(1024) over rule_statement, IVFFlat indexes, gin on practice_areas/subject_tags). Halachot start as pending_review; only approved/published rows are visible to search_precedent_library. Agents: legal-writer, legal-researcher, legal-analyst, legal-ceo, legal-qa get search_precedent_library. legal-writer prompt explains the three-corpus distinction and CREAC use; legal-qa now verifies that every cited halacha resolves to an approved row in the corpus. UI: /precedents page with four tabs — library / semantic search / pending review (J/K nav, A/R/E shortcuts, badge count) / stats. Reuses the existing upload-sheet progress + SSE pattern. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 08:38:18 +00:00
parent a6edb75bbf
commit 7ee90dce31
23 changed files with 3853 additions and 67 deletions
--- a/web/app.py
+++ b/web/app.py
@@ -3514,3 +3514,314 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
            "chunks": chunk_count,
        },
    })
+
+
+# ── External Precedent Library ────────────────────────────────────
+# Chair-uploaded court rulings + appeals committee decisions, with
+# automatic halacha extraction. Distinct from /api/training (style
+# corpus) and /api/cases/{n}/precedents (chair-attached quotes).
+
+from legal_mcp.services import precedent_library as plib_service  # noqa: E402
+
+
+# Whitelists used to validate precedent-library requests. The empty string
+# is a member of both sets because the matching upload form fields default
+# to "" when the client omits them.
+_PRACTICE_AREAS = {
+    "",
+    "rishuy_uvniya",
+    "betterment_levy",
+    "compensation_197",
+}
+_SOURCE_TYPES = {
+    "",
+    "court_ruling",
+    "appeals_committee",
+}
+
+
+def _make_progress_publisher(task_id: str, filename: str):
+    """Return a coroutine function that records progress for *task_id*.
+
+    The returned ``publish(status, percent, message)`` writes one snapshot
+    to ``_progress`` under *task_id*. Only ``"completed"`` and ``"failed"``
+    are surfaced as the top-level ``status``; every other value is reported
+    as ``"processing"``, with the raw value kept under ``stage``.
+    """
+    terminal_states = ("completed", "failed")
+
+    async def publish(status: str, percent: int, message: str) -> None:
+        overall = status if status in terminal_states else "processing"
+        snapshot = {
+            "status": overall,
+            "stage": status,
+            "filename": filename,
+            "step": message,
+            "percent": percent,
+        }
+        await _progress.set(task_id, snapshot)
+
+    return publish
+
+
+# Request body for PATCH /api/precedent-library/{case_law_id}.
+# Every field is optional and defaults to None; the PATCH handler forwards
+# only fields that were explicitly sent with a non-None value.
+class PrecedentUpdateRequest(BaseModel):
+    case_name: str | None = None
+    court: str | None = None
+    # ISO date string; the handler parses its first 10 characters with
+    # date.fromisoformat and passes the result on under the key "date".
+    decision_date: str | None = None
+    # Checked against _PRACTICE_AREAS by the handler.
+    practice_area: str | None = None
+    appeal_subtype: str | None = None
+    subject_tags: list[str] | None = None
+    summary: str | None = None
+    headnote: str | None = None
+    key_quote: str | None = None
+    source_url: str | None = None
+    source_type: str | None = None
+    precedent_level: str | None = None
+    is_binding: bool | None = None
+
+
+# Request body for PATCH /api/halachot/{halacha_id} (approve / reject / edit).
+# All fields are optional; the handler passes each one through to
+# db.update_halacha.
+class HalachaUpdateRequest(BaseModel):
+    # When set, the handler accepts only: pending_review, approved,
+    # rejected, published.
+    review_status: str | None = None
+    # Reviewer name recorded with the change; defaults to "דפנה" when omitted.
+    reviewer: str | None = "דפנה"
+    rule_statement: str | None = None
+    reasoning_summary: str | None = None
+    subject_tags: list[str] | None = None
+    practice_areas: list[str] | None = None
+
+
+@app.post("/api/precedent-library/upload")
+async def precedent_library_upload(
+    file: UploadFile = File(...),
+    citation: str = Form(...),
+    case_name: str = Form(""),
+    court: str = Form(""),
+    decision_date: str = Form(""),
+    source_type: str = Form(""),
+    precedent_level: str = Form(""),
+    practice_area: str = Form(""),
+    appeal_subtype: str = Form(""),
+    subject_tags: str = Form("[]"),  # JSON array string
+    is_binding: bool = Form(True),
+    headnote: str = Form(""),
+    summary: str = Form(""),
+):
+    """Upload a court ruling / appeals committee decision to the
+    authoritative precedent library. Halachot are extracted in the
+    background and queued for chair approval.
+
+    The upload is streamed to ``UPLOAD_DIR`` under a randomised name and
+    ``plib_service.ingest_precedent`` is then run in a background task; the
+    staged file is deleted when that task ends, whether it succeeded or not.
+    ``subject_tags`` that is not a valid JSON array is treated as ``[]``.
+
+    Returns:
+        ``{"task_id": ...}`` -- the key under which progress snapshots are
+        written to ``_progress``.
+
+    Raises:
+        HTTPException: 400 for an invalid ``practice_area`` / ``source_type``,
+            a blank ``citation`` or an unsupported file extension; 413 when
+            the upload exceeds ``MAX_FILE_SIZE``.
+    """
+    if practice_area not in _PRACTICE_AREAS:
+        raise HTTPException(400, "practice_area לא תקין")
+    if source_type not in _SOURCE_TYPES:
+        raise HTTPException(400, "source_type לא תקין")
+    if not citation.strip():
+        raise HTTPException(400, "citation חובה")
+
+    original_name = file.filename or ""
+    suffix = Path(original_name).suffix.lower()
+    if suffix not in ALLOWED_EXTENSIONS:
+        raise HTTPException(400, f"סוג קובץ לא נתמך: {suffix}")
+
+    # The filename is supplied by the client. Keep only its final path
+    # component (for both "/" and "\" separators) so the staged path can
+    # never resolve outside UPLOAD_DIR.
+    safe_name = Path(original_name.replace("\\", "/")).name
+
+    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+    staged = UPLOAD_DIR / f"plib_{uuid4().hex[:8]}_{safe_name}"
+    size = 0
+    try:
+        with staged.open("wb") as out:
+            while chunk := await file.read(1024 * 1024):
+                size += len(chunk)
+                if size > MAX_FILE_SIZE:
+                    raise HTTPException(413, "קובץ גדול מדי")
+                out.write(chunk)
+    except BaseException:
+        # Oversized upload, I/O error or cancellation: never leave a
+        # partially written file behind.
+        staged.unlink(missing_ok=True)
+        raise
+
+    try:
+        tags = json.loads(subject_tags) if subject_tags else []
+        if not isinstance(tags, list):
+            tags = []
+    except json.JSONDecodeError:
+        tags = []
+
+    task_id = str(uuid4())
+    await _progress.set(task_id, {
+        "status": "queued", "filename": original_name,
+        "stage": "queued", "percent": 0,
+    })
+
+    publish = _make_progress_publisher(task_id, original_name)
+
+    async def _run():
+        try:
+            await plib_service.ingest_precedent(
+                file_path=staged,
+                citation=citation.strip(),
+                case_name=case_name.strip(),
+                court=court.strip(),
+                decision_date=decision_date or None,
+                source_type=source_type,
+                precedent_level=precedent_level,
+                practice_area=practice_area,
+                appeal_subtype=appeal_subtype.strip(),
+                subject_tags=tags,
+                is_binding=is_binding,
+                headnote=headnote.strip(),
+                summary=summary.strip(),
+                progress=publish,
+            )
+        except Exception as e:
+            logger.exception("precedent-library upload failed")
+            await _progress.set(task_id, {
+                "status": "failed", "error": str(e),
+                "filename": original_name,
+            })
+        finally:
+            staged.unlink(missing_ok=True)
+
+    # The event loop keeps only weak references to tasks. Hold a strong
+    # reference in a lazily created module-level set until the task is done,
+    # otherwise it may be garbage-collected in the middle of ingestion.
+    background = globals().setdefault("_PLIB_BACKGROUND_TASKS", set())
+    task = asyncio.create_task(_run())
+    background.add(task)
+    task.add_done_callback(background.discard)
+    return {"task_id": task_id}
+
+
+@app.get("/api/precedent-library")
+async def precedent_library_list(
+    practice_area: str = "",
+    court: str = "",
+    precedent_level: str = "",
+    source_type: str = "",
+    search: str = "",
+    limit: int = 100,
+    offset: int = 0,
+):
+    """List precedent-library records matching the given filters.
+
+    All filters and the paging window are handed unchanged to
+    ``db.list_external_case_law``. ``count`` in the response is the number
+    of rows in this page, not a grand total.
+    """
+    filters = {
+        "practice_area": practice_area,
+        "court": court,
+        "precedent_level": precedent_level,
+        "source_type": source_type,
+        "search": search,
+    }
+    items = await db.list_external_case_law(**filters, limit=limit, offset=offset)
+    return {"items": items, "count": len(items)}
+
+
+@app.get("/api/precedent-library/stats")
+async def precedent_library_stats():
+    """Return whatever ``db.precedent_library_stats`` reports, unchanged."""
+    stats = await db.precedent_library_stats()
+    return stats
+
+
+@app.get("/api/precedent-library/search")
+async def precedent_library_search(
+    q: str,
+    practice_area: str = "",
+    court: str = "",
+    precedent_level: str = "",
+    appeal_subtype: str = "",
+    subject_tag: str = "",
+    limit: int = 10,
+    include_halachot: bool = True,
+):
+    """Search the precedent library via ``plib_service.search_library``.
+
+    A query shorter than two characters after stripping whitespace returns
+    an empty result without calling the search service. ``count`` is the
+    number of items returned.
+    """
+    query = (q or "").strip()
+    if len(query) < 2:
+        return {"items": [], "count": 0}
+
+    search_kwargs = {
+        "query": query,
+        "practice_area": practice_area,
+        "court": court,
+        "precedent_level": precedent_level,
+        "appeal_subtype": appeal_subtype,
+        "subject_tag": subject_tag,
+        "limit": limit,
+        "include_halachot": include_halachot,
+    }
+    hits = await plib_service.search_library(**search_kwargs)
+    return {"items": hits, "count": len(hits)}
+
+
+@app.get("/api/precedent-library/{case_law_id}")
+async def precedent_library_get(case_law_id: str):
+    """Fetch one precedent by id.
+
+    Raises:
+        HTTPException: 400 when ``case_law_id`` is not a valid UUID,
+            404 when ``plib_service.get_precedent`` returns nothing.
+    """
+    try:
+        precedent_id = UUID(case_law_id)
+    except ValueError:
+        raise HTTPException(400, "case_law_id לא תקין")
+
+    found = await plib_service.get_precedent(precedent_id)
+    if found:
+        return found
+    raise HTTPException(404, "פסיקה לא נמצאה")
+
+
+@app.patch("/api/precedent-library/{case_law_id}")
+async def precedent_library_update(case_law_id: str, req: PrecedentUpdateRequest):
+    """Apply a partial update to one precedent record.
+
+    Only fields that the client sent with a non-None value are forwarded to
+    ``db.update_case_law``. ``decision_date`` is parsed from its first 10
+    characters as an ISO date and forwarded under the key ``date``; a blank
+    ``decision_date`` is ignored.
+
+    Raises:
+        HTTPException: 400 for a malformed ``case_law_id``, an unknown
+            ``practice_area`` or an unparsable ``decision_date``; 404 when
+            the DB layer returns no record.
+    """
+    from datetime import date as date_type
+
+    try:
+        cid = UUID(case_law_id)
+    except ValueError:
+        raise HTTPException(400, "case_law_id לא תקין") from None
+    fields = {k: v for k, v in req.model_dump(exclude_unset=True).items() if v is not None}
+    if "practice_area" in fields and fields["practice_area"] not in _PRACTICE_AREAS:
+        raise HTTPException(400, "practice_area לא תקין")
+    # Always remove the request-side key: the DB layer receives the value as
+    # "date", so a blank string must not slip through as "decision_date".
+    raw_date = fields.pop("decision_date", "")
+    if raw_date:
+        try:
+            fields["date"] = date_type.fromisoformat(raw_date[:10])
+        except ValueError:
+            raise HTTPException(400, "decision_date לא תקין") from None
+    record = await db.update_case_law(cid, **fields)
+    if not record:
+        raise HTTPException(404, "פסיקה לא נמצאה")
+    return record
+
+
+@app.delete("/api/precedent-library/{case_law_id}")
+async def precedent_library_delete(case_law_id: str):
+    """Delete one precedent through ``plib_service.delete_precedent``.
+
+    Raises:
+        HTTPException: 400 when ``case_law_id`` is not a valid UUID,
+            404 when the service reports that nothing was deleted.
+    """
+    try:
+        precedent_id = UUID(case_law_id)
+    except ValueError:
+        raise HTTPException(400, "case_law_id לא תקין")
+
+    removed = await plib_service.delete_precedent(precedent_id)
+    if removed:
+        return {"deleted": True, "case_law_id": case_law_id}
+    raise HTTPException(404, "פסיקה לא נמצאה")
+
+
+@app.post("/api/precedent-library/{case_law_id}/extract-halachot")
+async def precedent_library_reextract(case_law_id: str):
+    """Re-run halacha extraction in background. Returns a task_id for SSE.
+
+    Progress snapshots are written to ``_progress`` under the returned
+    ``task_id``, labelled with the record's ``case_number`` (or the raw id
+    when that is empty).
+
+    Raises:
+        HTTPException: 400 when ``case_law_id`` is not a valid UUID,
+            404 when ``db.get_case_law`` finds no record.
+    """
+    try:
+        cid = UUID(case_law_id)
+    except ValueError:
+        raise HTTPException(400, "case_law_id לא תקין") from None
+    record = await db.get_case_law(cid)
+    if not record:
+        raise HTTPException(404, "פסיקה לא נמצאה")
+
+    task_id = str(uuid4())
+    label = record.get("case_number") or case_law_id
+    await _progress.set(task_id, {
+        "status": "queued", "filename": label, "stage": "queued", "percent": 0,
+    })
+    publish = _make_progress_publisher(task_id, label)
+
+    async def _run():
+        try:
+            await plib_service.reextract_halachot(cid, progress=publish)
+        except Exception as e:
+            logger.exception("re-extract halachot failed")
+            await _progress.set(task_id, {
+                "status": "failed", "error": str(e), "filename": label,
+            })
+
+    # The event loop keeps only weak references to tasks. Hold a strong
+    # reference in a lazily created module-level set until the task is done,
+    # otherwise it may be garbage-collected before extraction completes.
+    background = globals().setdefault("_PLIB_BACKGROUND_TASKS", set())
+    task = asyncio.create_task(_run())
+    background.add(task)
+    task.add_done_callback(background.discard)
+    return {"task_id": task_id}
+
+
+@app.get("/api/halachot")
+async def halachot_list(
+    case_law_id: str = "",
+    review_status: str = "",
+    practice_area: str = "",
+    limit: int = 200,
+    offset: int = 0,
+):
+    """List halachot, optionally filtered by precedent, status or area.
+
+    Blank filters are passed to ``db.list_halachot`` as ``None``. ``count``
+    is the number of rows in this page.
+
+    Raises:
+        HTTPException: 400 when a non-empty ``case_law_id`` is not a UUID.
+    """
+    precedent_id: UUID | None = None
+    if case_law_id:
+        try:
+            precedent_id = UUID(case_law_id)
+        except ValueError:
+            raise HTTPException(400, "case_law_id לא תקין")
+
+    query = {
+        "case_law_id": precedent_id,
+        "review_status": review_status if review_status else None,
+        "practice_area": practice_area if practice_area else None,
+        "limit": limit,
+        "offset": offset,
+    }
+    items = await db.list_halachot(**query)
+    return {"items": items, "count": len(items)}
+
+
+@app.patch("/api/halachot/{halacha_id}")
+async def halacha_update(halacha_id: str, req: HalachaUpdateRequest):
+    """Approve / reject / edit a halacha. Used by the chair review queue.
+
+    A blank ``review_status`` is treated the same as an omitted one. All
+    other request fields are passed through to ``db.update_halacha``; a
+    missing reviewer is sent as ``""``.
+
+    Raises:
+        HTTPException: 400 for a malformed ``halacha_id`` or a
+            ``review_status`` outside the allowed set; 404 when the DB layer
+            returns no row.
+    """
+    try:
+        hid = UUID(halacha_id)
+    except ValueError:
+        raise HTTPException(400, "halacha_id לא תקין") from None
+    # Normalise "" to None so an empty string can neither skip the whitelist
+    # below nor reach the DB layer as a status value.
+    status = req.review_status or None
+    if status is not None and status not in {
+        "pending_review", "approved", "rejected", "published",
+    }:
+        raise HTTPException(400, "review_status לא תקין")
+    row = await db.update_halacha(
+        halacha_id=hid,
+        review_status=status,
+        reviewer=req.reviewer or "",
+        rule_statement=req.rule_statement,
+        reasoning_summary=req.reasoning_summary,
+        subject_tags=req.subject_tags,
+        practice_areas=req.practice_areas,
+    )
+    if not row:
+        raise HTTPException(404, "הלכה לא נמצאה")
+    return row