# ── mavat auto-enrichment (Phase C trigger 2) ──────────────────────────────────
# When an extracted candidate is missing its validity (gazette_date / yalkut), we
# fill the gaps from the official source (mavat) via the host bridge. Conservative
# by design: only modern numeric plan numbers resolve on mavat search, each fetch
# drives a real browser (~30-60s, serial), so we gate by format + cap per call and
# fail soft. Set PLAN_ENRICH_FROM_MAVAT=0 to disable.

# Tokens (compared after strip + casefold) that switch a boolean env flag off.
_FALSY_ENV_VALUES = frozenset({"0", "false", "no", "off", ""})


def _env_flag(name: str, default: str = "1") -> bool:
    """Return True unless environment variable *name* holds a falsy token.

    The comparison is whitespace- and case-insensitive, so ``False``/``FALSE``
    disable the feature just like ``false`` does. *default* is the raw string
    used when the variable is unset.
    """
    return os.environ.get(name, default).strip().casefold() not in _FALSY_ENV_VALUES


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* as a non-negative int.

    Falls back to *default* when the variable is unset or not a valid integer
    (logging a warning in the latter case) instead of raising at import time.
    Negative values are clamped to 0.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        value = int(raw)  # int() already tolerates surrounding whitespace
    except ValueError:
        logging.getLogger(__name__).warning(
            "plan-enrich: ignoring invalid %s=%r — using default %d", name, raw, default
        )
        return default
    return max(value, 0)


_ENRICH_ENABLED = _env_flag("PLAN_ENRICH_FROM_MAVAT")
_ENRICH_MAX_PER_CALL = _env_int("PLAN_ENRICH_MAX_PER_CALL", 8)
# mavat search resolves the modern "NN-NNNNNNN" identifiers; legacy forms
# (מי/820, 5166/ב, תמ"א 38) don't, so don't waste a browser launch on them.
_MAVAT_NUM_RE = re.compile(r"^\d{2,4}-\d{6,8}$")
def _needs_enrichment(c: dict) -> bool:
    """Return True iff candidate *c* is worth a mavat lookup.

    That is the case when enrichment is enabled, the candidate's
    ``plan_number`` is a string matching the modern mavat-resolvable format
    (``_MAVAT_NUM_RE``, checked after stripping whitespace), and its validity
    is incomplete — ``gazette_date`` or ``yalkut_number`` is missing/empty.

    A missing, empty or non-string ``plan_number`` yields False rather than
    raising, so a malformed extracted candidate cannot break the caller here.
    """
    if not _ENRICH_ENABLED:
        return False
    plan_number = c.get("plan_number")
    if not isinstance(plan_number, str) or not _MAVAT_NUM_RE.match(plan_number.strip()):
        return False
    return not (c.get("gazette_date") and c.get("yalkut_number"))


async def _enrich_from_mavat(c: dict) -> tuple[dict, bool]:
    """Fill a candidate's MISSING fields from mavat.

    Values already present on the candidate are never overridden. The input
    dict is not mutated: when at least one field is filled a shallow copy
    carrying the additions is returned, otherwise the original object.

    Returns:
        ``(candidate, enriched)`` — ``enriched`` is True only when at least one
        field was actually filled from the fetched record.

    Fails soft: an unusable plan number, a bridge-down / not-found / blocked
    fetch, any unexpected fetch error, or a result that is not mapping-like
    leaves the candidate untouched. Each of those paths is logged.
    """
    # Imported at call time, as in the original code.
    from legal_mcp.services import plans_fetch

    raw_number = c.get("plan_number")
    if not isinstance(raw_number, str) or not raw_number.strip():
        logger.warning("plan-enrich: candidate has no usable plan_number — %r", raw_number)
        return c, False
    num = raw_number.strip()

    try:
        fetched = await plans_fetch.fetch_plan(num)
    except plans_fetch.PlanFetchUnavailable as e:
        logger.info("plan-enrich: bridge unavailable for %s — %s", num, e)
        return c, False
    except plans_fetch.PlanFetchError as e:
        logger.info("plan-enrich: mavat had no usable result for %s — %s", num, e)
        return c, False
    except Exception as e:  # noqa: BLE001 — never let enrichment break extraction
        # Keep the traceback: this branch is by definition the unexpected one.
        logger.warning("plan-enrich: unexpected error for %s — %s", num, e, exc_info=True)
        return c, False

    # The merge below needs a mapping-like result; anything else (e.g. None)
    # must not escape as an AttributeError after the fetch itself succeeded.
    if not callable(getattr(fetched, "get", None)):
        logger.warning(
            "plan-enrich: unexpected result type for %s — %s", num, type(fetched).__name__
        )
        return c, False

    enriched = dict(c)
    filled: list[str] = []
    for f in ("gazette_date", "yalkut_number", "display_name", "plan_type", "purpose"):
        # Only fill gaps: a value already on the candidate always wins.
        if not enriched.get(f) and fetched.get(f):
            enriched[f] = fetched[f]
            filled.append(f)
    if filled:
        logger.info("plan-enrich: %s filled %s from mavat (%s)",
                    num, ",".join(filled), fetched.get("source_url", ""))
        return enriched, True
    return c, False
"""Upsert extracted candidates into the registry as pending_review (G10). + + When ``enrich`` (default) and a candidate's validity is incomplete, its + missing fields are pulled from mavat first (capped per call). The row still + enters pending_review — enrichment changes the candidate, not the chair gate. + """ out: list[dict] = [] + enriched_count = 0 for c in candidates: + used = model_used + if enrich and enriched_count < _ENRICH_MAX_PER_CALL and _needs_enrichment(c): + c, did = await _enrich_from_mavat(c) + if did: + enriched_count += 1 + used = f"{model_used}+mavat" try: plan = await db.upsert_plan( plan_number=c["plan_number"], @@ -133,12 +200,18 @@ async def upsert_candidates( review_status="pending_review", source_case_number=source_case_number, source_document_id=source_document_id, - model_used=model_used, + model_used=used, ) out.append(plan) except ValueError as e: # Don't swallow — surface the bad candidate so it isn't silently dropped. logger.warning("upsert_candidates: skipped %r — %s", c.get("plan_number"), e) + if enrich and enriched_count >= _ENRICH_MAX_PER_CALL: + logger.warning( + "plan-enrich: hit the per-call cap (%d) — remaining candidates kept " + "as-extracted (no silent truncation; raise PLAN_ENRICH_MAX_PER_CALL).", + _ENRICH_MAX_PER_CALL, + ) return out