From 43621e830032f7b17ac04d70fb10447c12e59b5d Mon Sep 17 00:00:00 2001 From: Chaim Date: Wed, 17 Jun 2026 11:21:54 +0000 Subject: [PATCH] =?UTF-8?q?feat(plans):=20=D7=94=D7=A2=D7=A9=D7=A8=D7=94-?= =?UTF-8?q?=D7=90=D7=95=D7=98=D7=95=D7=9E=D7=98=D7=99=D7=AA=20=D7=A9=D7=9C?= =?UTF-8?q?=20=D7=AA=D7=95=D7=A7=D7=A3-=D7=AA=D7=91"=D7=A2=20=D7=9E-mavat?= =?UTF-8?q?=20=D7=91=D7=97=D7=99=D7=9C=D7=95=D7=A5=20(Phase=20C=20=D7=98?= =?UTF-8?q?=D7=A8=D7=99=D7=92=D7=A8=202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit כש-extract_plans מוצא מספר-תכנית עם תוקף-חסר (תאריך-רשומות / י"פ), upsert_candidates ממלא את החוסר מ-מנהל-התכנון לפני ה-upsert. הרשומה עדיין נכנסת pending_review — ההעשרה משנה את המועמד, לא את שער-היו"ר. שמרני בכוונה: - ממלא רק שדות-חסרים — לא דורס ערכים מעוגני-תיק (display_name/purpose מהחילוץ נשמרים). - מגודר לפורמט-mavat מודרני (\d{2,4}-\d{6,8}); מספרים-ישנים (מי/820, תמ"א 38) מדולגים (לא יבזבזו השקת-דפדפן). - תקרה PLAN_ENRICH_MAX_PER_CALL=8 (מתועד אם נחצה — בלי silent-cap). - fail-soft: גשר-למטה / לא-נמצא / חסום → המועמד נשאר כפי-שחולץ (לוג, לא בליעה שקטה). - דגל-כיבוי PLAN_ENRICH_FROM_MAVAT=0. - מקור-ההעשרה מסומן ב-model_used="claude_local+mavat". INV-AH: ערך-תוקף שנמשך נושא מקור (mavat); שדה-חסר נשאר ריק. G10: שער- היו"ר נשמר. G2: מרחיב את plans_fetch (#292), לא מסלול מקביל. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/legal_mcp/services/plans_extractor.py | 77 ++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/plans_extractor.py b/mcp-server/src/legal_mcp/services/plans_extractor.py index 2fe62c8..5b77353 100644 --- a/mcp-server/src/legal_mcp/services/plans_extractor.py +++ b/mcp-server/src/legal_mcp/services/plans_extractor.py @@ -13,6 +13,8 @@ claude_session המקומי בלבד (כמו שאר המחלצים) — לא Ant from __future__ import annotations import logging +import os +import re from uuid import UUID from legal_mcp.services import claude_session, db @@ -23,6 +25,18 @@ logger = logging.getLogger(__name__) # not a pinned model id — the session model is whatever is configured). MODEL_TAG = "claude_local" +# ── mavat auto-enrichment (Phase C trigger 2) ────────────────────────────────── +# When an extracted candidate is missing its validity (gazette_date / yalkut), we +# fill the gaps from the official source (mavat) via the host bridge. Conservative +# by design: only modern numeric plan numbers resolve on mavat search, each fetch +# drives a real browser (~30-60s, serial), so we gate by format + cap per call and +# fail soft. Set PLAN_ENRICH_FROM_MAVAT=0 to disable. +_ENRICH_ENABLED = os.environ.get("PLAN_ENRICH_FROM_MAVAT", "1").strip() not in ("0", "false", "") +_ENRICH_MAX_PER_CALL = int(os.environ.get("PLAN_ENRICH_MAX_PER_CALL", "8")) +# mavat search resolves the modern "NN-NNNNNNN" identifiers; legacy forms +# (מי/820, 5166/ב, תמ"א 38) don't, so don't waste a browser launch on them. +_MAVAT_NUM_RE = re.compile(r"^\d{2,4}-\d{6,8}$") + EXTRACT_PLANS_PROMPT = """אתה מחלץ מידע עובדתי על תכניות בניין-עיר (תב"ע) עבור מרשם-תכניות של ועדת ערר. 
@@ -112,16 +126,69 @@ async def extract_plans_from_text(text: str) -> list[dict]:
     return candidates
 
 
+def _needs_enrichment(c: dict) -> bool:
+    """A candidate is worth enriching iff its validity is incomplete AND its
+    number is a mavat-resolvable modern identifier."""
+    if not (_ENRICH_ENABLED and _MAVAT_NUM_RE.match((c.get("plan_number") or "").strip())):
+        return False
+    return not (c.get("gazette_date") and c.get("yalkut_number"))
+
+
+async def _enrich_from_mavat(c: dict) -> tuple[dict, bool]:
+    """Fill a candidate's MISSING fields from mavat (never override case-grounded
+    values). Returns (candidate, enriched?). Fails soft — a bridge-down / not-found
+    / blocked fetch leaves the candidate untouched (logged, never swallowed)."""
+    from legal_mcp.services import plans_fetch
+
+    num = c["plan_number"].strip()
+    try:
+        fetched = await plans_fetch.fetch_plan(num)
+    except plans_fetch.PlanFetchUnavailable as e:
+        logger.info("plan-enrich: bridge unavailable for %s — %s", num, e)
+        return c, False
+    except plans_fetch.PlanFetchError as e:
+        logger.info("plan-enrich: mavat had no usable result for %s — %s", num, e)
+        return c, False
+    except Exception as e:  # noqa: BLE001 — never let enrichment break extraction
+        logger.warning("plan-enrich: unexpected error for %s — %s", num, e)
+        return c, False
+
+    enriched = dict(c)
+    filled: list[str] = []
+    for f in ("gazette_date", "yalkut_number", "display_name", "plan_type", "purpose"):
+        if not enriched.get(f) and fetched.get(f):
+            enriched[f] = fetched[f]
+            filled.append(f)
+    if filled:
+        logger.info("plan-enrich: %s filled %s from mavat (%s)",
+                    num, ",".join(filled), fetched.get("source_url", ""))
+        return enriched, True
+    return c, False
+
+
 async def upsert_candidates(
     candidates: list[dict],
     *,
     source_case_number: str = "",
     source_document_id: UUID | None = None,
     model_used: str = MODEL_TAG,
+    enrich: bool = True,
 ) -> list[dict]:
-    """Upsert extracted candidates into the registry as pending_review (G10)."""
+    """Upsert extracted candidates into the registry as pending_review (G10).
+
+    When ``enrich`` (default) and a candidate's validity is incomplete, its
+    missing fields are pulled from mavat first (attempts capped per call). The row
+    still enters pending_review — enrichment changes the candidate, not the chair gate.
+    """
     out: list[dict] = []
+    attempts = 0  # mavat fetches started — failed ones count, they cost a fetch too
     for c in candidates:
+        used = model_used
+        if enrich and attempts < _ENRICH_MAX_PER_CALL and _needs_enrichment(c):
+            attempts += 1  # charge the cap before the fetch, success or not
+            c, did = await _enrich_from_mavat(c)
+            if did:
+                used = f"{model_used}+mavat"
         try:
             plan = await db.upsert_plan(
                 plan_number=c["plan_number"],
@@ -133,12 +200,18 @@ async def upsert_candidates(
                 review_status="pending_review",
                 source_case_number=source_case_number,
                 source_document_id=source_document_id,
-                model_used=model_used,
+                model_used=used,
             )
             out.append(plan)
         except ValueError as e:
             # Don't swallow — surface the bad candidate so it isn't silently dropped.
             logger.warning("upsert_candidates: skipped %r — %s", c.get("plan_number"), e)
+    if enrich and attempts >= _ENRICH_MAX_PER_CALL:
+        logger.warning(
+            "plan-enrich: hit the per-call cap (%d) — remaining candidates kept "
+            "as-extracted (no silent truncation; raise PLAN_ENRICH_MAX_PER_CALL).",
+            _ENRICH_MAX_PER_CALL,
+        )
     return out
 
 