feat(plans): העשרה-אוטומטית של תוקף-תב"ע מ-mavat בחילוץ (Phase C טריגר 2) #293

Merged
chaim merged 1 commits from worktree-plan-enrich-extractor into main 2026-06-17 11:28:04 +00:00

View File

@@ -13,6 +13,8 @@ claude_session המקומי בלבד (כמו שאר המחלצים) — לא Ant
from __future__ import annotations
import logging
import os
import re
from uuid import UUID
from legal_mcp.services import claude_session, db
@@ -23,6 +25,18 @@ logger = logging.getLogger(__name__)
# not a pinned model id — the session model is whatever is configured).
MODEL_TAG = "claude_local"
# ── mavat auto-enrichment (Phase C trigger 2) ──────────────────────────────────
# When an extracted candidate is missing its validity (gazette_date / yalkut), we
# fill the gaps from the official source (mavat) via the host bridge. Conservative
# by design: only modern numeric plan numbers resolve on mavat search, and each
# fetch drives a real browser (~30-60s, serial), so we gate by format + cap per
# call and fail soft.
#
# PLAN_ENRICH_FROM_MAVAT: set to 0 / false / no / off (any letter case) or to an
# empty value to disable. Unset, or any other value, keeps enrichment on.
_ENRICH_ENABLED = os.environ.get("PLAN_ENRICH_FROM_MAVAT", "1").strip().lower() not in (
    "0",
    "false",
    "no",
    "off",
    "",
)
# PLAN_ENRICH_MAX_PER_CALL: cap on mavat enrichments per upsert call (default 8).
# An empty value falls back to the default instead of crashing at import; a
# non-numeric value still raises ValueError so a typo is noticed immediately.
_ENRICH_MAX_PER_CALL = int(os.environ.get("PLAN_ENRICH_MAX_PER_CALL", "").strip() or "8")
# mavat search resolves the modern "NN-NNNNNNN" identifiers; legacy forms
# (מי/820, 5166/ב, תמ"א 38) don't, so don't waste a browser launch on them.
_MAVAT_NUM_RE = re.compile(r"^\d{2,4}-\d{6,8}$")
EXTRACT_PLANS_PROMPT = """אתה מחלץ מידע עובדתי על תכניות בניין-עיר (תב"ע) עבור מרשם-תכניות של ועדת ערר.
@@ -112,16 +126,69 @@ async def extract_plans_from_text(text: str) -> list[dict]:
return candidates
def _needs_enrichment(c: dict) -> bool:
    """Decide whether a candidate should be looked up on mavat.

    Returns True only when enrichment is switched on, the candidate's plan
    number matches the mavat-resolvable pattern, and at least one of the
    validity fields (``gazette_date`` / ``yalkut_number``) is empty or absent.
    """
    # Feature switch first, so a disabled feature never touches the candidate.
    if not _ENRICH_ENABLED:
        return False
    plan_number = (c.get("plan_number") or "").strip()
    if _MAVAT_NUM_RE.match(plan_number) is None:
        return False
    # Validity counts as complete only when both fields are populated.
    has_validity = bool(c.get("gazette_date") and c.get("yalkut_number"))
    return not has_validity
async def _enrich_from_mavat(c: dict) -> tuple[dict, bool]:
    """Fill a candidate's MISSING fields from mavat.

    Values already present on the candidate are never overridden. Fails soft:
    a bridge-down / not-found / blocked fetch, an empty result, or any
    unexpected error leaves the candidate untouched and is logged rather than
    swallowed silently.

    Args:
        c: Extracted candidate. ``c["plan_number"]`` must be a string.

    Returns:
        ``(candidate, enriched)``. When at least one field was taken from
        mavat this is a shallow copy of ``c`` with the gaps filled and
        ``True``; otherwise it is the original ``c`` and ``False``.
    """
    # Function-scope import, as in the original block.
    from legal_mcp.services import plans_fetch

    num = c["plan_number"].strip()
    try:
        fetched = await plans_fetch.fetch_plan(num)
    except plans_fetch.PlanFetchUnavailable as e:
        logger.info("plan-enrich: bridge unavailable for %s: %s", num, e)
        return c, False
    except plans_fetch.PlanFetchError as e:
        logger.info("plan-enrich: mavat had no usable result for %s: %s", num, e)
        return c, False
    except Exception as e:  # noqa: BLE001 — never let enrichment break extraction
        # Keep the traceback: this branch is by definition the unanticipated one.
        logger.warning("plan-enrich: unexpected error for %s: %s", num, e, exc_info=True)
        return c, False

    # A None/empty result has nothing to contribute; bail out before calling
    # .get() on it so a bad return value cannot break extraction either.
    if not fetched:
        logger.info("plan-enrich: mavat returned nothing for %s", num)
        return c, False

    enriched = dict(c)
    filled: list[str] = []
    for f in ("gazette_date", "yalkut_number", "display_name", "plan_type", "purpose"):
        # Only fill gaps; a value already on the candidate wins over mavat.
        if not enriched.get(f) and fetched.get(f):
            enriched[f] = fetched[f]
            filled.append(f)

    if filled:
        logger.info(
            "plan-enrich: %s filled %s from mavat (%s)",
            num,
            ",".join(filled),
            fetched.get("source_url", ""),
        )
        return enriched, True
    return c, False
async def upsert_candidates(
candidates: list[dict],
*,
source_case_number: str = "",
source_document_id: UUID | None = None,
model_used: str = MODEL_TAG,
enrich: bool = True,
) -> list[dict]:
"""Upsert extracted candidates into the registry as pending_review (G10)."""
"""Upsert extracted candidates into the registry as pending_review (G10).
When ``enrich`` (default) and a candidate's validity is incomplete, its
missing fields are pulled from mavat first (capped per call). The row still
enters pending_review — enrichment changes the candidate, not the chair gate.
"""
out: list[dict] = []
enriched_count = 0
for c in candidates:
used = model_used
if enrich and enriched_count < _ENRICH_MAX_PER_CALL and _needs_enrichment(c):
c, did = await _enrich_from_mavat(c)
if did:
enriched_count += 1
used = f"{model_used}+mavat"
try:
plan = await db.upsert_plan(
plan_number=c["plan_number"],
@@ -133,12 +200,18 @@ async def upsert_candidates(
review_status="pending_review",
source_case_number=source_case_number,
source_document_id=source_document_id,
model_used=model_used,
model_used=used,
)
out.append(plan)
except ValueError as e:
# Don't swallow — surface the bad candidate so it isn't silently dropped.
logger.warning("upsert_candidates: skipped %r%s", c.get("plan_number"), e)
if enrich and enriched_count >= _ENRICH_MAX_PER_CALL:
logger.warning(
"plan-enrich: hit the per-call cap (%d) — remaining candidates kept "
"as-extracted (no silent truncation; raise PLAN_ENRICH_MAX_PER_CALL).",
_ENRICH_MAX_PER_CALL,
)
return out