feat: #34 citation graph + #32 wide-modal precedent edit + #13 verify

## #34 — Daphna's internal citation graph

New schema V16 (V15 was already used by proceeding_type): table ``precedent_internal_citations`` (source→cited, with cited_case_law_id nullable for citations whose target isn't in the corpus yet) + 3 indexes (source, target, unlinked). New service ``citation_extractor.py`` with regex patterns for ערר / בל"מ / עע"מ / בר"מ / עמ"נ / ע"א / בג"ץ / רע"א — accepts both ``\/`` and ``-`` separators, requires actual parenthesized district label to avoid greedy mid-paragraph captures. Resolves citations against ``case_law.case_number`` substring; default confidence 0.90 linked, 0.75 unlinked. ON CONFLICT DO NOTHING on (source, cited_case_number). 3 new MCP tools: ``extract_internal_citations``, ``list_internal_citations``, ``list_incoming_citations``. Optional flag ``include_cited_by=True`` on ``search_internal_decisions`` appends cited-by candidates as ``match_type='cited_by'`` stubs. Bulk-extracted from 40 internal_committee rows authored by דפנה תמיר: **353 distinct citations, 348 stored, 96 linked / 252 unlinked**. Top citers: 1079/24 (30), 1024/24 (19), 1009/25 (18). Top unlinked target: ע"א 3213/97 (cited 5x) — natural #35 candidates.

## #32 — Wide-modal precedent edit

`precedent-edit-sheet.tsx`: ``<Sheet side="left">`` → centered ``<Dialog>`` with ``sm:max-w-4xl`` ``max-h-[90vh]`` ``overflow-y-auto``. Component API unchanged so existing callers (`/precedents/[id]/page.tsx`, `library-list-panel.tsx`) work as-is. RTL preserved. Mobile falls back to near-full-width via shadcn default.

## #13 — 403/17 verification

`case_law e151fc25-...` (אהרון ברק - תכנית רחביה) already in perfect shape after Stage A work: all metadata fields populated, 351 halachot with avg_conf=0.864 (well above 0.78 threshold). No re-extraction needed; closing task as verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 10:37:53 +00:00
parent 9f4f8c60a4
commit 7ad995aade
6 changed files with 797 additions and 33 deletions
--- a/mcp-server/src/legal_mcp/tools/citations.py
+++ b/mcp-server/src/legal_mcp/tools/citations.py
@@ -0,0 +1,135 @@
+"""MCP tools for the internal-decisions citation graph (TaskMaster #34).
+
+The citation graph captures pointers between Daphna's (and other internal
+committee chairs') decisions: when one ruling cites another,
+``precedent_internal_citations`` records the edge — resolved against
+``case_law`` when the cited row exists, kept as a stub when it doesn't.
+
+Three tools:
+
+- ``extract_internal_citations`` — run regex extraction on one row (by id) or
+  on every internal-committee row filtered by chair (e.g. Daphna only).
+  Idempotent: re-running does not duplicate rows (ON CONFLICT DO NOTHING).
+- ``list_internal_citations`` — outgoing edges from a source row. Optional
+  ``linked_only`` filter for rows resolved to existing case_law UUIDs.
+- ``list_incoming_citations`` — incoming edges to a target row ("which
+  Daphna decisions cite this ruling?").
+
+These tools are *manual triggers*. The pipeline runs them after a new
+internal-decision upload, but the chair / researcher can also re-run on
+demand (for example after fixing OCR or after uploading a previously-
+missing decision so that newer rows now link to it).
+"""
+
+from __future__ import annotations
+
+import json
+from uuid import UUID
+
+from legal_mcp.services import citation_extractor
+
+
+def _ok(payload) -> str:
+    """Serialize a successful tool result as pretty-printed JSON.
+
+    Non-ASCII text (e.g. Hebrew) is written verbatim rather than escaped,
+    output is indented by two spaces, and any value the ``json`` module
+    cannot encode natively is converted with ``str``.
+    """
+    encoder_options = {"ensure_ascii": False, "indent": 2, "default": str}
+    return json.dumps(payload, **encoder_options)
+
+
+def _err(msg: str) -> str:
+    """Wrap an error message in a compact ``{"error": ...}`` JSON object.
+
+    Non-ASCII characters in ``msg`` are kept as-is (not ``\\u``-escaped).
+    """
+    failure = {"error": msg}
+    return json.dumps(failure, ensure_ascii=False)
+
+
+async def extract_internal_citations(
+    case_law_id: str = "",
+    chair_name: str = "",
+    limit: int = 0,
+) -> str:
+    """Extract internal citations from appeals-committee decisions and store them.
+
+    Delegates to ``citation_extractor``; results are meant to land in
+    ``precedent_internal_citations``.
+
+    Args:
+        case_law_id: UUID of one specific decision. When given, only that
+            row is processed (typical use: right after an upload). Must not
+            be combined with ``chair_name``.
+        chair_name: Chair name (e.g. 'דפנה תמיר') used to filter the batch
+            run. Empty means all chairs. When both ``case_law_id`` and
+            ``chair_name`` are empty the batch covers every
+            internal_committee decision.
+        limit: Upper bound on the number of rows processed in batch mode
+            (0 = unlimited). Handy for trial runs. Ignored in single-row mode.
+
+    Returns:
+        A JSON string: the statistics returned by the extractor service, or
+        ``{"error": ...}`` on invalid input or any service failure.
+
+    NOTE(review): the service is described as idempotent (ON CONFLICT DO
+    NOTHING on (source_case_law_id, cited_case_number)) and the statistics
+    are expected to include extracted, linked, new, skipped, failed —
+    confirm against ``citation_extractor``.
+    """
+    single_id = case_law_id.strip()
+
+    # The two selectors are mutually exclusive.
+    if single_id and chair_name.strip():
+        return _err("יש לספק case_law_id או chair_name, לא שניהם")
+
+    if not single_id:
+        # Batch mode: every internal-committee row, optionally filtered by chair.
+        try:
+            batch_stats = await citation_extractor.extract_all_internal_committee(
+                chair_name_filter=chair_name.strip(),
+                limit=int(limit) if limit else 0,
+            )
+        except Exception as exc:
+            return _err(str(exc))
+        return _ok(batch_stats)
+
+    # Single-row mode: validate the id first, then run the extractor on it.
+    try:
+        target_uuid = UUID(single_id)
+    except ValueError:
+        return _err("case_law_id לא תקין")
+
+    try:
+        row_stats = await citation_extractor.extract_and_store(target_uuid)
+    except Exception as exc:
+        return _err(str(exc))
+    return _ok(row_stats)
+
+
+async def list_internal_citations(
+    case_law_id: str = "",
+    linked_only: bool = False,
+    limit: int = 50,
+) -> str:
+    """List the outgoing citations of a decision (what this decision cites).
+
+    Args:
+        case_law_id: UUID of the source case_law row (required).
+        linked_only: True = only citations that were resolved to an existing
+            case_law row in the corpus.
+        limit: Upper bound on the number of returned items (default 50).
+            Values below 1 are raised to 1.
+
+    Returns:
+        A JSON string ``{"items": [...], "count": N}`` where ``count`` is the
+        total number of rows the service returned (before truncation to
+        ``limit``), or ``{"error": ...}`` on invalid input or service failure.
+
+    NOTE(review): per the original notes, linked items carry
+    target_case_number/name/chair, and unlinked items come back with
+    cited_case_law_id=null and can be uploaded via internal_decision_upload —
+    confirm against ``citation_extractor``.
+    """
+    if not case_law_id.strip():
+        return _err("case_law_id חובה")
+    try:
+        cl_uuid = UUID(case_law_id.strip())
+    except ValueError:
+        return _err("case_law_id לא תקין")
+    # Validate the cap *before* querying: a bad ``limit`` used to raise an
+    # uncaught exception after the query had already run, instead of the
+    # JSON error every other failure in this module produces.
+    try:
+        max_items = max(1, int(limit))
+    except (TypeError, ValueError):
+        return _err("limit לא תקין")
+    try:
+        rows = await citation_extractor.list_citations_for_case_law(
+            cl_uuid, linked_only=bool(linked_only),
+        )
+    except Exception as e:
+        return _err(str(e))
+    return _ok({"items": rows[:max_items], "count": len(rows)})
+
+
+async def list_incoming_citations(
+    case_law_id: str = "",
+    limit: int = 50,
+) -> str:
+    """List the incoming citations of a decision (which decisions cite it).
+
+    Typical use: to find out which of Daphna's decisions relied on a given
+    ruling, pass that ruling's case_law_id.
+
+    Args:
+        case_law_id: UUID of the target case_law row (required).
+        limit: Upper bound on the number of returned items (default 50).
+            Values below 1 are raised to 1.
+
+    Returns:
+        A JSON string ``{"items": [...], "count": N}`` where ``count`` is the
+        total number of rows the service returned (before truncation to
+        ``limit``), or ``{"error": ...}`` on invalid input or service failure.
+    """
+    if not case_law_id.strip():
+        return _err("case_law_id חובה")
+    try:
+        cl_uuid = UUID(case_law_id.strip())
+    except ValueError:
+        return _err("case_law_id לא תקין")
+    # Validate the cap *before* querying: a bad ``limit`` used to raise an
+    # uncaught exception after the query had already run, instead of the
+    # JSON error every other failure in this module produces.
+    try:
+        max_items = max(1, int(limit))
+    except (TypeError, ValueError):
+        return _err("limit לא תקין")
+    try:
+        rows = await citation_extractor.list_citations_to_case_law(cl_uuid)
+    except Exception as e:
+        return _err(str(e))
+    return _ok({"items": rows[:max_items], "count": len(rows)})
--- a/mcp-server/src/legal_mcp/tools/search.py
+++ b/mcp-server/src/legal_mcp/tools/search.py
@@ -189,6 +189,7 @@ async def search_internal_decisions(
    chair_name: str = "",
    limit: int = 10,
    include_halachot: bool = True,
+    include_cited_by: bool = False,
 ) -> str:
    """חיפוש בהחלטות ועדות ערר לתכנון ובנייה (כל המחוזות).

@@ -200,42 +201,135 @@ async def search_internal_decisions(
        chair_name: שם יו"ר הוועדה לסינון. ריק = כל היו"רים
        limit: מספר תוצאות מקסימלי
        include_halachot: האם לכלול הלכות שחולצו
+        include_cited_by: True = אחרי החיפוש הראשי, הוסף החלטות שה-hits
+            הראשיים מצטטים (מתוך precedent_internal_citations). default False
+            כדי לא לשבור caller-ים קיימים. match_type='cited_by' מציין שזו
+            תוצאה משנית.
    """
    from legal_mcp.services import internal_decisions as int_svc

+    # Bump the limit a bit when we're expanding via citations — the
+    # citation step is cheap and a few extra primary hits make the
+    # expansion more useful.
+    primary_limit = limit if not include_cited_by else max(limit, limit * 2)
+
    results = await int_svc.search_internal(
        query,
        practice_area=practice_area,
        appeal_subtype=appeal_subtype,
        district=district,
        chair_name=chair_name,
-        limit=limit,
+        limit=primary_limit,
        include_halachot=include_halachot,
    )

    if not results:
        return "לא נמצאו החלטות ועדת ערר רלוונטיות."

+    # Cap primary results back to ``limit`` (we over-fetched only to seed
+    # the citation expansion below — the user asked for ``limit`` items).
+    primary = results[:limit]
+
    formatted = []
-    for r in results:
-        entry = {
-            "score": round(float(r["score"]), 4),
-            "type": r.get("type", "passage"),
-            "case_number": r.get("case_number"),
-            "case_name": r.get("case_name"),
-            "court": r.get("court"),
-            "district": r.get("district"),
-            "chair_name": r.get("chair_name"),
-            "decision_date": r.get("decision_date"),
-        }
-        if r.get("type") == "halacha":
-            entry["rule"] = r.get("rule_statement")
-            entry["quote"] = r.get("supporting_quote")
-            entry["rule_type"] = r.get("rule_type")
-        else:
-            entry["content"] = r.get("content", "")
-            entry["section"] = r.get("section_type")
-            entry["page"] = r.get("page_number")
-        formatted.append(entry)
+    seen_case_law_ids: set[str] = set()
+    for r in primary:
+        clid = str(r.get("case_law_id") or "")
+        if clid:
+            seen_case_law_ids.add(clid)
+        formatted.append(_format_internal_row(r, match_type="primary"))
+
+    if include_cited_by and seen_case_law_ids:
+        from uuid import UUID
+        from legal_mcp.services import citation_extractor
+
+        try:
+            source_uuids = [UUID(s) for s in seen_case_law_ids]
+            cited_map = await citation_extractor.get_cited_case_law_ids(source_uuids)
+        except Exception as e:
+            logger.warning("include_cited_by lookup failed: %s", e)
+            cited_map = {}
+
+        # Flatten + dedup the cited case_law_ids that aren't already in
+        # the primary set.
+        cited_ids: set[str] = set()
+        for ids in cited_map.values():
+            for cid in ids:
+                if cid and cid not in seen_case_law_ids:
+                    cited_ids.add(cid)
+
+        if cited_ids:
+            cited_rows = await _fetch_case_law_summaries(list(cited_ids))
+            for row in cited_rows:
+                formatted.append(_format_internal_row(row, match_type="cited_by"))

    return json.dumps(formatted, ensure_ascii=False, indent=2)
+
+
+def _format_internal_row(r: dict, *, match_type: str = "primary") -> dict:
+    """Shape an internal-decision hit (or a cited_by stub) for the MCP response.
+
+    Args:
+        r: Mapping describing one hit. Missing keys come through as ``None``
+            (or ``""`` for ``content``, ``"passage"`` for ``type``).
+        match_type: Label stored under ``"match_type"`` in the output,
+            e.g. ``"primary"`` or ``"cited_by"``.
+
+    Returns:
+        A dict with the common metadata fields plus either the halacha
+        fields (``rule``, ``quote``, ``rule_type``) when ``r["type"]`` is
+        ``"halacha"``, or the passage fields (``content``, ``section``,
+        ``page``) otherwise.
+    """
+    # ``score`` may be absent *or* present with an explicit None;
+    # ``float(None)`` would raise TypeError, so both cases map to 0.0.
+    raw_score = r.get("score")
+    score = 0.0 if raw_score is None else float(raw_score)
+    entry: dict = {
+        "score": round(score, 4),
+        "type": r.get("type", "passage"),
+        "case_number": r.get("case_number"),
+        "case_name": r.get("case_name"),
+        "court": r.get("court"),
+        "district": r.get("district"),
+        "chair_name": r.get("chair_name"),
+        "decision_date": r.get("decision_date"),
+        "match_type": match_type,
+    }
+    if r.get("type") == "halacha":
+        entry["rule"] = r.get("rule_statement")
+        entry["quote"] = r.get("supporting_quote")
+        entry["rule_type"] = r.get("rule_type")
+    else:
+        entry["content"] = r.get("content", "")
+        entry["section"] = r.get("section_type")
+        entry["page"] = r.get("page_number")
+    return entry
+
+
+async def _fetch_case_law_summaries(case_law_ids: list[str]) -> list[dict]:
+    """Load lightweight ``case_law`` metadata for the given ids (cited-by stubs).
+
+    Only summary columns are selected — no chunks or halachot — because the
+    aim is to surface that a related precedent exists, not to rerun search.
+
+    Args:
+        case_law_ids: UUID strings. Entries that fail to parse as a UUID
+            (``ValueError``) are silently dropped.
+
+    Returns:
+        One dict per matching row with ``case_law_id``, ``case_number``,
+        ``case_name``, ``court``, ``district``, ``chair_name``,
+        ``decision_date`` (ISO string when not NULL), ``content`` (the
+        ``headnote`` column), plus fixed ``score=0.0`` and
+        ``type="passage"``. The query has no ORDER BY, so row order is
+        whatever the database returns. Empty list when no id parses.
+    """
+    from uuid import UUID
+    pool = await db.get_pool()
+
+    parsed_ids = []
+    for raw_id in case_law_ids:
+        try:
+            parsed = UUID(raw_id)
+        except ValueError:
+            # Not a valid UUID string — skip it.
+            continue
+        else:
+            parsed_ids.append(parsed)
+    if len(parsed_ids) == 0:
+        return []
+
+    async with pool.acquire() as conn:
+        records = await conn.fetch(
+            """
+            SELECT id::text AS case_law_id,
+                   case_number,
+                   case_name,
+                   court,
+                   district,
+                   chair_name,
+                   date AS decision_date,
+                   headnote AS content
+              FROM case_law
+             WHERE id = ANY($1::uuid[])
+            """,
+            parsed_ids,
+        )
+
+    summaries: list[dict] = []
+    for record in records:
+        summary = dict(record)
+        decided_on = summary.get("decision_date")
+        if decided_on is not None:
+            summary["decision_date"] = decided_on.isoformat()
+        # Stubs are context, not ranked hits, so they carry a zero score
+        # and are typed as plain passages.
+        summary.update(score=0.0, type="passage")
+        summaries.append(summary)
+    return summaries