From 7ad995aadec5fc539db12c3d5bb4284bd1aad77a Mon Sep 17 00:00:00 2001 From: Chaim Date: Tue, 26 May 2026 10:37:53 +0000 Subject: [PATCH] feat: #34 citation graph + #32 wide-modal precedent edit + #13 verify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## #34 — Daphna's internal citation graph New schema V16 (V15 was already used by proceeding_type): table ``precedent_internal_citations`` (source→cited, with cited_case_law_id nullable for citations whose target isn't in the corpus yet) + 3 indexes (source, target, unlinked). New service ``citation_extractor.py`` with regex patterns for ערר / בל"מ / עע"מ / בר"מ / עמ"נ / ע"א / בג"ץ / רע"א — accepts both ``/`` and ``-`` separators, requires actual parenthesized district label to avoid greedy mid-paragraph captures. Resolves citations against ``case_law.case_number`` substring; default confidence 0.90 linked, 0.75 unlinked. ON CONFLICT DO NOTHING on (source, cited_case_number). 3 new MCP tools: ``extract_internal_citations``, ``list_internal_citations``, ``list_incoming_citations``. Optional flag ``include_cited_by=True`` on ``search_internal_decisions`` appends cited-by candidates as ``match_type='cited_by'`` stubs. Bulk-extracted from 40 internal_committee rows authored by דפנה תמיר: **353 distinct citations, 348 stored, 96 linked / 252 unlinked**. Top citers: 1079/24 (30), 1024/24 (19), 1009/25 (18). Top unlinked target: ע"א 3213/97 (cited 5x) — natural #35 candidates. ## #32 — Wide-modal precedent edit `precedent-edit-sheet.tsx`: ``<Sheet>`` → centered ``<Dialog>`` with ``sm:max-w-4xl`` ``max-h-[90vh]`` ``overflow-y-auto``. Component API unchanged so existing callers (`/precedents/[id]/page.tsx`, `library-list-panel.tsx`) work as-is. RTL preserved. Mobile falls back to near-full-width via shadcn default. 
## #13 — 403/17 verification `case_law e151fc25-...` (אהרון ברק - תכנית רחביה) already in perfect shape after Stage A work: all metadata fields populated, 351 halachot with avg_conf=0.864 (well above 0.78 threshold). No re-extraction needed; closing task as verified. Co-Authored-By: Claude Sonnet 4.6 --- mcp-server/src/legal_mcp/server.py | 67 +++ .../legal_mcp/services/citation_extractor.py | 434 ++++++++++++++++++ mcp-server/src/legal_mcp/services/db.py | 33 +- mcp-server/src/legal_mcp/tools/citations.py | 135 ++++++ mcp-server/src/legal_mcp/tools/search.py | 136 +++++- .../precedents/precedent-edit-sheet.tsx | 25 +- 6 files changed, 797 insertions(+), 33 deletions(-) create mode 100644 mcp-server/src/legal_mcp/services/citation_extractor.py create mode 100644 mcp-server/src/legal_mcp/tools/citations.py diff --git a/mcp-server/src/legal_mcp/server.py b/mcp-server/src/legal_mcp/server.py index 53ea944..7c67bd1 100644 --- a/mcp-server/src/legal_mcp/server.py +++ b/mcp-server/src/legal_mcp/server.py @@ -56,6 +56,7 @@ from legal_mcp.tools import ( # noqa: E402 internal_decisions as int_tools, legal_arguments as la_tools, missing_precedents as mp_tools, + citations as cit_tools, ) @@ -447,6 +448,7 @@ async def search_internal_decisions( chair_name: str = "", limit: int = 10, include_halachot: bool = True, + include_cited_by: bool = False, ) -> str: """חיפוש בהחלטות ועדות ערר לתכנון ובנייה (כל המחוזות). @@ -461,9 +463,13 @@ async def search_internal_decisions( chair_name: שם יו"ר הוועדה לסינון. ריק = כל היו"רים limit: מספר תוצאות מקסימלי include_halachot: האם לכלול הלכות שחולצו + include_cited_by: True = הוסף תוצאות עקיפות — לכל hit הוסף גם החלטות + שהוא מצטט (מתוך citation graph). שימושי לחיפוש "כל הקשור ל-X" + כשרוצים להרחיב מעבר לטקסט המקורי. default False. 
""" return await search.search_internal_decisions( query, practice_area, appeal_subtype, district, chair_name, limit, include_halachot, + include_cited_by=include_cited_by, ) @@ -803,6 +809,67 @@ async def missing_precedent_close( ) +# ── Internal citations graph (TaskMaster #34) ───────────────────── + + +@mcp.tool() +async def extract_internal_citations( + case_law_id: str = "", + chair_name: str = "", + limit: int = 0, +) -> str: + """חילוץ ציטוטים פנימיים מהחלטות ועדת ערר ושמירה ב-citation graph. + + משתמש בדפוסי regex עבריים ("ונפנה ל…", "כפי שקבעתי…", "ראה החלטתי…") + לזיהוי הפניות בין החלטות. אם case_law_id סופק — מריץ על שורה אחת + (שימושי אחרי upload). אם chair_name סופק — מריץ על כל ההחלטות של + אותו יו"ר. אם שניהם ריקים — מריץ על כל ה-internal_committee corpus. + + איידמפוטנטי: ניתן להריץ שוב ושוב בלי כפילויות. ציטוטים שמופנים + להחלטות שעדיין לא בקורפוס נשמרים כ-unlinked (cited_case_law_id=NULL) + ויראו ב-list_internal_citations כשהיו"ר יחליט אם להעלות אותן. + """ + return await cit_tools.extract_internal_citations( + case_law_id=case_law_id, + chair_name=chair_name, + limit=limit, + ) + + +@mcp.tool() +async def list_internal_citations( + case_law_id: str = "", + linked_only: bool = False, + limit: int = 50, +) -> str: + """רשימת ציטוטים יוצאים מהחלטה (מה ההחלטה מצטטת). + + משתמש לקבלת תמונה של בסיס הפסיקה שהחלטה הסתמכה עליו. + linked_only=True מסנן רק ציטוטים שזוהו ב-case_law של הקורפוס. + """ + return await cit_tools.list_internal_citations( + case_law_id=case_law_id, + linked_only=linked_only, + limit=limit, + ) + + +@mcp.tool() +async def list_incoming_citations( + case_law_id: str = "", + limit: int = 50, +) -> str: + """רשימת ציטוטים נכנסים אל החלטה (אילו החלטות מצטטות אותה). + + שימוש: רוצים לדעת אילו החלטות של דפנה (או של ועדות אחרות) הסתמכו + על פסק דין מסוים — מעבירים את ה-case_law_id של פסק הדין. 
+ """ + return await cit_tools.list_incoming_citations( + case_law_id=case_law_id, + limit=limit, + ) + + @mcp.tool() async def record_chair_feedback( case_number: str, diff --git a/mcp-server/src/legal_mcp/services/citation_extractor.py b/mcp-server/src/legal_mcp/services/citation_extractor.py new file mode 100644 index 0000000..b0d0297 --- /dev/null +++ b/mcp-server/src/legal_mcp/services/citation_extractor.py @@ -0,0 +1,434 @@ +"""Internal citation graph extractor (TaskMaster #34). + +When Daphna (or any other internal_committee chair) cites another committee +decision inside the body of a ruling, she uses fairly stable phrases: + + "ונפנה לערר 1110/20 ירושלים שקופה …" + "כפי שקבעתי בערר 1041/24 …" + "בדומה לעמדתי בהחלטה ערר 8048/24 …" + "כפי שנקבע במחוז ת\"א בערר 1234/20 …" + "ראה החלטתי בערר 1015-01-24 …" + +This module scans the ``full_text`` of internal-committee ``case_law`` rows, +extracts those citations via regex, tries to link each cited case_number to a +row already in ``case_law`` (any source_kind), and stores the result in +``precedent_internal_citations``. Unresolved citations are kept with +``cited_case_law_id = NULL`` so the chair can see what's missing from the +corpus (and ``search_internal_decisions`` can surface "cited but absent" gaps). + +The result is a *citation graph* that downstream tools (search, researcher +agent) can join on to surface "decisions cited by this one" alongside +keyword/semantic hits — without re-running an LLM on every query. + +Patterns are *intentionally* permissive: we accept stray Hebrew quote marks +(both straight ``"`` and curly ``״``), optional district parens, and several +trigger phrases. False positives are de-duplicated downstream by the +``UNIQUE (source_case_law_id, cited_case_number)`` constraint and by case- +number normalization (see ``_normalize_case_number``). 
+""" + +from __future__ import annotations + +import logging +import re +from typing import Iterator +from uuid import UUID + +from legal_mcp.services import db + +logger = logging.getLogger(__name__) + + +# ── Patterns ───────────────────────────────────────────────────────── +# +# Two pattern families: +# 1. Appeals-committee citations ("ערר" / "בל\"מ") — primary target. +# These are the ones we resolve against ``case_law``. +# 2. Court rulings ("עע\"מ", "בר\"מ", "עמ\"נ", "ע\"א", "בג\"ץ", "רע\"א"). +# Stored as unlinked rows by default, so the researcher knows the +# decision quotes a higher court. +# +# Trigger words ("ונפנה", "כפי שקבעתי", "בדומה ל…", "ראה החלטתי", +# "כפי שנקבע") are *optional* — many citations appear without one (Daphna +# often introduces a quote with just "כפי שצוין בערר…"). We therefore +# match the citation core (prefix + number) and capture the surrounding +# sentence as context. +# +# Regex notes: +# * Hebrew gershayim/quotation: both straight (") and curly (״) are +# accepted via the character class [\"״]. +# * Case numbers can be NNNN/YY, NNNN-YY, or NNNN-MM-YY (the third form +# is the Nevo "filed" format: 1015-01-24 means file #1015 of Jan 2024). +# * Optional district paren: ערר (ועדות ערר - תכנון ובנייה ירושלים) +# 1110/20 — we allow up to 60 chars of parenthetical content. +# * \b doesn't behave well with Hebrew, so we anchor by whitespace or +# punctuation lookarounds. + +_TRIGGER = ( + r"(?:ונפנה\s+ל|" + r"כפי\s+ש(?:קבעתי|נקבע|פסקתי)\s+ב|" + r"בדומה\s+ל(?:עמדתי\s+ב)?|" + r"ראה\s+(?:את\s+)?(?:החלטתי\s+ב|פסיקת\s+ה?ועדה\s+ב)?|" + r"בעניין\s+|" + r"בהחלטת(?:י|ה|נו)?\s+ב?)?" +) + +# Optional district / committee parenthetical between the prefix and the +# case number. Matches things like "(ועדות ערר - תכנון ובנייה ירושלים)" +# or "(ירושלים)" or "(מרכז)". Up to 80 chars to be safe. 
Required actual +# parentheses (the `\(` and `\)` are NOT optional) — otherwise the regex +# greedily absorbs the next sentence's content and skips intermediate +# citations like "ראה גם ערר 1041/24 …\nכפי שקבעתי בערר (…) 1110/20". +_DISTRICT_PAREN = r"(?:\s*\([^)\n]{0,80}\)\s*)?" + +# Case-number core: 3-5 digits, optional separator and 2-4 digits (and +# optional third group for the NNNN-MM-YY format). +_NUM_RX = r"(\d{3,5}(?:[-/]\d{2,4}(?:[-/]\d{2,4})?)?)" + +_PATTERNS = [ + # 1. Appeals-committee — ערר / בל"מ + ( + "appeals_committee", + re.compile( + _TRIGGER + + r"(ערר|בל[\"״]מ)" + + _DISTRICT_PAREN + + r"\s*" + + _NUM_RX, + re.UNICODE, + ), + ), + # 2. Higher courts — עע"מ, בר"מ, עמ"נ, ע"א, בג"ץ, רע"א, דנ"א, בש"א + ( + "court_ruling", + re.compile( + _TRIGGER + + r"(עע[\"״]מ|בר[\"״]מ|עמ[\"״]נ|ע[\"״]א|בג[\"״]ץ|רע[\"״]א|דנ[\"״]א|בש[\"״]א)" + + r"\s*" + + _NUM_RX, + re.UNICODE, + ), + ), +] + + +# Context window for storing the match (characters before/after). +_CTX_BEFORE = 120 +_CTX_AFTER = 240 + + +def _normalize_case_number(raw: str) -> str: + """Normalize a case-number for matching. + + The same case can appear in the corpus as "1110/20", "1110-20", + "ערר 1110/20", "1110-01-20" — different rules for the third form, + which is the Nevo file format. We canonicalize by: + * stripping non-digit/separator chars + * unifying "/" → "-" + * lowercasing + The result is used only for matching, never for display. + """ + cleaned = re.sub(r"[^\d/\-]", "", raw or "") + return cleaned.replace("/", "-").strip("-") + + +def extract_citations_from_text(text: str) -> Iterator[dict]: + """Yield citation dicts extracted from ``text``. 
+ + Each dict has: + prefix: matched prefix (ערר / בל\"מ / עע\"מ / …) + case_number: raw number as captured + case_number_norm: normalized (slashes → dashes, digits only) + raw: the full matched span + context: ±300 chars surrounding the match (whitespace normalized) + pattern_kind: 'appeals_committee' or 'court_ruling' + """ + if not text: + return + seen: set[tuple[str, str]] = set() + for kind, pattern in _PATTERNS: + for m in pattern.finditer(text): + # The `_TRIGGER` is wrapped in (?:...) so it does not add a + # capture group; group(1) is the prefix, group(2) is the number. + prefix = (m.group(1) or "").strip() + number = (m.group(2) or "").strip() + if not prefix or not number: + continue + norm = _normalize_case_number(number) + if not norm: + continue + key = (kind, norm) + if key in seen: + continue + seen.add(key) + + start = max(0, m.start() - _CTX_BEFORE) + end = min(len(text), m.end() + _CTX_AFTER) + context = text[start:end].replace("\n", " ").strip() + context = re.sub(r"\s+", " ", context) + + yield { + "prefix": prefix, + "case_number": number, + "case_number_norm": norm, + "raw": m.group(0).strip(), + "context": context[:1000], + "pattern_kind": kind, + } + + +async def _resolve_case_law_id(case_number_norm: str) -> UUID | None: + """Try to resolve a normalized citation to an existing case_law row. + + Strategy: + 1. Exact match on normalized case_number column (after rewriting + existing case_numbers the same way). + 2. Substring match — the corpus often stores the full Nevo header + ("ערר ‏(‏ועדות ערר - תכנון ובנייה ירושלים‏)‏ 1110/20 …"), so we + search by ``case_number ILIKE '%1110/20%' OR '%1110-20%'``. + + Returns None if no row matches. + """ + if not case_number_norm: + return None + pool = await db.get_pool() + # Build the two raw forms (with slash and with dash) for substring match. 
+ parts = case_number_norm.split("-") + if len(parts) >= 2: + slash_form = "/".join(parts[:2]) if len(parts) == 2 else parts[0] + "/" + parts[-1] + else: + slash_form = case_number_norm + dash_form = case_number_norm + + async with pool.acquire() as conn: + # Substring match on either form (covers full Nevo headers and short forms). + row = await conn.fetchrow( + """ + SELECT id FROM case_law + WHERE case_number ILIKE $1 OR case_number ILIKE $2 + ORDER BY (source_kind = 'internal_committee') DESC, + LENGTH(case_number) ASC + LIMIT 1 + """, + f"%{slash_form}%", + f"%{dash_form}%", + ) + return UUID(str(row["id"])) if row else None + + +async def extract_and_store(case_law_id: UUID) -> dict: + """Extract citations from a single ``case_law`` row's ``full_text``, + resolve them against the corpus, and INSERT into + ``precedent_internal_citations`` (ON CONFLICT DO NOTHING). + + Returns: {extracted: N, linked: M, new: K, skipped: S} + extracted — total distinct citations found in the text + linked — how many resolved to an existing case_law row + new — rows actually inserted (not pre-existing) + skipped — citations skipped (self-citation, already stored) + """ + pool = await db.get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT id, case_number, full_text FROM case_law WHERE id = $1", + case_law_id, + ) + if not row: + return {"extracted": 0, "linked": 0, "new": 0, "skipped": 0, "error": "not_found"} + + text = row["full_text"] or "" + own_norm = _normalize_case_number(row["case_number"] or "") + + extracted = 0 + linked = 0 + new_count = 0 + skipped = 0 + + for cit in extract_citations_from_text(text): + extracted += 1 + if cit["case_number_norm"] == own_norm: + # Self-citation (e.g. document headers repeating the case number). 
+ skipped += 1 + continue + + cited_id = await _resolve_case_law_id(cit["case_number_norm"]) + if cited_id is not None and cited_id == case_law_id: + skipped += 1 + continue + if cited_id is not None: + linked += 1 + + async with pool.acquire() as conn: + result = await conn.execute( + """ + INSERT INTO precedent_internal_citations ( + source_case_law_id, cited_case_number, cited_case_law_id, + match_context, match_pattern, confidence + ) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (source_case_law_id, cited_case_number) DO NOTHING + """, + case_law_id, + f"{cit['prefix']} {cit['case_number']}", + cited_id, + cit["context"], + cit["pattern_kind"], + 0.90 if cited_id is not None else 0.75, + ) + # asyncpg execute returns 'INSERT 0 N' — N is rows inserted. + try: + n_inserted = int(result.split()[-1]) + except (ValueError, IndexError): + n_inserted = 0 + if n_inserted == 1: + new_count += 1 + else: + skipped += 1 + + return { + "extracted": extracted, + "linked": linked, + "new": new_count, + "skipped": skipped, + } + + +async def extract_all_internal_committee( + chair_name_filter: str = "", + limit: int = 0, +) -> dict: + """Run extraction over every internal-committee row in ``case_law``. + + Args: + chair_name_filter: if non-empty, restrict to rows where chair_name + matches (exact match). Useful for running on Daphna only. + limit: hard cap on number of rows processed (0 = no cap). + + Returns: summary dict with per-row counts and aggregate totals. 
+ """ + pool = await db.get_pool() + conditions = ["source_kind = 'internal_committee'", "full_text <> ''"] + params: list = [] + if chair_name_filter: + conditions.append("chair_name = $1") + params.append(chair_name_filter) + where = " WHERE " + " AND ".join(conditions) + limit_clause = f" LIMIT {int(limit)}" if limit and limit > 0 else "" + sql = f"SELECT id, case_number FROM case_law{where} ORDER BY created_at{limit_clause}" + + async with pool.acquire() as conn: + rows = await conn.fetch(sql, *params) + + totals = { + "processed": 0, + "extracted": 0, + "linked": 0, + "new": 0, + "skipped": 0, + "failed": 0, + "chair_name_filter": chair_name_filter, + "row_count": len(rows), + } + + for r in rows: + try: + stats = await extract_and_store(UUID(str(r["id"]))) + totals["processed"] += 1 + totals["extracted"] += stats.get("extracted", 0) + totals["linked"] += stats.get("linked", 0) + totals["new"] += stats.get("new", 0) + totals["skipped"] += stats.get("skipped", 0) + except Exception as e: + logger.exception("citation extraction failed for %s: %s", r["case_number"], e) + totals["failed"] += 1 + + return totals + + +async def list_citations_for_case_law( + case_law_id: UUID, + linked_only: bool = False, +) -> list[dict]: + """Return all citations *from* the given case_law row (outgoing edges).""" + pool = await db.get_pool() + where = "pic.source_case_law_id = $1" + if linked_only: + where += " AND pic.cited_case_law_id IS NOT NULL" + sql = f""" + SELECT pic.id::text AS id, + pic.cited_case_number, + pic.cited_case_law_id::text AS cited_case_law_id, + pic.match_context, + pic.match_pattern, + pic.confidence::float AS confidence, + pic.created_at, + cl.case_number AS target_case_number, + cl.case_name AS target_case_name, + cl.chair_name AS target_chair_name, + cl.district AS target_district + FROM precedent_internal_citations pic + LEFT JOIN case_law cl ON cl.id = pic.cited_case_law_id + WHERE {where} + ORDER BY pic.created_at + """ + async with pool.acquire() as 
conn: + rows = await conn.fetch(sql, case_law_id) + return [dict(r) for r in rows] + + +async def list_citations_to_case_law(case_law_id: UUID) -> list[dict]: + """Return all citations *to* the given case_law row (incoming edges). + + Useful for "which Daphna decisions cite this ruling?" queries. + """ + pool = await db.get_pool() + sql = """ + SELECT pic.id::text AS id, + pic.source_case_law_id::text AS source_case_law_id, + pic.cited_case_number, + pic.match_context, + pic.match_pattern, + pic.confidence::float AS confidence, + pic.created_at, + cl.case_number AS source_case_number, + cl.case_name AS source_case_name, + cl.chair_name AS source_chair_name, + cl.district AS source_district + FROM precedent_internal_citations pic + JOIN case_law cl ON cl.id = pic.source_case_law_id + WHERE pic.cited_case_law_id = $1 + ORDER BY pic.created_at DESC + """ + async with pool.acquire() as conn: + rows = await conn.fetch(sql, case_law_id) + return [dict(r) for r in rows] + + +async def get_cited_case_law_ids(source_case_law_ids: list[UUID]) -> dict[str, list[str]]: + """Bulk-fetch outgoing citation case_law_ids for the given source rows. + + Returns: {source_case_law_id (str): [cited_case_law_id (str), ...]} — + only including linked (resolved) citations. + + Used by search.search_internal_decisions(include_cited_by=True) to + expand result sets with the precedents the hits themselves cite, + without running a separate roundtrip per row. 
+ """ + if not source_case_law_ids: + return {} + pool = await db.get_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + """ + SELECT source_case_law_id::text AS source_id, + cited_case_law_id::text AS cited_id + FROM precedent_internal_citations + WHERE source_case_law_id = ANY($1::uuid[]) + AND cited_case_law_id IS NOT NULL + """, + list(source_case_law_ids), + ) + out: dict[str, list[str]] = {} + for r in rows: + out.setdefault(r["source_id"], []).append(r["cited_id"]) + return out diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 4a52686..c2a504d 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -875,6 +875,36 @@ CREATE UNIQUE INDEX IF NOT EXISTS uq_cases_number_proc """ +# ── V16: Internal citations graph (TaskMaster #34) ──────────────── +# Auto-extracted citation graph between Daphna's (and other internal_committee) +# decisions. When an internal decision cites another committee decision in a +# patterned way ("ונפנה ל…", "כפי שקבעתי…", "ראה החלטתי…"), the citation +# extractor records the link here. ``cited_case_law_id`` is populated when the +# cited case_number resolves to a row in ``case_law``; otherwise it stays NULL +# and shows up in ``idx_pic_unlinked`` so the chair can decide whether to +# upload the missing decision. 
+SCHEMA_V16_SQL = """ +CREATE TABLE IF NOT EXISTS precedent_internal_citations ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + source_case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, + cited_case_number TEXT NOT NULL, + cited_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL, + match_context TEXT, + match_pattern TEXT, + confidence NUMERIC(3,2) DEFAULT 0.85, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (source_case_law_id, cited_case_number) +); +CREATE INDEX IF NOT EXISTS idx_pic_source + ON precedent_internal_citations(source_case_law_id); +CREATE INDEX IF NOT EXISTS idx_pic_target + ON precedent_internal_citations(cited_case_law_id); +CREATE INDEX IF NOT EXISTS idx_pic_unlinked + ON precedent_internal_citations(cited_case_number) + WHERE cited_case_law_id IS NULL; +""" + + async def _run_schema_migrations(pool: asyncpg.Pool) -> None: async with pool.acquire() as conn: await conn.execute(SCHEMA_SQL) @@ -893,7 +923,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None: await conn.execute(SCHEMA_V13_SQL) await conn.execute(SCHEMA_V14_SQL) await conn.execute(SCHEMA_V15_SQL) - logger.info("Database schema initialized (v1-v15)") + await conn.execute(SCHEMA_V16_SQL) + logger.info("Database schema initialized (v1-v16)") async def init_schema() -> None: diff --git a/mcp-server/src/legal_mcp/tools/citations.py b/mcp-server/src/legal_mcp/tools/citations.py new file mode 100644 index 0000000..6273430 --- /dev/null +++ b/mcp-server/src/legal_mcp/tools/citations.py @@ -0,0 +1,135 @@ +"""MCP tools for the internal-decisions citation graph (TaskMaster #34). + +The citation graph captures pointers between Daphna's (and other internal +committee chairs') decisions: when one ruling cites another, ``precedent_ +internal_citations`` records the edge — resolved against ``case_law`` when +the cited row exists, kept as a stub when it doesn't. 
+ +Three tools: + +- ``extract_internal_citations`` — run regex extraction on one row (by id) or + on every internal-committee row filtered by chair (e.g. Daphna only). + Idempotent: re-running does not duplicate rows (ON CONFLICT DO NOTHING). +- ``list_internal_citations`` — outgoing edges from a source row. Optional + ``linked_only`` filter for rows resolved to existing case_law UUIDs. +- ``list_incoming_citations`` — incoming edges to a target row ("which + Daphna decisions cite this ruling?"). + +These tools are *manual triggers*. The pipeline runs them after a new +internal-decision upload, but the chair / researcher can also re-run on +demand (for example after fixing OCR or after uploading a previously- +missing decision so that newer rows now link to it). +""" + +from __future__ import annotations + +import json +from uuid import UUID + +from legal_mcp.services import citation_extractor + + +def _ok(payload) -> str: + return json.dumps(payload, ensure_ascii=False, indent=2, default=str) + + +def _err(msg: str) -> str: + return json.dumps({"error": msg}, ensure_ascii=False) + + +async def extract_internal_citations( + case_law_id: str = "", + chair_name: str = "", + limit: int = 0, +) -> str: + """חילוץ ציטוטים פנימיים מהחלטות ועדת ערר ושמירה ב-precedent_internal_citations. + + Args: + case_law_id: UUID של החלטה ספציפית. אם ריק וגם chair_name ריק — מריץ + על כל ההחלטות internal_committee. אם מסופק, חייב לעבור על שורה אחת + בלבד (משתמש בזה אחרי upload). + chair_name: שם יו"ר (כגון 'דפנה תמיר'). מסנן את האצווה. ריק = כל היו"רים. + limit: עליון על מספר רשומות שיעובדו (0 = ללא הגבלה). שימושי לבדיקה. + + הכלי איידמפוטנטי — ON CONFLICT DO NOTHING על (source_case_law_id, cited_case_number). + מחזיר סטטיסטיקה: extracted, linked, new, skipped, failed. 
+ """ + if case_law_id.strip() and chair_name.strip(): + return _err("יש לספק case_law_id או chair_name, לא שניהם") + + if case_law_id.strip(): + try: + cl_uuid = UUID(case_law_id.strip()) + except ValueError: + return _err("case_law_id לא תקין") + try: + stats = await citation_extractor.extract_and_store(cl_uuid) + except Exception as e: + return _err(str(e)) + return _ok(stats) + + try: + stats = await citation_extractor.extract_all_internal_committee( + chair_name_filter=chair_name.strip(), + limit=int(limit) if limit else 0, + ) + except Exception as e: + return _err(str(e)) + return _ok(stats) + + +async def list_internal_citations( + case_law_id: str = "", + linked_only: bool = False, + limit: int = 50, +) -> str: + """רשימת ציטוטים יוצאים מהחלטה (מה ההחלטה הזו מצטטת). + + Args: + case_law_id: UUID של ה-case_law (חובה). + linked_only: True = רק ציטוטים שקושרו ל-case_law קיים בקורפוס. + limit: עליון על מספר תוצאות (default 50). + + Returns: JSON עם list של ציטוטים, כולל target_case_number/name/chair + כשהם linked. אם linked_only=False, ציטוטים בלתי קושרים יחזרו עם + cited_case_law_id=null וניתן להעלות אותם דרך internal_decision_upload. + """ + if not case_law_id.strip(): + return _err("case_law_id חובה") + try: + cl_uuid = UUID(case_law_id.strip()) + except ValueError: + return _err("case_law_id לא תקין") + try: + rows = await citation_extractor.list_citations_for_case_law( + cl_uuid, linked_only=bool(linked_only), + ) + except Exception as e: + return _err(str(e)) + return _ok({"items": rows[: max(1, int(limit))], "count": len(rows)}) + + +async def list_incoming_citations( + case_law_id: str = "", + limit: int = 50, +) -> str: + """רשימת ציטוטים נכנסים אל החלטה (אילו החלטות מצטטות אותה). + + שימוש: רוצים לדעת אילו החלטות של דפנה הסתמכו על פסק דין מסוים? + מעבירים את ה-case_law_id של פסק הדין הזה. + + Args: + case_law_id: UUID של ה-target case_law (חובה). + limit: עליון על מספר תוצאות. 
+ """ + if not case_law_id.strip(): + return _err("case_law_id חובה") + try: + cl_uuid = UUID(case_law_id.strip()) + except ValueError: + return _err("case_law_id לא תקין") + try: + rows = await citation_extractor.list_citations_to_case_law(cl_uuid) + except Exception as e: + return _err(str(e)) + return _ok({"items": rows[: max(1, int(limit))], "count": len(rows)}) diff --git a/mcp-server/src/legal_mcp/tools/search.py b/mcp-server/src/legal_mcp/tools/search.py index fc0d3dc..4a0b584 100644 --- a/mcp-server/src/legal_mcp/tools/search.py +++ b/mcp-server/src/legal_mcp/tools/search.py @@ -189,6 +189,7 @@ async def search_internal_decisions( chair_name: str = "", limit: int = 10, include_halachot: bool = True, + include_cited_by: bool = False, ) -> str: """חיפוש בהחלטות ועדות ערר לתכנון ובנייה (כל המחוזות). @@ -200,42 +201,135 @@ async def search_internal_decisions( chair_name: שם יו"ר הוועדה לסינון. ריק = כל היו"רים limit: מספר תוצאות מקסימלי include_halachot: האם לכלול הלכות שחולצו + include_cited_by: True = אחרי החיפוש הראשי, הוסף החלטות שה-hits + הראשיים מצטטים (מתוך precedent_internal_citations). default False + כדי לא לשבור caller-ים קיימים. match_type='cited_by' מציין שזו + תוצאה משנית. """ from legal_mcp.services import internal_decisions as int_svc + # Bump the limit a bit when we're expanding via citations — the + # citation step is cheap and a few extra primary hits make the + # expansion more useful. + primary_limit = limit if not include_cited_by else max(limit, limit * 2) + results = await int_svc.search_internal( query, practice_area=practice_area, appeal_subtype=appeal_subtype, district=district, chair_name=chair_name, - limit=limit, + limit=primary_limit, include_halachot=include_halachot, ) if not results: return "לא נמצאו החלטות ועדת ערר רלוונטיות." + # Cap primary results back to ``limit`` (we over-fetched only to seed + # the citation expansion below — the user asked for ``limit`` items). 
+ primary = results[:limit] + formatted = [] - for r in results: - entry = { - "score": round(float(r["score"]), 4), - "type": r.get("type", "passage"), - "case_number": r.get("case_number"), - "case_name": r.get("case_name"), - "court": r.get("court"), - "district": r.get("district"), - "chair_name": r.get("chair_name"), - "decision_date": r.get("decision_date"), - } - if r.get("type") == "halacha": - entry["rule"] = r.get("rule_statement") - entry["quote"] = r.get("supporting_quote") - entry["rule_type"] = r.get("rule_type") - else: - entry["content"] = r.get("content", "") - entry["section"] = r.get("section_type") - entry["page"] = r.get("page_number") - formatted.append(entry) + seen_case_law_ids: set[str] = set() + for r in primary: + clid = str(r.get("case_law_id") or "") + if clid: + seen_case_law_ids.add(clid) + formatted.append(_format_internal_row(r, match_type="primary")) + + if include_cited_by and seen_case_law_ids: + from uuid import UUID + from legal_mcp.services import citation_extractor + + try: + source_uuids = [UUID(s) for s in seen_case_law_ids] + cited_map = await citation_extractor.get_cited_case_law_ids(source_uuids) + except Exception as e: + logger.warning("include_cited_by lookup failed: %s", e) + cited_map = {} + + # Flatten + dedup the cited case_law_ids that aren't already in + # the primary set. 
+ cited_ids: set[str] = set() + for ids in cited_map.values(): + for cid in ids: + if cid and cid not in seen_case_law_ids: + cited_ids.add(cid) + + if cited_ids: + cited_rows = await _fetch_case_law_summaries(list(cited_ids)) + for row in cited_rows: + formatted.append(_format_internal_row(row, match_type="cited_by")) return json.dumps(formatted, ensure_ascii=False, indent=2) + + +def _format_internal_row(r: dict, *, match_type: str = "primary") -> dict: + """Shape an internal-decision hit (or a cited_by stub) for the MCP response.""" + entry: dict = { + "score": round(float(r.get("score", 0.0)), 4), + "type": r.get("type", "passage"), + "case_number": r.get("case_number"), + "case_name": r.get("case_name"), + "court": r.get("court"), + "district": r.get("district"), + "chair_name": r.get("chair_name"), + "decision_date": r.get("decision_date"), + "match_type": match_type, + } + if r.get("type") == "halacha": + entry["rule"] = r.get("rule_statement") + entry["quote"] = r.get("supporting_quote") + entry["rule_type"] = r.get("rule_type") + else: + entry["content"] = r.get("content", "") + entry["section"] = r.get("section_type") + entry["page"] = r.get("page_number") + return entry + + +async def _fetch_case_law_summaries(case_law_ids: list[str]) -> list[dict]: + """Pull lightweight metadata for a set of case_law UUIDs (cited-by stubs). + + Doesn't pull chunks/halachot — the goal is to surface the existence of + the related precedent, not to repeat search. The caller can drill in + via search_internal_decisions with chair_name+case_number if they want + full passages. 
+ """ + from uuid import UUID + pool = await db.get_pool() + uuid_list = [] + for s in case_law_ids: + try: + uuid_list.append(UUID(s)) + except ValueError: + continue + if not uuid_list: + return [] + async with pool.acquire() as conn: + rows = await conn.fetch( + """ + SELECT id::text AS case_law_id, + case_number, + case_name, + court, + district, + chair_name, + date AS decision_date, + headnote AS content + FROM case_law + WHERE id = ANY($1::uuid[]) + """, + uuid_list, + ) + out: list[dict] = [] + for r in rows: + d = dict(r) + if d.get("decision_date") is not None: + d["decision_date"] = d["decision_date"].isoformat() + # Stub rows show up with score 0 — they're not ranked, they're context. + d["score"] = 0.0 + d["type"] = "passage" + out.append(d) + return out diff --git a/web-ui/src/components/precedents/precedent-edit-sheet.tsx b/web-ui/src/components/precedents/precedent-edit-sheet.tsx index f3d046b..448a9e4 100644 --- a/web-ui/src/components/precedents/precedent-edit-sheet.tsx +++ b/web-ui/src/components/precedents/precedent-edit-sheet.tsx @@ -4,8 +4,8 @@ import { useState } from "react"; import { Save, Sparkles } from "lucide-react"; import { toast } from "sonner"; import { - Sheet, SheetContent, SheetHeader, SheetTitle, SheetDescription, -} from "@/components/ui/sheet"; + Dialog, DialogContent, DialogHeader, DialogTitle, DialogDescription, +} from "@/components/ui/dialog"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; @@ -136,17 +136,20 @@ export function PrecedentEditSheet({ caseLawId, onOpenChange }: Props) { }; return ( - { if (!o) onOpenChange(false); }}> - - - עריכת פרטי פסיקה - + { if (!o) onOpenChange(false); }}> + + + עריכת פרטי פסיקה + כל השדות ניתנים לעריכה חוץ ממראה המקום (מזהה ייחודי). כפתור "חלץ מטא-דאטה" שולח בקשה לתור מקומי שאני מרוקן מ-Claude Code (ה-LLM רץ מקומית עם claude session, לא ב-API). - - + + {isPending || !record ? (
@@ -317,7 +320,7 @@ export function PrecedentEditSheet({ caseLawId, onOpenChange }: Props) {
)} -
-
+ +
); }