Files
legal-ai/mcp-server/tests/test_metadata_extract_chair_practice_area.py
Chaim 406e93b9bf
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 3s
Lint — undefined names / undefined-names (pull_request) Successful in 10s
fix(precedents): חילוץ-מטא-דאטה ממלא תחום (practice_area) ושם-יו"ר לכל החלטת-ועדה
שני פערים שצפו מ-/precedents בחילוץ-ההלכות:

1. **practice_area לא סומן** — השדה הועבר ל-LLM כקונטקסט-קריאה-בלבד ולא חולץ
   מעולם, כך שהעלאות שהשאירו אותו ריק נשארו ריקות והרדיו ב-/precedents הופיע
   ללא בחירה. עכשיו נגזר ב-apply_to_record: עדיפות לגזירה דטרמיניסטית מקידומת
   מספר-התיק (1xxx→rishuy, 8xxx→היטל, 9xxx→197 — מקור-אמת לדוקטי ועדת-ערר,
   INV-AH rule-based), ובנפילה — סיווג-תוכן של ה-LLM (שדה practice_area חדש
   בפרומפט, אנום-סגור) עבור פסקי-בית-משפט שהקידומת שלהם אינה מקודדת תחום.
   ממלא רק כשריק (G1 — נרמול במקור, לא תיקון-בקריאה).

2. **שם-יו"ר לא חולץ** (למשל 1132-09-24) — המיזוג היה מגודר על
   source_kind=='internal_committee' בלבד, ודילג בשקט על החלטות-ועדה שהועלו
   במסלול הפסיקה החיצוני (external_upload + source_type=appeals_committee, כמו
   החלטת ת"א מנבו) — היו"ר ישב בבלוק-החתימה אך לא חולץ. עכשיו מגודר על "האם זו
   החלטת-ועדה" (source_type/level אפקטיביים), לעולם לא על פסק-בית-משפט. ה-CHECK
   כופה non-empty רק ל-internal_committee, לכן כתיבה ל-external בטוחה.

חיזוק-פרומפט (לבקשת היו"ר): chair_name מציין מפורשות את בלוק-החתימה הדו-טורי
(מזכיר↔יו"ר — לקחת את צד-היו"ר) ומזהיר לא לחלץ יו"ר של פסקי-דין **מצוטטים**
בגוף ההחלטה.

UI (לוגיקה-בלבד, פטור משער-העיצוב): edit-sheet מסנכרן-מחדש מהרשומה הטרייה בכל
פתיחה (re-arm על סגירה) ו-usePrecedent עושה poll בזמן חילוץ — כך מילוי-רקע של
practice_area/chair_name מופיע בלי refresh מלא ("הכפתור לא נשאר מסומן").

בדיקות: test_metadata_extract_chair_practice_area.py (6 תרחישי-מיזוג, offline).

Invariants: G1 (נרמול-במקור), G2 (אותו extractor, לא מסלול מקביל),
INV-AH (גזירה דטרמיניסטית מועדפת, abstention כשאין ודאות).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 09:53:42 +00:00

166 lines
6.3 KiB
Python

"""Regression tests for two metadata-merge gaps surfaced from /precedents:

1. chair_name was filled ONLY for source_kind='internal_committee', so ועדת-ערר
   (appeals-committee) decisions uploaded via the EXTERNAL precedent path
   (source_kind='external_upload', source_type='appeals_committee' — e.g.
   1132-09-24, a Tel-Aviv decision pulled from נבו (Nevo)) never got their chair
   extracted even though it sits in the signature.
2. practice_area (the /precedents radio facet) was never set by extraction — it
   was passed to the LLM as read-only context only. Committee/court uploads that
   left it blank stayed blank, so the radio rendered nothing selected. It is now
   derived deterministically from the case_number prefix (authoritative for
   ועדת-ערר (appeals-committee) dockets), with the LLM's content classification
   as the fallback for court dockets whose prefix doesn't encode a domain.

Runs fully OFFLINE — monkeypatches the ``db`` calls ``apply_to_record`` makes.
"""
from __future__ import annotations
import asyncio
from uuid import uuid4
import pytest
from legal_mcp.services import db, precedent_metadata_extractor as pme
def _run(coro):
    """Drive *coro* to completion on a private event loop and return its result.

    A fresh loop is created for every call and closed afterwards, so the
    synchronous tests in this module can await ``apply_to_record`` without
    sharing event-loop state with one another. Any exception raised by the
    coroutine propagates to the caller.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        # Close the loop even when the coroutine raises, so it is never leaked.
        loop.close()
def _wire_db(monkeypatch: pytest.MonkeyPatch, record: dict) -> dict:
    """Replace the ``db`` functions ``apply_to_record`` uses with in-memory stubs.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture; it reverts the patches
            when the test finishes.
        record: the case-law row served by the stubbed ``get_case_law``.

    Returns:
        A dict that accumulates every keyword argument passed to the stubbed
        ``update_case_law``. It stays empty when no update is issued.
    """
    captured: dict = {}

    async def _get(_cid):
        # Hand out a copy so the caller never mutates the test's fixture dict.
        return dict(record)

    async def _update(_cid, **fields):
        # Record what was written, then echo back the merged row.
        captured.update(fields)
        return {**record, **fields}

    async def _collides(_cn, _cid):
        # Always report "no collision" for the case number.
        return False

    monkeypatch.setattr(db, "get_case_law", _get)
    monkeypatch.setattr(db, "update_case_law", _update)
    monkeypatch.setattr(db, "case_number_collides", _collides)
    # citation_formatted is pre-set in every fixture of this module, so the
    # deterministic formatter is never reached — stub it defensively anyway.
    monkeypatch.setattr(db, "format_precedent_citation", lambda *a, **k: "")
    return captured
def test_external_committee_decision_gets_chair_name(monkeypatch: pytest.MonkeyPatch) -> None:
    """source_kind=external_upload + source_type=appeals_committee → chair filled.

    The record has an empty ``chair_name``; the suggested value must be passed
    through to ``update_case_law``.
    """
    record = {
        "source_kind": "external_upload",
        "source_type": "appeals_committee",
        "case_number": "1132-09-24",
        "chair_name": "",
        "district": "תל אביב",
        "practice_area": "rishuy_uvniya",
        "citation_formatted": "ערר ... 1132-09-24",
    }
    captured = _wire_db(monkeypatch, record)
    suggested = {"chair_name": "מיכל דגני הלברשטם", "district": "תל אביב"}

    out = _run(pme.apply_to_record(uuid4(), suggested))

    assert out["updated"] is True
    assert captured.get("chair_name") == "מיכל דגני הלברשטם"
def test_court_ruling_never_gets_chair_name(monkeypatch: pytest.MonkeyPatch) -> None:
    """A court ruling is not a committee decision — chair must stay empty even if
    the model slips and returns one."""
    record = {
        "source_kind": "external_upload",
        "source_type": "court_ruling",
        "precedent_level": "עליון",
        "case_number": 'ע"א 4768/22',
        "chair_name": "",
        "district": "",
        "practice_area": "betterment_levy",
        "citation_formatted": 'ע"א 4768/22',
    }
    captured = _wire_db(monkeypatch, record)
    suggested = {"chair_name": "פלוני אלמוני"}

    _run(pme.apply_to_record(uuid4(), suggested))

    # The suggested chair must never reach update_case_law for a court ruling.
    assert "chair_name" not in captured
def test_practice_area_derived_from_case_number_prefix(monkeypatch: pytest.MonkeyPatch) -> None:
    """8xxx docket → betterment_levy, deterministically, even if the LLM
    suggested nothing (or something else)."""
    record = {
        "source_kind": "external_upload",
        "source_type": "appeals_committee",
        "case_number": "8126-03-25",
        "chair_name": "פלונית",
        "district": "ירושלים",
        "practice_area": "",
        "citation_formatted": "ערר ... 8126-03-25",
    }
    captured = _wire_db(monkeypatch, record)

    # Empty suggestion: the value can only come from the case-number prefix.
    out = _run(pme.apply_to_record(uuid4(), {}))

    assert out["updated"] is True
    assert captured.get("practice_area") == "betterment_levy"
def test_practice_area_falls_back_to_llm_for_court_docket(monkeypatch: pytest.MonkeyPatch) -> None:
    """A Supreme-Court docket prefix (4xxx) encodes no domain → use the LLM's
    content classification."""
    record = {
        "source_kind": "external_upload",
        "source_type": "court_ruling",
        "precedent_level": "עליון",
        "case_number": 'ע"א 4768/22',
        "chair_name": "",
        "district": "",
        "practice_area": "",
        "citation_formatted": 'ע"א 4768/22',
    }
    captured = _wire_db(monkeypatch, record)

    out = _run(pme.apply_to_record(uuid4(), {"practice_area": "betterment_levy"}))

    # Writing the fallback value must be reported as an update, as in the
    # sibling tests that assert on ``out["updated"]``.
    assert out["updated"] is True
    assert captured.get("practice_area") == "betterment_levy"
def test_practice_area_not_overwritten_when_present(monkeypatch: pytest.MonkeyPatch) -> None:
    """An existing practice_area (chair-set or earlier derivation) is preserved —
    the prefix derivation only fills the blank."""
    record = {
        "source_kind": "external_upload",
        "source_type": "appeals_committee",
        "case_number": "8126-03-25",  # prefix would say betterment_levy
        "chair_name": "פלונית",
        "district": "ירושלים",
        "practice_area": "compensation_197",  # but a human said 197 — keep it
        "citation_formatted": "ערר ... 8126-03-25",
    }
    captured = _wire_db(monkeypatch, record)

    # Neither the prefix nor a conflicting LLM suggestion may replace the value.
    _run(pme.apply_to_record(uuid4(), {"practice_area": "rishuy_uvniya"}))

    assert "practice_area" not in captured
def test_invalid_llm_practice_area_is_dropped(monkeypatch: pytest.MonkeyPatch) -> None:
    """The LLM returning a non-domain value (legacy 'appeals_committee' / free text)
    must not be written — and with no usable prefix, practice_area stays blank."""
    record = {
        "source_kind": "external_upload",
        "source_type": "court_ruling",
        "precedent_level": "עליון",
        "case_number": 'ע"א 4768/22',
        "chair_name": "",
        "district": "",
        "practice_area": "",
        "citation_formatted": 'ע"א 4768/22',
    }
    captured = _wire_db(monkeypatch, record)

    _run(pme.apply_to_record(uuid4(), {"practice_area": "appeals_committee"}))

    assert "practice_area" not in captured