From 26d09d648fea149be9cd2935f1580c207f69c24a Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 11 Apr 2026 16:36:48 +0000 Subject: [PATCH] Practice area separation: multi-tenant axis across DB, RAG, and UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two orthogonal columns — practice_area (top-level legal domain: appeals_committee / national_insurance / labor_law) and appeal_subtype (building_permit / betterment_levy / compensation_197) — stored on cases and denormalized into documents, document_chunks, decisions, and style_corpus so vector searches can filter without JOINs. Why: the system handles two unrelated sub-domains under the same appeals committee (1xxx building permits and 8xxx/9xxx betterment/197), with different rules and writing style. Without a separation axis, search_similar() and the block-writer's precedent lookup were free to surface betterment-levy paragraphs while drafting a building-permit decision — a real risk of cross-domain contamination. The same axis also lets future domains (national insurance, labor law) coexist without separate schemas. Schema (V4 migration in db.py): - ALTER ... ADD COLUMN IF NOT EXISTS on all five tables + indexes led by practice_area (composite with appeal_subtype on cases and style_corpus; practice_area only on document_chunks and decisions). - Idempotent backfill (fills NULLs only): existing cases get practice_area = appeals_committee; case_number ~ '^1[0-9]{3}' → building_permit, '^8[0-9]{3}' → betterment_levy, '^9[0-9]{3}' → compensation_197, anything else → unknown; propagated to documents, chunks, and decisions via case_id; training-corpus rows (case_id NULL) default to appeals_committee. Code: - New services/practice_area.py with derive_subtype, validate, and is_override + enum constants. - db.create_case stores both fields; create_document / store_chunks / create_decision inherit them from the parent case (create_document and store_chunks also take an explicit override for the case_id=None training corpus). - db.search_similar and search_similar_paragraphs accept practice_area + appeal_subtype filters using the denormalized columns. - tools/search.py auto-resolves the filter from case_number when given. 
- block_writer._build_precedents_context now passes the active case's practice_area to search_similar_paragraphs — closes the contamination hole for the discussion-block precedent fetch. - tools/cases.case_create auto-derives subtype from case_number; an explicit override that disagrees writes a case_subtype_override entry to audit_log so we can spot bad classifications later. - tools/documents.document_upload_training tags new training material with practice_area + subtype end-to-end (corpus, document, chunks). UI (web/static/index.html + web/app.py): - New-case wizard gets a practice_area dropdown (others disabled until national_insurance / labor_law arrive) and an appeal_subtype dropdown with JS auto-fill from the case-number prefix; manual edits stick. - Case header shows a blue badge with practice_area · subtype. - CaseCreateRequest plumbs both fields through to cases_tools.case_create. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/legal_mcp/services/block_writer.py | 7 +- mcp-server/src/legal_mcp/services/db.py | 213 ++++++++++++++++-- .../src/legal_mcp/services/practice_area.py | 96 ++++++++ mcp-server/src/legal_mcp/tools/cases.py | 29 ++- mcp-server/src/legal_mcp/tools/documents.py | 25 +- mcp-server/src/legal_mcp/tools/search.py | 55 ++++- web/app.py | 4 + web/static/index.html | 73 +++++- 8 files changed, 468 insertions(+), 34 deletions(-) create mode 100644 mcp-server/src/legal_mcp/services/practice_area.py diff --git a/mcp-server/src/legal_mcp/services/block_writer.py b/mcp-server/src/legal_mcp/services/block_writer.py index 08599e5..94e05be 100644 --- a/mcp-server/src/legal_mcp/services/block_writer.py +++ b/mcp-server/src/legal_mcp/services/block_writer.py @@ -476,12 +476,17 @@ async def _build_precedents_context(case_id: UUID, block_id: str) -> str: case = await db.get_case(case_id) case_number = case.get("case_number", "") if case else "" subject = case.get("subject", "") if case else "" + practice_area = case.get("practice_area") if case 
else None + appeal_subtype = case.get("appeal_subtype") if case else None query = f"דיון משפטי בנושא {subject}" if subject else "דיון משפטי ועדת ערר" query_emb = await embeddings.embed_query(query) - # Search 1: paragraph_embeddings (from other decisions by Dafna) + # Search 1: paragraph_embeddings (from other decisions by Dafna). + # Filter by practice_area + appeal_subtype so we don't pull a + # betterment-levy paragraph when writing a building-permit decision. para_results = await db.search_similar_paragraphs( query_embedding=query_emb, limit=10, block_type="block-yod", + practice_area=practice_area, appeal_subtype=appeal_subtype, ) # Filter out same case para_results = [r for r in para_results if r.get("case_number", "") != case_number] diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 1f8a4a5..c577c57 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -200,6 +200,76 @@ CREATE TABLE IF NOT EXISTS appeal_type_rules ( ALTER TABLE decision_blocks ADD COLUMN IF NOT EXISTS image_placeholders JSONB DEFAULT '[]'; """ +# ── Phase 4: Practice area separation (multi-tenant axis) ────────── + +SCHEMA_V4_SQL = """ +-- ═══════════════════════════════════════════════════════════════════ +-- practice_area = top-level legal domain (multi-tenant axis): +-- appeals_committee | national_insurance | labor_law | ... +-- appeal_subtype = refines within practice_area: +-- building_permit | betterment_levy | compensation_197 | unknown +-- Both columns are denormalized to documents/chunks/decisions/style_corpus +-- so vector searches can filter without expensive JOINs. 
+-- ═══════════════════════════════════════════════════════════════════ + +ALTER TABLE cases ADD COLUMN IF NOT EXISTS practice_area TEXT; +ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT; +ALTER TABLE documents ADD COLUMN IF NOT EXISTS practice_area TEXT; +ALTER TABLE documents ADD COLUMN IF NOT EXISTS appeal_subtype TEXT; +ALTER TABLE document_chunks ADD COLUMN IF NOT EXISTS practice_area TEXT; +ALTER TABLE document_chunks ADD COLUMN IF NOT EXISTS appeal_subtype TEXT; +ALTER TABLE decisions ADD COLUMN IF NOT EXISTS practice_area TEXT; +ALTER TABLE decisions ADD COLUMN IF NOT EXISTS appeal_subtype TEXT; +ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT; +ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT; + +CREATE INDEX IF NOT EXISTS idx_cases_practice + ON cases(practice_area, appeal_subtype); +CREATE INDEX IF NOT EXISTS idx_chunks_practice + ON document_chunks(practice_area); +CREATE INDEX IF NOT EXISTS idx_corpus_practice + ON style_corpus(practice_area, appeal_subtype); +CREATE INDEX IF NOT EXISTS idx_decisions_practice + ON decisions(practice_area); + +-- Backfill (idempotent — only fills NULLs) +UPDATE cases SET practice_area = 'appeals_committee' WHERE practice_area IS NULL; +UPDATE cases SET appeal_subtype = CASE + WHEN case_number ~ '^1[0-9]{3}' THEN 'building_permit' + WHEN case_number ~ '^8[0-9]{3}' THEN 'betterment_levy' + WHEN case_number ~ '^9[0-9]{3}' THEN 'compensation_197' + ELSE 'unknown' +END WHERE appeal_subtype IS NULL; + +UPDATE documents d + SET practice_area = c.practice_area, appeal_subtype = c.appeal_subtype + FROM cases c + WHERE d.case_id = c.id AND d.practice_area IS NULL; + +UPDATE document_chunks dc + SET practice_area = c.practice_area, appeal_subtype = c.appeal_subtype + FROM cases c + WHERE dc.case_id = c.id AND dc.practice_area IS NULL; + +UPDATE decisions de + SET practice_area = c.practice_area, appeal_subtype = c.appeal_subtype + FROM cases c + WHERE de.case_id = c.id AND 
de.practice_area IS NULL; + +-- All existing style_corpus entries are דפנה's appeals-committee decisions +UPDATE style_corpus SET practice_area = 'appeals_committee' WHERE practice_area IS NULL; + +-- Training corpus documents/chunks have case_id = NULL. All historical +-- training material is from דפנה's appeals committee, so default them. +UPDATE documents SET practice_area = 'appeals_committee' + WHERE case_id IS NULL AND practice_area IS NULL; + +UPDATE document_chunks dc + SET practice_area = d.practice_area, appeal_subtype = d.appeal_subtype + FROM documents d + WHERE dc.document_id = d.id AND dc.practice_area IS NULL; +""" + # ── Phase 2: Decision + Knowledge + RAG layers ──────────────────── SCHEMA_V2_SQL = """ @@ -388,7 +458,8 @@ async def init_schema() -> None: await conn.execute(MIGRATIONS_SQL) await conn.execute(SCHEMA_V2_SQL) await conn.execute(SCHEMA_V3_SQL) - logger.info("Database schema initialized (v1 + v2 + v3)") + await conn.execute(SCHEMA_V4_SQL) + logger.info("Database schema initialized (v1 + v2 + v3 + v4)") # ── Case CRUD ─────────────────────────────────────────────────────── @@ -405,6 +476,8 @@ async def create_case( hearing_date: date | None = None, notes: str = "", expected_outcome: str = "", + practice_area: str = "appeals_committee", + appeal_subtype: str | None = None, ) -> dict: pool = await get_pool() case_id = uuid4() @@ -412,17 +485,43 @@ async def create_case( await conn.execute( """INSERT INTO cases (id, case_number, title, appellants, respondents, subject, property_address, permit_number, committee_type, - hearing_date, notes, expected_outcome) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)""", + hearing_date, notes, expected_outcome, + practice_area, appeal_subtype) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)""", case_id, case_number, title, json.dumps(appellants or []), json.dumps(respondents or []), subject, property_address, permit_number, committee_type, hearing_date, notes, 
expected_outcome, + practice_area, appeal_subtype, ) return await get_case(case_id) +async def get_case_practice_area(case_id: UUID) -> tuple[str | None, str | None]: + """Return (practice_area, appeal_subtype) for a case, or (None, None) if missing.""" + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1", case_id + ) + if row is None: + return None, None + return row["practice_area"], row["appeal_subtype"] + + +async def get_case_practice_area_by_number(case_number: str) -> tuple[str | None, str | None]: + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT practice_area, appeal_subtype FROM cases WHERE case_number = $1", + case_number, + ) + if row is None: + return None, None + return row["practice_area"], row["appeal_subtype"] + + async def get_case(case_id: UUID) -> dict | None: pool = await get_pool() async with pool.acquire() as conn: @@ -488,19 +587,34 @@ def _row_to_case(row: asyncpg.Record) -> dict: # ── Document CRUD ─────────────────────────────────────────────────── async def create_document( - case_id: UUID, + case_id: UUID | None, doc_type: str, title: str, file_path: str, page_count: int | None = None, + practice_area: str | None = None, + appeal_subtype: str | None = None, ) -> dict: pool = await get_pool() doc_id = uuid4() async with pool.acquire() as conn: + # If practice_area not explicitly given, inherit from the parent case + # (for case-bound documents). Training corpus passes case_id=None and + # provides the practice_area directly. 
+ if practice_area is None and case_id is not None: + case_row = await conn.fetchrow( + "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1", + case_id, + ) + if case_row: + practice_area = case_row["practice_area"] + appeal_subtype = case_row["appeal_subtype"] await conn.execute( - """INSERT INTO documents (id, case_id, doc_type, title, file_path, page_count) - VALUES ($1, $2, $3, $4, $5, $6)""", + """INSERT INTO documents (id, case_id, doc_type, title, file_path, + page_count, practice_area, appeal_subtype) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)""", doc_id, case_id, doc_type, title, file_path, page_count, + practice_area, appeal_subtype, ) row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id) return _row_to_doc(row) @@ -622,12 +736,20 @@ async def create_decision( ) version = (existing["version"] + 1) if existing else 1 + case_row = await conn.fetchrow( + "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1", case_id + ) + practice_area = case_row["practice_area"] if case_row else None + appeal_subtype = case_row["appeal_subtype"] if case_row else None + await conn.execute( """INSERT INTO decisions (id, case_id, version, outcome, outcome_summary, - outcome_reasoning, direction_doc) - VALUES ($1, $2, $3, $4, $5, $6, $7)""", + outcome_reasoning, direction_doc, + practice_area, appeal_subtype) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""", decision_id, case_id, version, outcome, outcome_summary, outcome_reasoning, json.dumps(direction_doc) if direction_doc else None, + practice_area, appeal_subtype, ) return await get_decision(decision_id) @@ -701,12 +823,37 @@ async def store_chunks( document_id: UUID, case_id: UUID | None, chunks: list[dict], + practice_area: str | None = None, + appeal_subtype: str | None = None, ) -> int: """Store document chunks with embeddings. 
Each chunk dict has: - content, section_type, embedding (list[float]), page_number, chunk_index + content, section_type, embedding (list[float]), page_number, chunk_index. + + practice_area defaults to the parent case's value, or — when case_id is + None (training corpus) — falls back to the parent document's value so + vector search can still filter cleanly. """ pool = await get_pool() async with pool.acquire() as conn: + # Resolve practice_area in priority order: explicit > case > document. + if practice_area is None: + if case_id is not None: + case_row = await conn.fetchrow( + "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1", + case_id, + ) + if case_row: + practice_area = case_row["practice_area"] + appeal_subtype = case_row["appeal_subtype"] + if practice_area is None: + doc_row = await conn.fetchrow( + "SELECT practice_area, appeal_subtype FROM documents WHERE id = $1", + document_id, + ) + if doc_row: + practice_area = doc_row["practice_area"] + appeal_subtype = doc_row["appeal_subtype"] + # Delete existing chunks for this document await conn.execute( "DELETE FROM document_chunks WHERE document_id = $1", document_id @@ -714,14 +861,16 @@ async def store_chunks( for chunk in chunks: await conn.execute( """INSERT INTO document_chunks - (document_id, case_id, chunk_index, content, section_type, embedding, page_number) - VALUES ($1, $2, $3, $4, $5, $6, $7)""", + (document_id, case_id, chunk_index, content, section_type, + embedding, page_number, practice_area, appeal_subtype) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""", document_id, case_id, chunk["chunk_index"], chunk["content"], chunk.get("section_type", "other"), chunk["embedding"], chunk.get("page_number"), + practice_area, appeal_subtype, ) return len(chunks) @@ -731,8 +880,15 @@ async def search_similar( limit: int = 10, case_id: UUID | None = None, section_type: str | None = None, + practice_area: str | None = None, + appeal_subtype: str | None = None, ) -> list[dict]: - """Cosine 
similarity search on document chunks.""" + """Cosine similarity search on document chunks. + + Filter by practice_area to keep precedents from the same legal domain + (e.g. don't surface betterment-levy chunks when working on building + permits). Uses the denormalized column on document_chunks — no JOIN. + """ pool = await get_pool() conditions = [] params: list = [query_embedding, limit] @@ -746,6 +902,14 @@ async def search_similar( conditions.append(f"dc.section_type = ${param_idx}") params.append(section_type) param_idx += 1 + if practice_area: + conditions.append(f"dc.practice_area = ${param_idx}") + params.append(practice_area) + param_idx += 1 + if appeal_subtype: + conditions.append(f"dc.appeal_subtype = ${param_idx}") + params.append(appeal_subtype) + param_idx += 1 where = f"WHERE {' AND '.join(conditions)}" if conditions else "" @@ -778,6 +942,8 @@ async def add_to_style_corpus( summary: str = "", outcome: str = "", key_principles: list[str] | None = None, + practice_area: str = "appeals_committee", + appeal_subtype: str | None = None, ) -> UUID: pool = await get_pool() corpus_id = uuid4() @@ -785,11 +951,13 @@ async def add_to_style_corpus( await conn.execute( """INSERT INTO style_corpus (id, document_id, decision_number, decision_date, - subject_categories, full_text, summary, outcome, key_principles) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""", + subject_categories, full_text, summary, outcome, key_principles, + practice_area, appeal_subtype) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""", corpus_id, document_id, decision_number, decision_date, json.dumps(subject_categories), full_text, summary, outcome, json.dumps(key_principles or []), + practice_area, appeal_subtype, ) return corpus_id @@ -893,8 +1061,15 @@ async def search_similar_paragraphs( query_embedding: list[float], limit: int = 10, block_type: str | None = None, + practice_area: str | None = None, + appeal_subtype: str | None = None, ) -> list[dict]: - """Search decision 
paragraphs by semantic similarity.""" + """Search decision paragraphs by semantic similarity. + + Filtering by practice_area uses the denormalized column on `decisions` + so we don't pull, e.g., betterment-levy paragraphs when writing a + building-permit decision. + """ pool = await get_pool() conditions = [] params: list = [query_embedding, limit] @@ -904,6 +1079,14 @@ async def search_similar_paragraphs( conditions.append(f"db.block_id = ${param_idx}") params.append(block_type) param_idx += 1 + if practice_area: + conditions.append(f"d.practice_area = ${param_idx}") + params.append(practice_area) + param_idx += 1 + if appeal_subtype: + conditions.append(f"d.appeal_subtype = ${param_idx}") + params.append(appeal_subtype) + param_idx += 1 where = f"WHERE {' AND '.join(conditions)}" if conditions else "" diff --git a/mcp-server/src/legal_mcp/services/practice_area.py b/mcp-server/src/legal_mcp/services/practice_area.py new file mode 100644 index 0000000..dd0e74a --- /dev/null +++ b/mcp-server/src/legal_mcp/services/practice_area.py @@ -0,0 +1,96 @@ +"""Practice area + appeal subtype: derivation, validation, constants. + +Two orthogonal axes used to separate legal domains across the system: + + practice_area — top-level domain (multi-tenant axis). Examples: + appeals_committee, national_insurance, labor_law. + appeal_subtype — refines within a domain. For appeals_committee: + building_permit (1xxx), betterment_levy (8xxx), + compensation_197 (9xxx), unknown. + +Both columns are denormalized into documents/chunks/decisions/style_corpus +so vector searches can filter cheaply. 
+""" + +from __future__ import annotations + +import re + +# ── Enums ────────────────────────────────────────────────────────── + +PRACTICE_AREAS: set[str] = { + "appeals_committee", + "national_insurance", + "labor_law", +} + +APPEALS_COMMITTEE_SUBTYPES: set[str] = { + "building_permit", + "betterment_levy", + "compensation_197", + "unknown", +} + +DEFAULT_PRACTICE_AREA = "appeals_committee" + +# Subtypes per practice_area (extend when adding domains) +SUBTYPES_BY_AREA: dict[str, set[str]] = { + "appeals_committee": APPEALS_COMMITTEE_SUBTYPES, + "national_insurance": {"unknown"}, + "labor_law": {"unknown"}, +} + + +# ── Derivation ───────────────────────────────────────────────────── + +_FIRST_DIGIT = re.compile(r"^\s*(\d)") + +_APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE = { + "1": "building_permit", + "8": "betterment_levy", + "9": "compensation_197", +} + + +def derive_subtype(case_number: str, practice_area: str = DEFAULT_PRACTICE_AREA) -> str: + """Infer the appeal_subtype from case_number. + + For appeals_committee, the convention is: + 1xxx → building_permit, 8xxx → betterment_levy, 9xxx → compensation_197. + + For other practice areas there is no public numbering convention yet, + so we return 'unknown' until a real rule is defined. + """ + if practice_area != "appeals_committee": + return "unknown" + m = _FIRST_DIGIT.match(case_number or "") + if not m: + return "unknown" + return _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE.get(m.group(1), "unknown") + + +# ── Validation ───────────────────────────────────────────────────── + + +def validate(practice_area: str, appeal_subtype: str | None) -> None: + """Raise ValueError on unknown values. appeal_subtype=None is allowed.""" + if practice_area not in PRACTICE_AREAS: + raise ValueError( + f"unknown practice_area: {practice_area!r}. 
" + f"expected one of {sorted(PRACTICE_AREAS)}" + ) + if appeal_subtype is None: + return + allowed = SUBTYPES_BY_AREA.get(practice_area, {"unknown"}) + if appeal_subtype not in allowed: + raise ValueError( + f"unknown appeal_subtype {appeal_subtype!r} for practice_area " + f"{practice_area!r}. expected one of {sorted(allowed)}" + ) + + +def is_override(case_number: str, practice_area: str, appeal_subtype: str) -> bool: + """True iff the user-supplied subtype disagrees with what derive_subtype + would have produced (and the derived value is not 'unknown').""" + derived = derive_subtype(case_number, practice_area) + return derived != "unknown" and derived != appeal_subtype diff --git a/mcp-server/src/legal_mcp/tools/cases.py b/mcp-server/src/legal_mcp/tools/cases.py index fa6714a..794c4c2 100644 --- a/mcp-server/src/legal_mcp/tools/cases.py +++ b/mcp-server/src/legal_mcp/tools/cases.py @@ -8,7 +8,7 @@ from pathlib import Path from uuid import UUID from legal_mcp import config -from legal_mcp.services import db +from legal_mcp.services import audit, db, practice_area as pa async def case_create( @@ -23,6 +23,8 @@ async def case_create( hearing_date: str = "", notes: str = "", expected_outcome: str = "", + practice_area: str = "appeals_committee", + appeal_subtype: str = "", ) -> str: """יצירת תיק ערר חדש. @@ -38,6 +40,9 @@ async def case_create( hearing_date: תאריך דיון (YYYY-MM-DD) notes: הערות expected_outcome: תוצאה צפויה (rejection/partial_acceptance/full_acceptance/betterment_levy) + practice_area: תחום משפטי (appeals_committee / national_insurance / labor_law) + appeal_subtype: סוג ערר (building_permit / betterment_levy / compensation_197). 
+ ריק = יוסק אוטומטית ממספר התיק """ from datetime import date as date_type @@ -45,6 +50,12 @@ async def case_create( if hearing_date: h_date = date_type.fromisoformat(hearing_date) + # Resolve appeal_subtype: explicit override > auto-derive > 'unknown' + derived_subtype = pa.derive_subtype(case_number, practice_area) + if not appeal_subtype: + appeal_subtype = derived_subtype + pa.validate(practice_area, appeal_subtype) + case = await db.create_case( case_number=case_number, title=title, @@ -57,8 +68,24 @@ async def case_create( hearing_date=h_date, notes=notes, expected_outcome=expected_outcome, + practice_area=practice_area, + appeal_subtype=appeal_subtype, ) + # If the user overrode the case-number convention (e.g. case 8500 marked + # as building_permit), record it so we can audit later. + if pa.is_override(case_number, practice_area, appeal_subtype): + await audit.log_action( + action="case_subtype_override", + case_id=UUID(case["id"]), + details={ + "case_number": case_number, + "derived_subtype": derived_subtype, + "chosen_subtype": appeal_subtype, + "practice_area": practice_area, + }, + ) + # Initialize git repo for the case case_dir = config.find_case_dir(case_number) case_dir.mkdir(parents=True, exist_ok=True) diff --git a/mcp-server/src/legal_mcp/tools/documents.py b/mcp-server/src/legal_mcp/tools/documents.py index e468747..258d792 100644 --- a/mcp-server/src/legal_mcp/tools/documents.py +++ b/mcp-server/src/legal_mcp/tools/documents.py @@ -105,6 +105,8 @@ async def document_upload_training( decision_date: str = "", subject_categories: list[str] | None = None, title: str = "", + practice_area: str = "appeals_committee", + appeal_subtype: str = "", ) -> str: """העלאת החלטה קודמת של דפנה לקורפוס הסגנון (training). 
@@ -114,10 +116,13 @@ async def document_upload_training( decision_date: תאריך ההחלטה (YYYY-MM-DD) subject_categories: קטגוריות - אפשר לבחור כמה (בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197) title: שם המסמך + practice_area: תחום משפטי (appeals_committee / national_insurance / labor_law) + appeal_subtype: סוג ערר (building_permit / betterment_levy / compensation_197). + ריק = יוסק אוטומטית ממספר ההחלטה """ from datetime import date as date_type - from legal_mcp.services import extractor, embeddings, chunker + from legal_mcp.services import chunker, embeddings, extractor, practice_area as pa source = Path(file_path) if not source.exists(): @@ -126,6 +131,11 @@ async def document_upload_training( if not title: title = source.stem + # Resolve subtype: explicit > derived from decision_number > 'unknown' + if not appeal_subtype: + appeal_subtype = pa.derive_subtype(decision_number, practice_area) + pa.validate(practice_area, appeal_subtype) + # Copy to training directory (skip if already there) config.TRAINING_DIR.mkdir(parents=True, exist_ok=True) dest = config.TRAINING_DIR / source.name @@ -140,25 +150,29 @@ async def document_upload_training( if decision_date: d_date = date_type.fromisoformat(decision_date) - # Add to style corpus + # Add to style corpus (tagged by domain so block-writer can filter) corpus_id = await db.add_to_style_corpus( document_id=None, decision_number=decision_number, decision_date=d_date, subject_categories=subject_categories or [], full_text=text, + practice_area=practice_area, + appeal_subtype=appeal_subtype, ) # Chunk and embed for RAG search over training corpus chunks = chunker.chunk_document(text) if chunks: - # Create a document record (no case association) + # Create a document record (no case association — tag explicitly) doc = await db.create_document( case_id=None, doc_type="decision", title=f"[קורפוס] {title}", file_path=str(dest), page_count=page_count, + practice_area=practice_area, + 
appeal_subtype=appeal_subtype, ) doc_id = UUID(doc["id"]) await db.update_document(doc_id, extracted_text=text, extraction_status="completed") @@ -176,7 +190,10 @@ async def document_upload_training( } for c, emb in zip(chunks, embs) ] - await db.store_chunks(doc_id, None, chunk_dicts) + await db.store_chunks( + doc_id, None, chunk_dicts, + practice_area=practice_area, appeal_subtype=appeal_subtype, + ) return json.dumps({ "corpus_id": str(corpus_id), diff --git a/mcp-server/src/legal_mcp/tools/search.py b/mcp-server/src/legal_mcp/tools/search.py index 078874e..3c2d3f0 100644 --- a/mcp-server/src/legal_mcp/tools/search.py +++ b/mcp-server/src/legal_mcp/tools/search.py @@ -3,28 +3,52 @@ from __future__ import annotations import json +import logging from uuid import UUID from legal_mcp.services import db, embeddings +logger = logging.getLogger(__name__) + async def search_decisions( query: str, limit: int = 10, section_type: str = "", + practice_area: str = "", + appeal_subtype: str = "", + case_number: str = "", ) -> str: - """חיפוש סמנטי בהחלטות קודמות ובמסמכים. + """חיפוש סמנטי בהחלטות קודמות ובמסמכים — מסונן לפי תחום משפטי. Args: - query: שאילתת חיפוש בעברית (לדוגמה: "שימוש חורג למסחר באזור מגורים") + query: שאילתת חיפוש בעברית limit: מספר תוצאות מקסימלי - section_type: סינון לפי סוג סעיף (facts, legal_analysis, conclusion, ruling, וכו'). ריק = הכל + section_type: סינון לפי סוג סעיף (facts, legal_analysis, ...) + practice_area: תחום משפטי לסינון (appeals_committee/national_insurance/...) 
+ appeal_subtype: סוג ערר לסינון (building_permit/betterment_levy/compensation_197) + case_number: אם סופק, ה-practice_area/subtype יוסקו אוטומטית מהתיק """ + # Auto-resolve practice_area from case_number if available + if case_number and not practice_area: + case = await db.get_case_by_number(case_number) + if case: + practice_area = case.get("practice_area") or "" + appeal_subtype = appeal_subtype or (case.get("appeal_subtype") or "") + + if not practice_area: + logger.warning( + "search_decisions called without practice_area filter — " + "results may mix legal domains" + ) + query_emb = await embeddings.embed_query(query) results = await db.search_similar( query_embedding=query_emb, limit=limit, section_type=section_type or None, + practice_area=practice_area or None, + appeal_subtype=appeal_subtype or None, ) if not results: @@ -61,6 +85,7 @@ async def search_case_documents( return f"תיק {case_number} לא נמצא." query_emb = await embeddings.embed_query(query) + # Restricted to case_id — practice_area filter would be redundant. results = await db.search_similar( query_embedding=query_emb, limit=limit, @@ -86,17 +111,37 @@ async def search_case_documents( async def find_similar_cases( description: str, limit: int = 5, + practice_area: str = "", + appeal_subtype: str = "", + case_number: str = "", ) -> str: - """מציאת תיקים דומים על בסיס תיאור. + """מציאת תיקים דומים על בסיס תיאור — מסונן לפי תחום משפטי. 
Args: - description: תיאור התיק או הנושא (לדוגמה: "ערר על סירוב להיתר בנייה לתוספת קומה") + description: תיאור התיק או הנושא limit: מספר תוצאות מקסימלי + practice_area: תחום משפטי לסינון + appeal_subtype: סוג ערר לסינון + case_number: אם סופק, ה-practice_area/subtype יוסקו אוטומטית מהתיק """ + if case_number and not practice_area: + case = await db.get_case_by_number(case_number) + if case: + practice_area = case.get("practice_area") or "" + appeal_subtype = appeal_subtype or (case.get("appeal_subtype") or "") + + if not practice_area: + logger.warning( + "find_similar_cases called without practice_area filter — " + "results may mix legal domains" + ) + query_emb = await embeddings.embed_query(description) results = await db.search_similar( query_embedding=query_emb, limit=limit * 3, # Get more to deduplicate by case + practice_area=practice_area or None, + appeal_subtype=appeal_subtype or None, ) if not results: diff --git a/web/app.py b/web/app.py index dc47bec..8dcbbe1 100644 --- a/web/app.py +++ b/web/app.py @@ -1069,6 +1069,8 @@ class CaseCreateRequest(BaseModel): hearing_date: str = "" notes: str = "" expected_outcome: str = "" + practice_area: str = "appeals_committee" + appeal_subtype: str = "" class CaseUpdateRequest(BaseModel): @@ -1097,6 +1099,8 @@ async def api_case_create(req: CaseCreateRequest): hearing_date=req.hearing_date, notes=req.notes, expected_outcome=req.expected_outcome, + practice_area=req.practice_area, + appeal_subtype=req.appeal_subtype, ) return json.loads(result) diff --git a/web/static/index.html b/web/static/index.html index dfc4e17..7624c24 100644 --- a/web/static/index.html +++ b/web/static/index.html @@ -1964,14 +1964,26 @@ kbd {
-
- - + + +
+
+ + +
+ +
+
@@ -2730,11 +2742,14 @@ function getListValues(listId) { function buildSummary() { const data = getWizardData(); const OUTCOME_LABELS = { rejection: 'דחייה', partial_acceptance: 'קבלה חלקית', full_acceptance: 'קבלה מלאה', betterment_levy: 'היטל השבחה' }; + const PRACTICE_AREA_LABELS = { appeals_committee: 'ועדת ערר', national_insurance: 'ביטוח לאומי', labor_law: 'דיני עבודה' }; + const SUBTYPE_LABELS = { building_permit: 'רישוי ובנייה', betterment_levy: 'היטל השבחה', compensation_197: "פיצויים (ס' 197)", unknown: 'לא ידוע' }; document.getElementById('wizSummary').innerHTML = ` - + + @@ -2743,11 +2758,44 @@ function buildSummary() { `; } +// 1xxx → building_permit, 8xxx → betterment_levy, 9xxx → compensation_197 +function deriveSubtypeFromCaseNumber(caseNumber) { + const m = (caseNumber || '').trim().match(/^(\d)/); + if (!m) return 'unknown'; + return ({1: 'building_permit', 8: 'betterment_levy', 9: 'compensation_197'})[m[1]] || 'unknown'; +} + +// Auto-fill subtype + committee_type when the user types/edits the case number. +// User can override the dropdown manually afterwards. 
+function wireSubtypeAutofill() { + const cnInput = document.getElementById('wiz-case-number'); + const subtypeSel = document.getElementById('wiz-appeal-subtype'); + const committeeHidden = document.getElementById('wiz-committee-type'); + if (!cnInput || !subtypeSel) return; + const SUBTYPE_TO_COMMITTEE = { + building_permit: 'רישוי', + betterment_levy: 'היטל השבחה', + compensation_197: 'פיצויים', + unknown: 'רישוי', + }; + let userOverrode = false; + subtypeSel.addEventListener('change', () => { userOverrode = true; }); + cnInput.addEventListener('input', () => { + if (userOverrode) return; + const derived = deriveSubtypeFromCaseNumber(cnInput.value); + subtypeSel.value = derived; + if (committeeHidden) committeeHidden.value = SUBTYPE_TO_COMMITTEE[derived]; + }); +} +document.addEventListener('DOMContentLoaded', wireSubtypeAutofill); + function getWizardData() { return { case_number: document.getElementById('wiz-case-number').value.trim(), title: document.getElementById('wiz-title').value.trim(), committee_type: document.getElementById('wiz-committee-type').value, + practice_area: document.getElementById('wiz-practice-area').value, + appeal_subtype: document.getElementById('wiz-appeal-subtype').value, property_address: document.getElementById('wiz-address').value.trim(), permit_number: document.getElementById('wiz-permit').value.trim(), appellants: getListValues('appellantsList'), @@ -2847,9 +2895,18 @@ async function loadCaseView(caseNumber) { new: 'חדש', in_progress: 'בתהליך', documents_ready: 'מסמכים מוכנים', drafted: 'טיוטה', final: 'סופי', }; + const PRACTICE_AREA_LABELS = { appeals_committee: 'ועדת ערר', national_insurance: 'ביטוח לאומי', labor_law: 'דיני עבודה' }; + const SUBTYPE_LABELS = { building_permit: 'רישוי ובנייה', betterment_levy: 'היטל השבחה', compensation_197: "פיצויים (ס' 197)", unknown: 'לא ידוע' }; const meta = []; meta.push(`${STATUS_LABELS[data.status] || data.status}`); - if (data.committee_type) meta.push(data.committee_type); + if 
(data.practice_area || data.appeal_subtype) { + const parts = []; + if (data.practice_area) parts.push(PRACTICE_AREA_LABELS[data.practice_area] || data.practice_area); + if (data.appeal_subtype) parts.push(SUBTYPE_LABELS[data.appeal_subtype] || data.appeal_subtype); + meta.push(`${parts.join(' · ')}`); + } else if (data.committee_type) { + meta.push(data.committee_type); + } if (data.property_address) meta.push(data.property_address); if (data.appellants?.length) meta.push('עוררים: ' + data.appellants.join(', ')); document.getElementById('caseViewMeta').innerHTML = meta.map(m => `${m}`).join('');
מספר תיק${esc(data.case_number)}
כותרת${esc(data.title)}
סוג${esc(data.committee_type)}
תחום${esc(PRACTICE_AREA_LABELS[data.practice_area] || data.practice_area)}
סוג ערר${esc(SUBTYPE_LABELS[data.appeal_subtype] || data.appeal_subtype)}
כתובת${esc(data.property_address || '—')}
עוררים${data.appellants.length ? data.appellants.map(esc).join(', ') : '—'}
משיבים${data.respondents.length ? data.respondents.map(esc).join(', ') : '—'}