Files
legal-ai/mcp-server/src/legal_mcp/services/db.py
Chaim 26d09d648f Practice area separation: multi-tenant axis across DB, RAG, and UI
Adds two orthogonal columns — practice_area (top-level legal domain:
appeals_committee / national_insurance / labor_law) and appeal_subtype
(building_permit / betterment_levy / compensation_197) — denormalized
into cases, documents, document_chunks, decisions, and style_corpus so
vector searches can filter without JOINs.

Why: the system handles two unrelated sub-domains under the same
appeals committee (1xxx building permits and 8xxx/9xxx betterment/197),
with different rules and writing style. Without a separation axis,
search_similar() and the block-writer's precedent lookup were free to
surface betterment-levy paragraphs while drafting a building-permit
decision — a real risk of cross-domain contamination. The same axis
also lets future domains (national insurance, labor law) coexist
without separate schemas.

Schema (V4 migration in db.py):
- ALTER ... ADD COLUMN IF NOT EXISTS on all five tables + composite
  indexes (practice_area first).
- Idempotent backfill: case_number ~ '^1' → building_permit, '^8' →
  betterment_levy, '^9' → compensation_197; propagated to documents,
  chunks, and decisions via case_id; training-corpus rows (case_id NULL)
  default to appeals_committee.

Code:
- New services/practice_area.py with derive_subtype, validate, and
  is_override + enum constants.
- db.create_case / create_document / store_chunks / create_decision
  inherit practice_area from the parent case (or take an explicit
  override for the case_id=None training corpus).
- db.search_similar and search_similar_paragraphs accept practice_area
  + appeal_subtype filters using the denormalized columns.
- tools/search.py auto-resolves the filter from case_number when given.
- block_writer._build_precedents_context now passes the active case's
  practice_area to search_similar_paragraphs — closes the contamination
  hole for the discussion-block precedent fetch.
- tools/cases.case_create auto-derives subtype from case_number; an
  explicit override that disagrees writes a case_subtype_override entry
  to audit_log so we can spot bad classifications later.
- tools/documents.document_upload_training tags new training material
  with practice_area + subtype end-to-end (corpus, document, chunks).

UI (web/static/index.html + web/app.py):
- New-case wizard gets a practice_area dropdown (others disabled until
  national_insurance / labor_law arrive) and an appeal_subtype dropdown
  with JS auto-fill from the case-number prefix; manual edits stick.
- Case header shows a blue badge with practice_area · subtype.
- CaseCreateRequest plumbs both fields through to cases_tools.case_create.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 16:36:48 +00:00

1172 lines
46 KiB
Python

"""Database service - asyncpg connection pool and queries."""
from __future__ import annotations
import json
import logging
from datetime import date
from uuid import UUID, uuid4
import asyncpg
from pgvector.asyncpg import register_vector
from legal_mcp import config
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Process-wide asyncpg pool. Created lazily by get_pool() on first use and
# reset to None by close_pool().
_pool: asyncpg.Pool | None = None
async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg pool, creating it on first use.

    On the first call a one-off bootstrap connection makes sure the
    ``vector`` and ``uuid-ossp`` extensions exist, then the pool is created
    with ``_init_connection`` as its per-connection ``init`` hook. Later
    calls return the cached pool.

    Returns:
        The module-wide ``asyncpg.Pool``.
    """
    global _pool
    if _pool is None:
        # The extensions must exist before the pool is built: every pooled
        # connection runs _init_connection, which registers the vector codec.
        conn = await asyncpg.connect(config.POSTGRES_URL)
        try:
            await conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
            await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')
        finally:
            # Close the bootstrap connection even when a statement fails;
            # otherwise a failed first call leaks a server connection.
            await conn.close()
        _pool = await asyncpg.create_pool(
            config.POSTGRES_URL,
            min_size=2,
            max_size=10,
            init=_init_connection,
        )
    return _pool
async def _init_connection(conn: asyncpg.Connection) -> None:
    """Pool ``init`` hook: register the pgvector codec on a fresh connection."""
    await register_vector(conn)
async def close_pool() -> None:
    """Close the shared pool, if one exists, and forget it.

    After this returns the module-level pool is ``None`` again, so the next
    ``get_pool()`` call builds a new one. Does nothing when no pool is set.
    """
    global _pool
    if not _pool:
        return
    await _pool.close()
    _pool = None
# ── Schema ──────────────────────────────────────────────────────────
# v1 base schema: cases, documents, document_chunks, style_corpus and
# style_patterns, plus their indexes. Every statement uses IF NOT EXISTS, so
# the script can be re-executed on each startup.
# Relies on uuid_generate_v4() and the vector type, i.e. on the "uuid-ossp"
# and "vector" extensions that get_pool() creates before building the pool.
# Executed first by init_schema().
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS cases (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_number TEXT UNIQUE NOT NULL,
title TEXT NOT NULL,
appellants JSONB DEFAULT '[]',
respondents JSONB DEFAULT '[]',
subject TEXT DEFAULT '',
property_address TEXT DEFAULT '',
permit_number TEXT DEFAULT '',
committee_type TEXT DEFAULT 'ועדה מקומית',
status TEXT DEFAULT 'new',
hearing_date DATE,
decision_date DATE,
tags JSONB DEFAULT '[]',
notes TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS documents (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
doc_type TEXT NOT NULL,
title TEXT NOT NULL,
file_path TEXT NOT NULL,
extracted_text TEXT DEFAULT '',
extraction_status TEXT DEFAULT 'pending',
page_count INTEGER,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS document_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_type TEXT DEFAULT 'other',
embedding vector(1024),
page_number INTEGER,
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_corpus (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
decision_number TEXT,
decision_date DATE,
subject_categories JSONB DEFAULT '[]',
full_text TEXT NOT NULL,
summary TEXT DEFAULT '',
outcome TEXT DEFAULT '',
key_principles JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_patterns (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
pattern_type TEXT NOT NULL,
pattern_text TEXT NOT NULL,
frequency INTEGER DEFAULT 1,
context TEXT DEFAULT '',
examples JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_chunks_embedding
ON document_chunks USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id);
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id);
CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status);
CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number);
"""
# Additive migrations on top of the v1 schema: the expected_outcome column on
# cases and the audit_log table with its indexes. All statements are guarded
# with IF NOT EXISTS. Executed second by init_schema(), right after SCHEMA_SQL.
MIGRATIONS_SQL = """
ALTER TABLE cases ADD COLUMN IF NOT EXISTS expected_outcome TEXT DEFAULT '';
CREATE TABLE IF NOT EXISTS audit_log (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
action TEXT NOT NULL,
case_id UUID REFERENCES cases(id) ON DELETE SET NULL,
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
details JSONB DEFAULT '{}',
actor TEXT DEFAULT 'system',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_audit_case ON audit_log(case_id);
CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_log(action);
CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_log(created_at DESC);
"""
# ── Phase 3: Workflow expansion ────────────────────────────────────
# Adds columns to decisions, cases and decision_blocks, and creates the
# qa_results, decision_definitions and appeal_type_rules tables.
# The ALTERs on decisions / decision_blocks and the foreign keys to
# decisions(id) need the tables created by SCHEMA_V2_SQL, so this script must
# run after it; init_schema() executes V2 before V3 even though V2 is defined
# further down in this file.
SCHEMA_V3_SQL = """
-- הרחבת decisions עם שדות חדשים
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS direction_doc JSONB DEFAULT NULL;
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS outcome_reasoning TEXT DEFAULT '';
-- הרחבת cases עם appeal_type (אם לא קיים)
ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_type TEXT DEFAULT '';
-- טבלת qa_results
CREATE TABLE IF NOT EXISTS qa_results (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
check_name TEXT NOT NULL,
passed BOOLEAN NOT NULL,
severity TEXT DEFAULT 'warning',
errors JSONB DEFAULT '[]',
details TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_qa_results_decision ON qa_results(decision_id);
CREATE INDEX IF NOT EXISTS idx_qa_results_case ON qa_results(case_id);
-- טבלת decision_definitions (אם לא קיימת)
CREATE TABLE IF NOT EXISTS decision_definitions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
term TEXT NOT NULL,
definition TEXT NOT NULL,
block_id TEXT DEFAULT 'block-he',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_definitions_decision ON decision_definitions(decision_id);
-- טבלת appeal_type_rules (אם לא קיימת)
CREATE TABLE IF NOT EXISTS appeal_type_rules (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
appeal_type TEXT NOT NULL,
rule_category TEXT NOT NULL,
rule_key TEXT NOT NULL,
rule_value JSONB NOT NULL,
description TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(appeal_type, rule_category, rule_key)
);
-- image_placeholders על decision_blocks
ALTER TABLE decision_blocks ADD COLUMN IF NOT EXISTS image_placeholders JSONB DEFAULT '[]';
"""
# ── Phase 4: Practice area separation (multi-tenant axis) ──────────
# Adds nullable practice_area / appeal_subtype columns to cases, documents,
# document_chunks, decisions and style_corpus, creates indexes on them, and
# backfills existing rows. Every backfill UPDATE is restricted to rows whose
# target column IS NULL, so re-running the script leaves already-tagged rows
# untouched. Executed last by init_schema() (it alters decisions, which
# SCHEMA_V2_SQL creates).
SCHEMA_V4_SQL = """
-- ═══════════════════════════════════════════════════════════════════
-- practice_area = top-level legal domain (multi-tenant axis):
-- appeals_committee | national_insurance | labor_law | ...
-- appeal_subtype = refines within practice_area:
-- building_permit | betterment_levy | compensation_197 | unknown
-- Both columns are denormalized to documents/chunks/decisions/style_corpus
-- so vector searches can filter without expensive JOINs.
-- ═══════════════════════════════════════════════════════════════════
ALTER TABLE cases ADD COLUMN IF NOT EXISTS practice_area TEXT;
ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS practice_area TEXT;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS appeal_subtype TEXT;
ALTER TABLE document_chunks ADD COLUMN IF NOT EXISTS practice_area TEXT;
ALTER TABLE document_chunks ADD COLUMN IF NOT EXISTS appeal_subtype TEXT;
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS practice_area TEXT;
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS appeal_subtype TEXT;
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT;
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT;
CREATE INDEX IF NOT EXISTS idx_cases_practice
ON cases(practice_area, appeal_subtype);
CREATE INDEX IF NOT EXISTS idx_chunks_practice
ON document_chunks(practice_area);
CREATE INDEX IF NOT EXISTS idx_corpus_practice
ON style_corpus(practice_area, appeal_subtype);
CREATE INDEX IF NOT EXISTS idx_decisions_practice
ON decisions(practice_area);
-- Backfill (idempotent — only fills NULLs)
UPDATE cases SET practice_area = 'appeals_committee' WHERE practice_area IS NULL;
UPDATE cases SET appeal_subtype = CASE
WHEN case_number ~ '^1[0-9]{3}' THEN 'building_permit'
WHEN case_number ~ '^8[0-9]{3}' THEN 'betterment_levy'
WHEN case_number ~ '^9[0-9]{3}' THEN 'compensation_197'
ELSE 'unknown'
END WHERE appeal_subtype IS NULL;
UPDATE documents d
SET practice_area = c.practice_area, appeal_subtype = c.appeal_subtype
FROM cases c
WHERE d.case_id = c.id AND d.practice_area IS NULL;
UPDATE document_chunks dc
SET practice_area = c.practice_area, appeal_subtype = c.appeal_subtype
FROM cases c
WHERE dc.case_id = c.id AND dc.practice_area IS NULL;
UPDATE decisions de
SET practice_area = c.practice_area, appeal_subtype = c.appeal_subtype
FROM cases c
WHERE de.case_id = c.id AND de.practice_area IS NULL;
-- All existing style_corpus entries are דפנה's appeals-committee decisions
UPDATE style_corpus SET practice_area = 'appeals_committee' WHERE practice_area IS NULL;
-- Training corpus documents/chunks have case_id = NULL. All historical
-- training material is from דפנה's appeals committee, so default them.
UPDATE documents SET practice_area = 'appeals_committee'
WHERE case_id IS NULL AND practice_area IS NULL;
UPDATE document_chunks dc
SET practice_area = d.practice_area, appeal_subtype = d.appeal_subtype
FROM documents d
WHERE dc.document_id = d.id AND dc.practice_area IS NULL;
"""
# ── Phase 2: Decision + Knowledge + RAG layers ────────────────────
# Creates the decision tables (decisions, decision_blocks,
# decision_paragraphs, claims), the legal-knowledge tables (case_law,
# case_law_citations, statutory_provisions, transition_phrases,
# lessons_learned), the embedding tables (paragraph_embeddings,
# case_law_embeddings) and their indexes. All statements use IF NOT EXISTS.
# NOTE: defined after V3/V4 in this file, but init_schema() executes it
# before them — V3 and V4 alter tables that are created here.
SCHEMA_V2_SQL = """
-- ═══════════════════════════════════════════════════════════════════
-- Layer 2: Decision
-- ═══════════════════════════════════════════════════════════════════
-- decisions: מטאדטה של החלטה (גרסה אחת = רשומה אחת)
CREATE TABLE IF NOT EXISTS decisions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
version INTEGER DEFAULT 1,
status TEXT DEFAULT 'draft', -- draft/review/final/published
outcome TEXT DEFAULT '', -- rejected/accepted/partial
outcome_summary TEXT DEFAULT '', -- תמצית תוצאה (שורה אחת)
total_paragraphs INTEGER DEFAULT 0,
total_words INTEGER DEFAULT 0,
decision_date DATE,
author TEXT DEFAULT 'דפנה תמיר',
panel_members JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(case_id, version)
);
-- decision_blocks: 12 בלוקים לפי block-schema.md
CREATE TABLE IF NOT EXISTS decision_blocks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
block_id TEXT NOT NULL, -- block-alef, block-bet, ... block-yod-bet
block_index INTEGER NOT NULL, -- 1-12
title TEXT DEFAULT '', -- כותרת הבלוק (ריק לבלוקים ללא כותרת)
content TEXT DEFAULT '', -- תוכן מלא (markdown)
word_count INTEGER DEFAULT 0,
weight_percent NUMERIC(5,2) DEFAULT 0, -- משקל בפועל (%)
generation_type TEXT DEFAULT '', -- template-fill/reproduction/paraphrase/...
model_used TEXT DEFAULT '', -- sonnet/opus/script
temperature NUMERIC(3,2) DEFAULT 0,
status TEXT DEFAULT 'empty', -- empty/draft/review/final
notes TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(decision_id, block_id)
);
-- decision_paragraphs: סעיפים בודדים עם מעקב ציטוטים
CREATE TABLE IF NOT EXISTS decision_paragraphs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
block_id UUID REFERENCES decision_blocks(id) ON DELETE CASCADE,
paragraph_number INTEGER NOT NULL, -- מספור רציף בתוך ההחלטה
content TEXT NOT NULL,
word_count INTEGER DEFAULT 0,
citations JSONB DEFAULT '[]', -- [{case_law_id, text, type}]
cross_references JSONB DEFAULT '[]', -- הפניות לסעיפים אחרים ["סעיף 5 לעיל"]
created_at TIMESTAMPTZ DEFAULT now()
);
-- claims: טענות צדדים (בלוק ז)
CREATE TABLE IF NOT EXISTS claims (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
party_role TEXT NOT NULL, -- appellant/respondent/permit_applicant/committee
party_name TEXT DEFAULT '',
claim_text TEXT NOT NULL,
claim_index INTEGER DEFAULT 0, -- סדר הופעה
source_document TEXT DEFAULT '', -- מאיזה מסמך חולצה הטענה
addressed_in_paragraph INTEGER, -- באיזה סעיף בדיון נענתה
created_at TIMESTAMPTZ DEFAULT now()
);
-- ═══════════════════════════════════════════════════════════════════
-- Layer 3: Legal Knowledge
-- ═══════════════════════════════════════════════════════════════════
-- case_law: פסיקה (תקדימים)
CREATE TABLE IF NOT EXISTS case_law (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_number TEXT UNIQUE NOT NULL, -- עע"מ 3975/22 או ערר 1011-03-25
case_name TEXT NOT NULL, -- שם קצר: "ב. קרן-נכסים"
court TEXT DEFAULT '', -- בג"ץ / עליון / מנהלי / ועדת ערר
date DATE,
subject_tags JSONB DEFAULT '[]', -- ["proprietary_claims", "parking"]
summary TEXT DEFAULT '', -- תמצית 2-3 משפטים
key_quote TEXT DEFAULT '', -- ציטוט מרכזי
full_text TEXT DEFAULT '', -- טקסט מלא אם זמין
source_url TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now()
);
-- case_law_citations: קשרים בין פסיקה להחלטות שלנו
CREATE TABLE IF NOT EXISTS case_law_citations (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE SET NULL,
citation_type TEXT DEFAULT 'support', -- support/distinguish/overrule/obiter
context_text TEXT DEFAULT '', -- ההקשר שבו צוטט
created_at TIMESTAMPTZ DEFAULT now()
);
-- statutory_provisions: חקיקה נפוצה
CREATE TABLE IF NOT EXISTS statutory_provisions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
statute_name TEXT NOT NULL, -- "חוק התכנון והבנייה"
section_number TEXT NOT NULL, -- "152(א)(2)"
section_title TEXT DEFAULT '', -- "זכות ערר"
full_text TEXT DEFAULT '', -- נוסח הסעיף
common_usage TEXT DEFAULT '', -- מתי משתמשים
subject_tags JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(statute_name, section_number)
);
-- transition_phrases: ביטויי מעבר של דפנה
CREATE TABLE IF NOT EXISTS transition_phrases (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
phrase TEXT UNIQUE NOT NULL, -- "ועל מנת לא לצאת בחסר"
usage_context TEXT DEFAULT '', -- מתי להשתמש
block_types JSONB DEFAULT '[]', -- באילו בלוקים: ["block-yod"]
frequency INTEGER DEFAULT 1, -- כמה פעמים ראינו
source_decision TEXT DEFAULT '', -- מאיזו החלטה
created_at TIMESTAMPTZ DEFAULT now()
);
-- lessons_learned: לקחים מהשוואת טיוטות לגרסאות סופיות
CREATE TABLE IF NOT EXISTS lessons_learned (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
lesson_title TEXT NOT NULL, -- "Discussion = continuous essay, no sub-headers"
lesson_text TEXT NOT NULL, -- תיאור מלא
category TEXT DEFAULT '', -- structure/style/content/process
applies_to JSONB DEFAULT '[]', -- ["block-yod", "all"]
source_case TEXT DEFAULT '', -- "הכט 1180-1181"
severity TEXT DEFAULT 'important', -- critical/important/nice-to-have
created_at TIMESTAMPTZ DEFAULT now()
);
-- ═══════════════════════════════════════════════════════════════════
-- Layer 4: Extended RAG
-- ═══════════════════════════════════════════════════════════════════
-- paragraph_embeddings: embeddings של סעיפים בהחלטות
CREATE TABLE IF NOT EXISTS paragraph_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE CASCADE,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
-- case_law_embeddings: embeddings של פסיקה
CREATE TABLE IF NOT EXISTS case_law_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
chunk_text TEXT NOT NULL,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
-- ═══════════════════════════════════════════════════════════════════
-- Indexes
-- ═══════════════════════════════════════════════════════════════════
CREATE INDEX IF NOT EXISTS idx_decisions_case ON decisions(case_id);
CREATE INDEX IF NOT EXISTS idx_decisions_status ON decisions(status);
CREATE INDEX IF NOT EXISTS idx_decision_blocks_decision ON decision_blocks(decision_id);
CREATE INDEX IF NOT EXISTS idx_decision_blocks_block_id ON decision_blocks(block_id);
CREATE INDEX IF NOT EXISTS idx_decision_paragraphs_block ON decision_paragraphs(block_id);
CREATE INDEX IF NOT EXISTS idx_claims_case ON claims(case_id);
CREATE INDEX IF NOT EXISTS idx_claims_role ON claims(party_role);
CREATE INDEX IF NOT EXISTS idx_case_law_subject ON case_law USING gin(subject_tags);
CREATE INDEX IF NOT EXISTS idx_case_law_citations_decision ON case_law_citations(decision_id);
CREATE INDEX IF NOT EXISTS idx_statutory_provisions_statute ON statutory_provisions(statute_name);
CREATE INDEX IF NOT EXISTS idx_transition_phrases_block ON transition_phrases USING gin(block_types);
CREATE INDEX IF NOT EXISTS idx_lessons_category ON lessons_learned(category);
CREATE INDEX IF NOT EXISTS idx_paragraph_embeddings_vec
ON paragraph_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
CREATE INDEX IF NOT EXISTS idx_case_law_embeddings_vec
ON case_law_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
"""
async def init_schema() -> None:
    """Run every schema script against the database, then log completion.

    All scripts are executed on a single pooled connection.
    """
    # Order matters: V2 creates tables that V3 and V4 alter, so it runs
    # before them even though it is defined later in the module.
    scripts = (
        SCHEMA_SQL,
        MIGRATIONS_SQL,
        SCHEMA_V2_SQL,
        SCHEMA_V3_SQL,
        SCHEMA_V4_SQL,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        for script in scripts:
            await conn.execute(script)
    logger.info("Database schema initialized (v1 + v2 + v3 + v4)")
# ── Case CRUD ───────────────────────────────────────────────────────
async def create_case(
    case_number: str,
    title: str,
    appellants: list[str] | None = None,
    respondents: list[str] | None = None,
    subject: str = "",
    property_address: str = "",
    permit_number: str = "",
    committee_type: str = "ועדה מקומית",
    hearing_date: date | None = None,
    notes: str = "",
    expected_outcome: str = "",
    practice_area: str = "appeals_committee",
    appeal_subtype: str | None = None,
) -> dict:
    """Insert a new case row and return it as loaded by ``get_case``.

    A fresh UUID is generated client-side for the row. ``appellants`` and
    ``respondents`` are stored as JSON text; ``None`` is stored as an empty
    list. ``practice_area`` and ``appeal_subtype`` are written as given.
    """
    new_id = uuid4()
    insert_sql = """INSERT INTO cases (id, case_number, title, appellants, respondents,
subject, property_address, permit_number, committee_type,
hearing_date, notes, expected_outcome,
practice_area, appeal_subtype)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)"""
    # Positional values, in the same order as the column list above.
    row_values = (
        new_id,
        case_number,
        title,
        json.dumps(appellants or []),
        json.dumps(respondents or []),
        subject,
        property_address,
        permit_number,
        committee_type,
        hearing_date,
        notes,
        expected_outcome,
        practice_area,
        appeal_subtype,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(insert_sql, *row_values)
    return await get_case(new_id)
async def get_case_practice_area(case_id: UUID) -> tuple[str | None, str | None]:
    """Look up the practice-area pair stored on a case.

    Returns ``(practice_area, appeal_subtype)`` from the ``cases`` row whose
    id is *case_id*, or ``(None, None)`` when no such row exists.
    """
    query = "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow(query, case_id)
    if found is not None:
        return found["practice_area"], found["appeal_subtype"]
    return None, None
async def get_case_practice_area_by_number(case_number: str) -> tuple[str | None, str | None]:
    """Look up the practice-area pair of a case identified by its number.

    Returns ``(practice_area, appeal_subtype)`` from the ``cases`` row whose
    ``case_number`` equals *case_number*, or ``(None, None)`` when there is
    no such row.
    """
    query = "SELECT practice_area, appeal_subtype FROM cases WHERE case_number = $1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow(query, case_number)
    if found is not None:
        return found["practice_area"], found["appeal_subtype"]
    return None, None
async def get_case(case_id: UUID) -> dict | None:
    """Fetch one case by primary key.

    Returns the row converted by ``_row_to_case``, or ``None`` when no case
    has this id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id)
    return None if record is None else _row_to_case(record)
async def get_case_by_number(case_number: str) -> dict | None:
    """Fetch one case by its ``case_number``.

    Returns the row converted by ``_row_to_case``, or ``None`` when no case
    carries this number.
    """
    query = "SELECT * FROM cases WHERE case_number = $1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, case_number)
    return None if record is None else _row_to_case(record)
async def list_cases(status: str | None = None, limit: int = 50) -> list[dict]:
    """List cases, most recently updated first.

    Args:
        status: When truthy, only cases with exactly this status are
            returned; ``None`` or ``""`` disables the filter.
        limit: Maximum number of rows to return.

    Returns:
        A list of case dicts as produced by ``_row_to_case``.
    """
    # Pick the statement and its arguments up front, then run a single fetch.
    if status:
        query = "SELECT * FROM cases WHERE status = $1 ORDER BY updated_at DESC LIMIT $2"
        args = (status, limit)
    else:
        query = "SELECT * FROM cases ORDER BY updated_at DESC LIMIT $1"
        args = (limit,)
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *args)
    return list(map(_row_to_case, records))
async def update_case(case_id: UUID, **fields) -> dict | None:
    """Update the given columns of a case and return the refreshed row.

    Args:
        case_id: Primary key of the case to update.
        **fields: Column name -> new value. ``appellants``, ``respondents``
            and ``tags`` are JSON-encoded before being sent. With no fields
            the case is simply re-read.

    Returns:
        The case as returned by ``get_case`` (``None`` if it does not exist).

    Raises:
        ValueError: If a field name is not a plain identifier. Column names
            cannot be bound as query parameters and are interpolated into
            the SQL text, so anything that is not an identifier is rejected
            before the database is touched.
    """
    if not fields:
        return await get_case(case_id)
    invalid = [name for name in fields if not name.isidentifier()]
    if invalid:
        raise ValueError(f"Invalid column name(s) for cases update: {invalid!r}")
    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for the id in the WHERE clause; values start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key in ("appellants", "respondents", "tags"):
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    set_clauses.append("updated_at = now()")
    sql = f"UPDATE cases SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, case_id, *values)
    return await get_case(case_id)
def _row_to_case(row: asyncpg.Record) -> dict:
d = dict(row)
for field in ("appellants", "respondents", "tags"):
if isinstance(d.get(field), str):
d[field] = json.loads(d[field])
d["id"] = str(d["id"])
return d
# ── Document CRUD ───────────────────────────────────────────────────
async def create_document(
    case_id: UUID | None,
    doc_type: str,
    title: str,
    file_path: str,
    page_count: int | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> dict:
    """Insert a document row and return it converted by ``_row_to_doc``.

    When *practice_area* is not given and the document belongs to a case,
    both ``practice_area`` and ``appeal_subtype`` are copied from that case
    (if the case row exists). Documents without a case (``case_id=None``,
    the training corpus) store the values exactly as passed in.
    """
    new_id = uuid4()
    inherit_from_case = practice_area is None and case_id is not None
    insert_sql = """INSERT INTO documents (id, case_id, doc_type, title, file_path,
page_count, practice_area, appeal_subtype)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        if inherit_from_case:
            parent = await conn.fetchrow(
                "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1",
                case_id,
            )
            if parent:
                practice_area, appeal_subtype = (
                    parent["practice_area"],
                    parent["appeal_subtype"],
                )
        await conn.execute(
            insert_sql,
            new_id, case_id, doc_type, title, file_path, page_count,
            practice_area, appeal_subtype,
        )
        # Read the row back so column defaults filled in by the database
        # are part of the returned dict.
        inserted = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", new_id)
    return _row_to_doc(inserted)
async def update_document(doc_id: UUID, **fields) -> None:
    """Update the given columns of a document.

    Args:
        doc_id: Primary key of the document to update.
        **fields: Column name -> new value. ``metadata`` is JSON-encoded
            before being sent. With no fields this is a no-op.

    Raises:
        ValueError: If a field name is not a plain identifier. Column names
            cannot be bound as query parameters and are interpolated into
            the SQL text, so anything that is not an identifier is rejected
            before the database is touched.
    """
    if not fields:
        return
    invalid = [name for name in fields if not name.isidentifier()]
    if invalid:
        raise ValueError(f"Invalid column name(s) for documents update: {invalid!r}")
    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for the id in the WHERE clause; values start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key == "metadata":
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    sql = f"UPDATE documents SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, doc_id, *values)
async def get_document(doc_id: UUID) -> dict | None:
    """Fetch one document by primary key.

    Returns the row converted by ``_row_to_doc``, or ``None`` when no
    document has this id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id)
    if not record:
        return None
    return _row_to_doc(record)
async def list_documents(case_id: UUID) -> list[dict]:
    """Return all documents of a case, oldest first (ordered by created_at).

    Each row is converted by ``_row_to_doc``.
    """
    query = "SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at"
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, case_id)
    return list(map(_row_to_doc, records))
async def get_document_text(doc_id: UUID) -> str:
    """Return the ``extracted_text`` column of a document.

    An empty string is returned when the document does not exist.
    """
    query = "SELECT extracted_text FROM documents WHERE id = $1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, doc_id)
    if not record:
        return ""
    return record["extracted_text"]
def _row_to_doc(row: asyncpg.Record) -> dict:
d = dict(row)
d["id"] = str(d["id"])
d["case_id"] = str(d["case_id"])
if isinstance(d.get("metadata"), str):
d["metadata"] = json.loads(d["metadata"])
return d
# ── Claims ─────────────────────────────────────────────────────────
async def store_claims(case_id: UUID, claims: list[dict], source_document: str = "") -> int:
    """Store extracted claims for a case and return how many were given.

    When *source_document* is non-empty, claims previously stored for the
    same case and source are deleted first, so re-running extraction
    replaces them. The delete and all inserts run in one transaction: if an
    insert fails (for example a claim dict lacks a required key), the old
    claims are kept instead of being lost.

    Each claim dict: ``party_role`` and ``claim_text`` (required);
    ``party_name``, ``claim_index`` and ``claim_type`` (optional, defaulting
    to ``""``, ``0`` and ``"claim"``).
    """
    # NOTE(review): claim_type is not among the columns of the CREATE TABLE
    # claims statement in SCHEMA_V2_SQL; presumably a migration elsewhere
    # adds it — confirm.
    insert_sql = """INSERT INTO claims (case_id, party_role, party_name, claim_text, claim_index, source_document, claim_type)
VALUES ($1, $2, $3, $4, $5, $6, $7)"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            if source_document:
                await conn.execute(
                    "DELETE FROM claims WHERE case_id = $1 AND source_document = $2",
                    case_id, source_document,
                )
            for claim in claims:
                await conn.execute(
                    insert_sql,
                    case_id,
                    claim["party_role"],
                    claim.get("party_name", ""),
                    claim["claim_text"],
                    claim.get("claim_index", 0),
                    source_document,
                    claim.get("claim_type", "claim"),
                )
    return len(claims)
async def get_claims(case_id: UUID, party_role: str | None = None) -> list[dict]:
    """Return the claims stored for a case as plain dicts.

    With a truthy *party_role* only that party's claims are returned,
    ordered by ``claim_index``; otherwise all claims are returned, ordered
    by ``party_role`` and then ``claim_index``.
    """
    if party_role:
        query = "SELECT * FROM claims WHERE case_id = $1 AND party_role = $2 ORDER BY claim_index"
        args = (case_id, party_role)
    else:
        query = "SELECT * FROM claims WHERE case_id = $1 ORDER BY party_role, claim_index"
        args = (case_id,)
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *args)
    return list(map(dict, records))
# ── Decisions ──────────────────────────────────────────────────────
async def create_decision(
    case_id: UUID,
    outcome: str = "",
    outcome_summary: str = "",
    outcome_reasoning: str = "",
    direction_doc: dict | None = None,
) -> dict:
    """Insert a new decision version for a case and return it.

    The version is one above the highest existing version for the case, or 1
    when the case has no decision yet. ``practice_area`` and
    ``appeal_subtype`` are copied from the case row (``None`` when the case
    is missing). A falsy *direction_doc* is stored as NULL, otherwise as
    JSON text. The result is loaded through ``get_decision``.
    """
    new_id = uuid4()
    insert_sql = """INSERT INTO decisions (id, case_id, version, outcome, outcome_summary,
outcome_reasoning, direction_doc,
practice_area, appeal_subtype)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)"""
    if direction_doc:
        direction_json = json.dumps(direction_doc)
    else:
        direction_json = None
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Latest existing version for this case, if any.
        latest = await conn.fetchrow(
            "SELECT id, version FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1",
            case_id,
        )
        if latest:
            version = latest["version"] + 1
        else:
            version = 1
        parent = await conn.fetchrow(
            "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1", case_id
        )
        if parent:
            practice_area = parent["practice_area"]
            appeal_subtype = parent["appeal_subtype"]
        else:
            practice_area = appeal_subtype = None
        await conn.execute(
            insert_sql,
            new_id, case_id, version, outcome, outcome_summary,
            outcome_reasoning, direction_json,
            practice_area, appeal_subtype,
        )
    return await get_decision(new_id)
async def get_decision(decision_id: UUID) -> dict | None:
    """Fetch one decision by primary key.

    Returns a plain dict with ``id`` and ``case_id`` as strings and with
    ``direction_doc`` / ``panel_members`` JSON-decoded when they arrive as
    strings; ``None`` when no decision has this id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM decisions WHERE id = $1", decision_id)
    if not record:
        return None
    decision = dict(record)
    decision.update(id=str(decision["id"]), case_id=str(decision["case_id"]))
    for json_col in ("direction_doc", "panel_members"):
        raw = decision.get(json_col)
        if isinstance(raw, str):
            decision[json_col] = json.loads(raw)
    return decision
async def get_decision_by_case(case_id: UUID) -> dict | None:
    """Fetch the decision with the highest version for a case.

    Returns a plain dict with ``id`` and ``case_id`` as strings and with
    ``direction_doc`` / ``panel_members`` JSON-decoded when they arrive as
    strings; ``None`` when the case has no decision.
    """
    query = "SELECT * FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, case_id)
    if not record:
        return None
    decision = dict(record)
    decision.update(id=str(decision["id"]), case_id=str(decision["case_id"]))
    for json_col in ("direction_doc", "panel_members"):
        raw = decision.get(json_col)
        if isinstance(raw, str):
            decision[json_col] = json.loads(raw)
    return decision
async def update_decision(decision_id: UUID, **fields) -> None:
    """Update the given columns of a decision and bump ``updated_at``.

    Args:
        decision_id: Primary key of the decision to update.
        **fields: Column name -> new value. ``direction_doc`` and
            ``panel_members`` are JSON-encoded when passed as dict or list.
            With no fields this is a no-op.

    Raises:
        ValueError: If a field name is not a plain identifier. Column names
            cannot be bound as query parameters and are interpolated into
            the SQL text, so anything that is not an identifier is rejected
            before the database is touched.
    """
    if not fields:
        return
    invalid = [name for name in fields if not name.isidentifier()]
    if invalid:
        raise ValueError(f"Invalid column name(s) for decisions update: {invalid!r}")
    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for the id in the WHERE clause; values start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key in ("direction_doc", "panel_members") and isinstance(val, (dict, list)):
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    set_clauses.append("updated_at = now()")
    sql = f"UPDATE decisions SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, decision_id, *values)
# ── Chunks & Vectors ───────────────────────────────────────────────
async def delete_document_chunks(document_id: UUID) -> int:
    """Remove every chunk of a document (used before reprocessing).

    Returns the number of deleted rows, parsed from the command status
    string that ``execute`` returns.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM document_chunks WHERE document_id = $1", document_id
        )
    # The status looks like "DELETE 5"; the last token is the row count.
    deleted = status.rsplit(maxsplit=1)[-1]
    return int(deleted)
async def store_chunks(
    document_id: UUID,
    case_id: UUID | None,
    chunks: list[dict],
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> int:
    """Replace a document's chunks with *chunks* and return how many were given.

    Each chunk dict has: ``chunk_index``, ``content``, ``embedding``
    (required); ``section_type`` (default ``"other"``) and ``page_number``
    (optional).

    ``practice_area`` / ``appeal_subtype`` are resolved in priority order:
    the explicit arguments, then the parent case (when *case_id* is given),
    then the parent document — so chunks without a case (training corpus)
    still carry a value that vector search can filter on.

    The delete of the old chunks and the inserts of the new ones run in one
    transaction: if an insert fails, the previous chunks are kept instead of
    leaving the document without any.
    """
    insert_sql = """INSERT INTO document_chunks
(document_id, case_id, chunk_index, content, section_type,
embedding, page_number, practice_area, appeal_subtype)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Resolve practice_area in priority order: explicit > case > document.
        if practice_area is None:
            if case_id is not None:
                case_row = await conn.fetchrow(
                    "SELECT practice_area, appeal_subtype FROM cases WHERE id = $1",
                    case_id,
                )
                if case_row:
                    practice_area = case_row["practice_area"]
                    appeal_subtype = case_row["appeal_subtype"]
            if practice_area is None:
                doc_row = await conn.fetchrow(
                    "SELECT practice_area, appeal_subtype FROM documents WHERE id = $1",
                    document_id,
                )
                if doc_row:
                    practice_area = doc_row["practice_area"]
                    appeal_subtype = doc_row["appeal_subtype"]
        async with conn.transaction():
            # Delete existing chunks for this document, then write the new set.
            await conn.execute(
                "DELETE FROM document_chunks WHERE document_id = $1", document_id
            )
            for chunk in chunks:
                await conn.execute(
                    insert_sql,
                    document_id, case_id,
                    chunk["chunk_index"],
                    chunk["content"],
                    chunk.get("section_type", "other"),
                    chunk["embedding"],
                    chunk.get("page_number"),
                    practice_area, appeal_subtype,
                )
    return len(chunks)
async def search_similar(
    query_embedding: list[float],
    limit: int = 10,
    case_id: UUID | None = None,
    section_type: str | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Cosine similarity search on document chunks.

    Filter by practice_area to keep precedents from the same legal domain
    (e.g. don't surface betterment-levy chunks when working on building
    permits). All filters use the denormalized columns on document_chunks,
    so filtering itself needs no JOIN.

    Chunks without a parent case (``case_id IS NULL``, e.g. training-corpus
    documents) are included unless ``case_id`` is given; for those rows
    ``case_number`` is None.

    Args:
        query_embedding: Embedding vector of the query.
        limit: Maximum number of rows to return.
        case_id: Restrict to chunks of one case.
        section_type: Restrict to one section type.
        practice_area: Restrict to one practice area.
        appeal_subtype: Restrict to one appeal subtype.

    Returns:
        Dicts with content, section_type, page_number, document_id, case_id,
        document_title, case_number and score (1 - distance), best first.
    """
    pool = await get_pool()
    params: list = [query_embedding, limit]
    conditions: list[str] = []
    optional_filters = (
        ("dc.case_id", case_id),
        ("dc.section_type", section_type),
        ("dc.practice_area", practice_area),
        ("dc.appeal_subtype", appeal_subtype),
    )
    for column, value in optional_filters:
        if value:
            params.append(value)
            # $1 and $2 are taken, so the placeholder index is the new length.
            conditions.append(f"{column} = ${len(params)}")
    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""

    # LEFT JOIN on cases: chunks are stored with case_id NULL for case-less
    # documents, and an inner join would silently drop them from every search.
    sql = f"""
        SELECT dc.content, dc.section_type, dc.page_number,
               dc.document_id, dc.case_id,
               d.title AS document_title,
               c.case_number,
               1 - (dc.embedding <=> $1) AS score
        FROM document_chunks dc
        JOIN documents d ON d.id = dc.document_id
        LEFT JOIN cases c ON c.id = dc.case_id
        {where}
        ORDER BY dc.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)
    return [dict(r) for r in rows]
# ── Style corpus ────────────────────────────────────────────────────
async def add_to_style_corpus(
    document_id: UUID | None,
    decision_number: str,
    decision_date: date | None,
    subject_categories: list[str],
    full_text: str,
    summary: str = "",
    outcome: str = "",
    key_principles: list[str] | None = None,
    practice_area: str = "appeals_committee",
    appeal_subtype: str | None = None,
) -> UUID:
    """Insert one decision into style_corpus and return its generated id.

    ``subject_categories`` and ``key_principles`` are serialized with
    ``json.dumps`` before insertion; a missing or empty ``key_principles``
    is stored as an empty JSON list.

    Returns:
        The UUID generated for the new style_corpus row.
    """
    pool = await get_pool()
    new_id = uuid4()
    insert_sql = """INSERT INTO style_corpus
        (id, document_id, decision_number, decision_date,
         subject_categories, full_text, summary, outcome, key_principles,
         practice_area, appeal_subtype)
    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)"""
    values = (
        new_id,
        document_id,
        decision_number,
        decision_date,
        json.dumps(subject_categories),
        full_text,
        summary,
        outcome,
        json.dumps(key_principles or []),
        practice_area,
        appeal_subtype,
    )
    async with pool.acquire() as conn:
        await conn.execute(insert_sql, *values)
    return new_id
async def delete_from_style_corpus(corpus_id: UUID) -> dict:
    """Remove a decision from style_corpus together with its document.

    If the corpus row references a document, that document is deleted
    (its chunks are expected to go with it via cascade). Otherwise — the
    current training pipeline inserts style_corpus with document_id=NULL —
    a best-effort lookup finds a case-less document whose title contains
    the decision number, and deletes it only when exactly one matches.

    Everything runs in one transaction.

    Args:
        corpus_id: Id of the style_corpus row to delete.

    Returns:
        ``{"deleted": False, "reason": "not found"}`` when no such row
        exists, otherwise ``{"deleted": True, "decision_number": ...,
        "docs_deleted": 0 | 1}``.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            row = await conn.fetchrow(
                "DELETE FROM style_corpus WHERE id = $1 "
                "RETURNING decision_number, document_id",
                corpus_id,
            )
            if not row:
                return {"deleted": False, "reason": "not found"}

            decision_number = row["decision_number"]
            docs_deleted = 0
            if row["document_id"]:
                await conn.execute(
                    "DELETE FROM documents WHERE id = $1", row["document_id"]
                )
                docs_deleted = 1
            elif decision_number:
                # Escape LIKE metacharacters (backslash is the default escape
                # character) so the decision number is matched literally;
                # an unescaped "%" or "_" could select an unrelated document
                # for deletion.
                literal = (
                    str(decision_number)
                    .replace("\\", "\\\\")
                    .replace("%", "\\%")
                    .replace("_", "\\_")
                )
                docs = await conn.fetch(
                    "SELECT id FROM documents "
                    "WHERE case_id IS NULL AND title LIKE $1",
                    f"%{literal}%",
                )
                # Only act on a single, unambiguous match.
                if len(docs) == 1:
                    await conn.execute(
                        "DELETE FROM documents WHERE id = $1", docs[0]["id"]
                    )
                    docs_deleted = 1

            return {
                "deleted": True,
                "decision_number": decision_number,
                "docs_deleted": docs_deleted,
            }
async def get_style_patterns(pattern_type: str | None = None) -> list[dict]:
    """Fetch style patterns, optionally restricted to one pattern type.

    With a ``pattern_type`` the rows are ordered by descending frequency;
    without one, all rows are returned ordered by pattern type and then by
    descending frequency.

    Returns:
        One dict per style_patterns row.
    """
    if pattern_type:
        query = "SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC"
        args: tuple = (pattern_type,)
    else:
        query = "SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC"
        args = ()

    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *args)
    return list(map(dict, records))
async def upsert_style_pattern(
    pattern_type: str,
    pattern_text: str,
    context: str = "",
    examples: list[str] | None = None,
) -> None:
    """Record one occurrence of a style pattern.

    If a row with the same ``pattern_type`` and ``pattern_text`` exists, its
    frequency is incremented and ``context`` / ``examples`` are ignored.
    Otherwise a new row is inserted with ``examples`` serialized as JSON
    (an empty list when none are given).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        match = await conn.fetchrow(
            "SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2",
            pattern_type, pattern_text,
        )
        if not match:
            # First sighting of this pattern: create the row.
            await conn.execute(
                """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples)
                VALUES ($1, $2, $3, $4)""",
                pattern_type, pattern_text, context,
                json.dumps(examples or []),
            )
            return
        # Known pattern: only bump its counter.
        await conn.execute(
            "UPDATE style_patterns SET frequency = frequency + 1 WHERE id = $1",
            match["id"],
        )
async def clear_style_patterns() -> None:
    """Remove every row from style_patterns (used before re-analysis)."""
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        await connection.execute("DELETE FROM style_patterns")
# ── Semantic Search (V2 — decision blocks & case law) ─────────────
async def search_similar_paragraphs(
    query_embedding: list[float],
    limit: int = 10,
    block_type: str | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Find decision paragraphs semantically closest to a query embedding.

    The practice_area / appeal_subtype filters read the denormalized columns
    on `decisions`, so that e.g. betterment-levy paragraphs are not pulled
    while writing a building-permit decision.

    Returns:
        Dicts with content, word_count, paragraph_number, block_type,
        block_title, case_number, case_title, outcome, author and score
        (1 - distance), best first.
    """
    pool = await get_pool()
    params: list = [query_embedding, limit]
    clauses: list[str] = []
    for column, value in (
        ("db.block_id", block_type),
        ("d.practice_area", practice_area),
        ("d.appeal_subtype", appeal_subtype),
    ):
        if not value:
            continue
        params.append(value)
        # $1 and $2 are reserved for the embedding and the limit.
        clauses.append(f"{column} = ${len(params)}")

    where = ""
    if clauses:
        where = f"WHERE {' AND '.join(clauses)}"

    sql = f"""
        SELECT dp.content, dp.word_count, dp.paragraph_number,
               db.block_id AS block_type, db.title AS block_title,
               c.case_number, c.title AS case_title,
               d.outcome, d.author,
               1 - (pe.embedding <=> $1) AS score
        FROM paragraph_embeddings pe
        JOIN decision_paragraphs dp ON dp.id = pe.paragraph_id
        JOIN decision_blocks db ON db.id = dp.block_id
        JOIN decisions d ON d.id = db.decision_id
        JOIN cases c ON c.id = d.case_id
        {where}
        ORDER BY pe.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *params)
    return list(map(dict, records))
async def search_similar_case_law(
    query_embedding: list[float],
    limit: int = 5,
) -> list[dict]:
    """Find case-law chunks semantically closest to a query embedding.

    Returns:
        Dicts with case_number, case_name, court, summary, key_quote,
        subject_tags, chunk_text and score (1 - distance), best first.
        ``subject_tags`` is JSON-decoded when it arrives as a string.
    """

    def _as_dict(record) -> dict:
        item = dict(record)
        tags = item.get("subject_tags")
        # Decode only when the driver hands back the raw JSON text.
        if isinstance(tags, str):
            item["subject_tags"] = json.loads(tags)
        return item

    pool = await get_pool()
    sql = """
        SELECT cl.case_number, cl.case_name, cl.court, cl.summary,
               cl.key_quote, cl.subject_tags,
               cle.chunk_text,
               1 - (cle.embedding <=> $1) AS score
        FROM case_law_embeddings cle
        JOIN case_law cl ON cl.id = cle.case_law_id
        ORDER BY cle.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, query_embedding, limit)
    return [_as_dict(record) for record in records]
async def search_precedents(
    query_embedding: list[float],
    limit: int = 10,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Combined search: decision paragraphs + case law, ranked by score.

    Up to ``limit`` hits are fetched from each source, merged, sorted by
    descending score and truncated to ``limit``.

    Args:
        query_embedding: Embedding vector of the query.
        limit: Maximum number of results to return.
        practice_area: Optional practice-area filter, forwarded to the
            decision-paragraph search so paragraphs from other legal
            domains are not mixed in. Case-law hits are not filtered by it.
        appeal_subtype: Optional appeal-subtype filter, forwarded the same
            way.

    Returns:
        Dicts tagged with ``type`` ``"decision_paragraph"`` (score,
        case_number, case_title, block_type, content truncated to 500
        characters, author) or ``"case_law"`` (score, case_number,
        case_name, court, content = summary).
    """
    paragraphs = await search_similar_paragraphs(
        query_embedding,
        limit=limit,
        practice_area=practice_area,
        appeal_subtype=appeal_subtype,
    )
    case_law = await search_similar_case_law(query_embedding, limit=limit)

    results = [
        {
            "type": "decision_paragraph",
            "score": float(p["score"]),
            "case_number": p["case_number"],
            "case_title": p["case_title"],
            "block_type": p["block_type"],
            "content": p["content"][:500],
            "author": p["author"],
        }
        for p in paragraphs
    ]
    results.extend(
        {
            "type": "case_law",
            "score": float(c["score"]),
            "case_number": c["case_number"],
            "case_name": c["case_name"],
            "court": c["court"],
            "content": c["summary"],
        }
        for c in case_law
    )
    # Highest score first; the sort is stable, so ties keep paragraphs
    # ahead of case law.
    results.sort(key=lambda hit: hit["score"], reverse=True)
    return results[:limit]