Files
legal-ai/mcp-server/src/legal_mcp/services/db.py
Chaim ad4350029a fix(style-acq T1): insert_style_exemplar — vector כ-list לא str (register_vector)
asyncpg עם pgvector register_vector מקבל את ה-embedding כ-list[float] ישירות;
str() גרם ל-DataError. תוקן בהתאם לדפוס store_*_image_embeddings.
Backfill הורץ בהצלחה: 2670 דוגמאות מ-83 החלטות.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 18:14:56 +00:00

5158 lines
207 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Database service - asyncpg connection pool and queries."""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import re
from datetime import date
from uuid import UUID, uuid4
import asyncpg
from pgvector.asyncpg import register_vector
from legal_mcp import config
from legal_mcp.services import halacha_quality
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Shared connection pool. None until get_pool() builds it, and None again
# after close_pool().
_pool: asyncpg.Pool | None = None
# Flipped to True by get_pool() once _run_schema_migrations() has completed.
_schema_ready: bool = False
# Guards the one-time pool creation / schema initialisation in get_pool().
_init_lock: asyncio.Lock = asyncio.Lock()
async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg pool, building it on first use.

    Pool creation and schema initialisation are lazy: the MCP server's
    `lifespan` does not block on them, so the `initialize`/`tools/list`
    handshake stays immediate and Claude Code never sees a stale
    "No such tool". The first caller that needs the database pays the cost.

    Returns:
        The module-wide connection pool, with schema migrations applied.
    """
    global _pool, _schema_ready

    # Fast path: everything is already set up, no lock needed.
    if _schema_ready and _pool is not None:
        return _pool

    async with _init_lock:
        # Re-check under the lock: another task may have finished the setup
        # while this one was waiting.
        if _pool is None:
            # The extensions must exist before the pool is created, because
            # every pooled connection registers the vector type codec.
            bootstrap = await asyncpg.connect(config.POSTGRES_URL)
            try:
                for extension_sql in (
                    'CREATE EXTENSION IF NOT EXISTS vector',
                    'CREATE EXTENSION IF NOT EXISTS "uuid-ossp"',
                ):
                    await bootstrap.execute(extension_sql)
            finally:
                await bootstrap.close()

            _pool = await asyncpg.create_pool(
                config.POSTGRES_URL,
                min_size=2,
                max_size=10,
                init=_init_connection,
            )

        if not _schema_ready:
            await _run_schema_migrations(_pool)
            # Only mark ready after the migrations returned without raising,
            # so a failed run is retried by the next caller.
            _schema_ready = True

        return _pool
async def _init_connection(conn: asyncpg.Connection) -> None:
    """Per-connection setup hook, passed to the pool as ``init=`` in get_pool().

    Calls pgvector's ``register_vector`` on ``conn`` so the connection has
    the vector type codec registered.
    """
    await register_vector(conn)
async def close_pool() -> None:
    """Close the shared connection pool, if there is one, and forget it.

    Does nothing when no pool has been created. The module-level handle is
    cleared even when ``close()`` raises or is cancelled, so a later
    get_pool() call builds a fresh pool instead of returning a dead one.
    Any exception from ``close()`` still propagates to the caller.
    """
    global _pool
    if _pool is None:
        return
    try:
        await _pool.close()
    finally:
        # asyncpg terminates the pool when close() fails or is cancelled, so
        # the handle is unusable on that path too and must not be kept.
        _pool = None
# ── Schema ──────────────────────────────────────────────────────────
# Base schema script. Creates the `cases`, `documents`, `document_chunks`,
# `style_corpus` and `style_patterns` tables, adds `documents.content_hash`,
# and creates their indexes (including an ivfflat cosine index on
# `document_chunks.embedding`). Every statement is guarded by IF NOT EXISTS,
# so re-running it against an up-to-date database changes nothing.
# It uses `uuid_generate_v4()` and the `vector` column type, i.e. it depends
# on the "uuid-ossp" and `vector` extensions that get_pool() creates.
# NOTE(review): presumably executed by _run_schema_migrations() — confirm.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS cases (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_number TEXT UNIQUE NOT NULL,
title TEXT NOT NULL,
appellants JSONB DEFAULT '[]',
respondents JSONB DEFAULT '[]',
subject TEXT DEFAULT '',
property_address TEXT DEFAULT '',
permit_number TEXT DEFAULT '',
committee_type TEXT DEFAULT 'ועדה מקומית',
status TEXT DEFAULT 'new',
hearing_date DATE,
decision_date DATE,
tags JSONB DEFAULT '[]',
notes TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS documents (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
doc_type TEXT NOT NULL,
title TEXT NOT NULL,
file_path TEXT NOT NULL,
extracted_text TEXT DEFAULT '',
extraction_status TEXT DEFAULT 'pending',
page_count INTEGER,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT now()
);
-- INV-TOOL3 / GAP-52: SHA-256 of the uploaded file bytes, for idempotent upload
-- (re-uploading the same file to a case returns the existing document). Empty
-- default = legacy rows with unknown hash; never matched as a duplicate.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS content_hash text NOT NULL DEFAULT '';
CREATE TABLE IF NOT EXISTS document_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_type TEXT DEFAULT 'other',
embedding vector(1024),
page_number INTEGER,
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_corpus (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
decision_number TEXT,
decision_date DATE,
subject_categories JSONB DEFAULT '[]',
full_text TEXT NOT NULL,
summary TEXT DEFAULT '',
outcome TEXT DEFAULT '',
key_principles JSONB DEFAULT '[]',
practice_area TEXT DEFAULT 'appeals_committee',
appeal_subtype TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_patterns (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
pattern_type TEXT NOT NULL,
pattern_text TEXT NOT NULL,
frequency INTEGER DEFAULT 1,
context TEXT DEFAULT '',
examples JSONB DEFAULT '[]',
appeal_subtype TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_chunks_embedding
ON document_chunks USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id);
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id);
CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status);
CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number);
"""
# Early additive migration: adds `cases.expected_outcome` and creates the
# `audit_log` table with its three indexes. All statements use IF NOT EXISTS,
# so the script can be re-applied safely. `audit_log` references `cases` and
# `documents`, so those tables must already exist when it runs.
MIGRATIONS_SQL = """
ALTER TABLE cases ADD COLUMN IF NOT EXISTS expected_outcome TEXT DEFAULT '';
CREATE TABLE IF NOT EXISTS audit_log (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
action TEXT NOT NULL,
case_id UUID REFERENCES cases(id) ON DELETE SET NULL,
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
details JSONB DEFAULT '{}',
actor TEXT DEFAULT 'system',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_audit_case ON audit_log(case_id);
CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_log(action);
CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_log(created_at DESC);
"""
# ── Phase 3: Workflow expansion ────────────────────────────────────
# Adds columns to `decisions`, `cases`, `style_corpus`, `style_patterns` and
# `decision_blocks`, and creates `decision_lessons`, `chat_conversations`,
# `chat_messages`, `qa_results`, `decision_definitions` and
# `appeal_type_rules` with their indexes. All statements are IF NOT EXISTS.
# The script ALTERs and references `decisions` / `decision_blocks`, which are
# created by SCHEMA_V2_SQL (defined further down in this file), so on a fresh
# database it has to run after that script.
# NOTE(review): confirm the execution order in _run_schema_migrations().
SCHEMA_V3_SQL = """
-- הרחבת decisions עם שדות חדשים
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS direction_doc JSONB DEFAULT NULL;
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS outcome_reasoning TEXT DEFAULT '';
-- הרחבת cases עם appeal_type (אם לא קיים)
ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_type TEXT DEFAULT '';
ALTER TABLE cases ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
-- active_draft_path = path to the DOCX that is the current source of truth
-- for this case's decision text. Set to the latest טיוטה-v*.docx after export,
-- or the latest עריכה-v*.docx after user upload. Used by revise_draft to know
-- what file to base Track Changes revisions on.
ALTER TABLE cases ADD COLUMN IF NOT EXISTS active_draft_path TEXT;
-- הרחבת style_corpus עם practice_area / appeal_subtype
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
-- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר
ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
-- decision_lessons: per-decision learnings the chair / curator / style_analyzer
-- attaches to a corpus row. The generic legal-decision-lessons.md file stays
-- as the source of truth for cross-corpus patterns; this table stores the
-- granular "what we learned from THIS decision" notes that drive the writer's
-- future drafts and let the curator look up prior observations on the same row.
CREATE TABLE IF NOT EXISTS decision_lessons (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
style_corpus_id UUID NOT NULL REFERENCES style_corpus(id) ON DELETE CASCADE,
lesson_text TEXT NOT NULL,
category TEXT DEFAULT 'general', -- style / structure / lexicon / tabular / general
source TEXT DEFAULT 'manual', -- manual / curator / chair / style_analyzer
applied_to_skill BOOLEAN DEFAULT false, -- has this been promoted into SKILL.md?
created_by TEXT DEFAULT 'chaim',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_decision_lessons_corpus ON decision_lessons(style_corpus_id);
CREATE INDEX IF NOT EXISTS idx_decision_lessons_applied ON decision_lessons(applied_to_skill);
-- chat_conversations / chat_messages: persistent history for the
-- "שיחה עם הסוכן" tab on /training. Each conversation can optionally be
-- scoped to a single style_corpus row (when the chair starts a chat
-- "about decision X"). claude_session_id is the value the local claude
-- CLI returns in stream-json — we pass it back via `--resume` on the
-- next message so the model continues the same conversation without
-- re-loading the system prompt every time.
CREATE TABLE IF NOT EXISTS chat_conversations (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
title TEXT NOT NULL DEFAULT 'שיחה חדשה',
style_corpus_id UUID REFERENCES style_corpus(id) ON DELETE SET NULL,
claude_session_id TEXT,
system_prompt_version TEXT DEFAULT 'v1',
created_at TIMESTAMPTZ DEFAULT now(),
last_message_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS chat_messages (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
conversation_id UUID NOT NULL REFERENCES chat_conversations(id) ON DELETE CASCADE,
role TEXT NOT NULL, -- 'user' | 'assistant'
content TEXT NOT NULL,
raw_events JSONB DEFAULT '[]', -- stream-json events for the assistant turn (optional, for debug)
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_chat_messages_conv ON chat_messages(conversation_id, created_at);
CREATE INDEX IF NOT EXISTS idx_chat_conv_corpus ON chat_conversations(style_corpus_id);
CREATE INDEX IF NOT EXISTS idx_chat_conv_last ON chat_conversations(last_message_at DESC);
-- טבלת qa_results
CREATE TABLE IF NOT EXISTS qa_results (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
check_name TEXT NOT NULL,
passed BOOLEAN NOT NULL,
severity TEXT DEFAULT 'warning',
errors JSONB DEFAULT '[]',
details TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_qa_results_decision ON qa_results(decision_id);
CREATE INDEX IF NOT EXISTS idx_qa_results_case ON qa_results(case_id);
-- טבלת decision_definitions (אם לא קיימת)
CREATE TABLE IF NOT EXISTS decision_definitions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
term TEXT NOT NULL,
definition TEXT NOT NULL,
block_id TEXT DEFAULT 'block-he',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_definitions_decision ON decision_definitions(decision_id);
-- טבלת appeal_type_rules (אם לא קיימת)
CREATE TABLE IF NOT EXISTS appeal_type_rules (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
appeal_type TEXT NOT NULL,
rule_category TEXT NOT NULL,
rule_key TEXT NOT NULL,
rule_value JSONB NOT NULL,
description TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(appeal_type, rule_category, rule_key)
);
-- image_placeholders על decision_blocks
ALTER TABLE decision_blocks ADD COLUMN IF NOT EXISTS image_placeholders JSONB DEFAULT '[]';
"""
# ── Phase 2: Decision + Knowledge + RAG layers ────────────────────
# Creates the decision layer (`decisions`, `decision_blocks`,
# `decision_paragraphs`, `claims`), the legal-knowledge layer (`case_law`,
# `case_law_citations`, `statutory_provisions`, `transition_phrases`,
# `lessons_learned`), the extended RAG tables (`paragraph_embeddings`,
# `case_law_embeddings`), plus `chair_feedback` and `tag_company_mappings`,
# followed by their indexes. All statements are IF NOT EXISTS.
# Several tables reference `cases(id)`, so the base schema must exist first.
SCHEMA_V2_SQL = """
-- ═══════════════════════════════════════════════════════════════════
-- Layer 2: Decision
-- ═══════════════════════════════════════════════════════════════════
-- decisions: מטאדטה של החלטה (גרסה אחת = רשומה אחת)
CREATE TABLE IF NOT EXISTS decisions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
version INTEGER DEFAULT 1,
status TEXT DEFAULT 'draft', -- draft/review/final/published
outcome TEXT DEFAULT '', -- rejected/accepted/partial
outcome_summary TEXT DEFAULT '', -- תמצית תוצאה (שורה אחת)
total_paragraphs INTEGER DEFAULT 0,
total_words INTEGER DEFAULT 0,
decision_date DATE,
author TEXT DEFAULT 'דפנה תמיר',
panel_members JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(case_id, version)
);
-- decision_blocks: 12 בלוקים לפי block-schema.md
CREATE TABLE IF NOT EXISTS decision_blocks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
block_id TEXT NOT NULL, -- block-alef, block-bet, ... block-yod-bet
block_index INTEGER NOT NULL, -- 1-12
title TEXT DEFAULT '', -- כותרת הבלוק (ריק לבלוקים ללא כותרת)
content TEXT DEFAULT '', -- תוכן מלא (markdown)
word_count INTEGER DEFAULT 0,
weight_percent NUMERIC(5,2) DEFAULT 0, -- משקל בפועל (%)
generation_type TEXT DEFAULT '', -- template-fill/reproduction/paraphrase/...
model_used TEXT DEFAULT '', -- sonnet/opus/script
temperature NUMERIC(3,2) DEFAULT 0,
status TEXT DEFAULT 'empty', -- empty/draft/review/final
notes TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(decision_id, block_id)
);
-- decision_paragraphs: סעיפים בודדים עם מעקב ציטוטים
CREATE TABLE IF NOT EXISTS decision_paragraphs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
block_id UUID REFERENCES decision_blocks(id) ON DELETE CASCADE,
paragraph_number INTEGER NOT NULL, -- מספור רציף בתוך ההחלטה
content TEXT NOT NULL,
word_count INTEGER DEFAULT 0,
citations JSONB DEFAULT '[]', -- [{case_law_id, text, type}]
cross_references JSONB DEFAULT '[]', -- הפניות לסעיפים אחרים ["סעיף 5 לעיל"]
created_at TIMESTAMPTZ DEFAULT now()
);
-- claims: טענות צדדים (בלוק ז)
CREATE TABLE IF NOT EXISTS claims (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
party_role TEXT NOT NULL, -- appellant/respondent/permit_applicant/committee
party_name TEXT DEFAULT '',
claim_text TEXT NOT NULL,
claim_index INTEGER DEFAULT 0, -- סדר הופעה
source_document TEXT DEFAULT '', -- מאיזה מסמך חולצה הטענה
addressed_in_paragraph INTEGER, -- באיזה סעיף בדיון נענתה
created_at TIMESTAMPTZ DEFAULT now()
);
-- ═══════════════════════════════════════════════════════════════════
-- Layer 3: Legal Knowledge
-- ═══════════════════════════════════════════════════════════════════
-- case_law: פסיקה (תקדימים)
CREATE TABLE IF NOT EXISTS case_law (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_number TEXT UNIQUE NOT NULL, -- עע"מ 3975/22 או ערר 1011-03-25
case_name TEXT NOT NULL, -- שם קצר: "ב. קרן-נכסים"
court TEXT DEFAULT '', -- בג"ץ / עליון / מנהלי / ועדת ערר
date DATE,
subject_tags JSONB DEFAULT '[]', -- ["proprietary_claims", "parking"]
summary TEXT DEFAULT '', -- תמצית 2-3 משפטים
key_quote TEXT DEFAULT '', -- ציטוט מרכזי
full_text TEXT DEFAULT '', -- טקסט מלא אם זמין
source_url TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now()
);
-- case_law_citations: קשרים בין פסיקה להחלטות שלנו
CREATE TABLE IF NOT EXISTS case_law_citations (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE,
paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE SET NULL,
citation_type TEXT DEFAULT 'support', -- support/distinguish/overrule/obiter
context_text TEXT DEFAULT '', -- ההקשר שבו צוטט
created_at TIMESTAMPTZ DEFAULT now()
);
-- statutory_provisions: חקיקה נפוצה
CREATE TABLE IF NOT EXISTS statutory_provisions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
statute_name TEXT NOT NULL, -- "חוק התכנון והבנייה"
section_number TEXT NOT NULL, -- "152(א)(2)"
section_title TEXT DEFAULT '', -- "זכות ערר"
full_text TEXT DEFAULT '', -- נוסח הסעיף
common_usage TEXT DEFAULT '', -- מתי משתמשים
subject_tags JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(statute_name, section_number)
);
-- transition_phrases: ביטויי מעבר של דפנה
CREATE TABLE IF NOT EXISTS transition_phrases (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
phrase TEXT UNIQUE NOT NULL, -- "ועל מנת לא לצאת בחסר"
usage_context TEXT DEFAULT '', -- מתי להשתמש
block_types JSONB DEFAULT '[]', -- באילו בלוקים: ["block-yod"]
frequency INTEGER DEFAULT 1, -- כמה פעמים ראינו
source_decision TEXT DEFAULT '', -- מאיזו החלטה
created_at TIMESTAMPTZ DEFAULT now()
);
-- lessons_learned: לקחים מהשוואת טיוטות לגרסאות סופיות
CREATE TABLE IF NOT EXISTS lessons_learned (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
lesson_title TEXT NOT NULL, -- "Discussion = continuous essay, no sub-headers"
lesson_text TEXT NOT NULL, -- תיאור מלא
category TEXT DEFAULT '', -- structure/style/content/process
applies_to JSONB DEFAULT '[]', -- ["block-yod", "all"]
source_case TEXT DEFAULT '', -- "הכט 1180-1181"
severity TEXT DEFAULT 'important', -- critical/important/nice-to-have
created_at TIMESTAMPTZ DEFAULT now()
);
-- ═══════════════════════════════════════════════════════════════════
-- Layer 4: Extended RAG
-- ═══════════════════════════════════════════════════════════════════
-- paragraph_embeddings: embeddings של סעיפים בהחלטות
CREATE TABLE IF NOT EXISTS paragraph_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE CASCADE,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
-- case_law_embeddings: embeddings של פסיקה
CREATE TABLE IF NOT EXISTS case_law_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
chunk_text TEXT NOT NULL,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
-- ═══════════════════════════════════════════════════════════════════
-- Chair Feedback (הערות דפנה על טיוטות)
-- ═══════════════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS chair_feedback (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE SET NULL,
block_id TEXT DEFAULT '', -- block-yod, block-vav, etc.
feedback_text TEXT NOT NULL, -- ההערה של דפנה
category TEXT DEFAULT 'other', -- missing_content/wrong_tone/wrong_structure/factual_error/style/other
lesson_extracted TEXT DEFAULT '', -- הלקח שהופק
applied_to TEXT[] DEFAULT '{}', -- לאילו קבצים/כללים הלקח יושם
resolved BOOLEAN DEFAULT FALSE, -- האם הלקח יושם
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS tag_company_mappings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
tag TEXT NOT NULL, -- appeal_subtype value (e.g. building_permit)
tag_label TEXT NOT NULL DEFAULT '', -- Hebrew display label
company_id TEXT NOT NULL, -- Paperclip company UUID
company_name TEXT NOT NULL DEFAULT '', -- cached company name for display
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(tag, company_id)
);
-- ═══════════════════════════════════════════════════════════════════
-- Indexes
-- ═══════════════════════════════════════════════════════════════════
CREATE INDEX IF NOT EXISTS idx_decisions_case ON decisions(case_id);
CREATE INDEX IF NOT EXISTS idx_decisions_status ON decisions(status);
CREATE INDEX IF NOT EXISTS idx_decision_blocks_decision ON decision_blocks(decision_id);
CREATE INDEX IF NOT EXISTS idx_decision_blocks_block_id ON decision_blocks(block_id);
CREATE INDEX IF NOT EXISTS idx_decision_paragraphs_block ON decision_paragraphs(block_id);
CREATE INDEX IF NOT EXISTS idx_claims_case ON claims(case_id);
CREATE INDEX IF NOT EXISTS idx_claims_role ON claims(party_role);
CREATE INDEX IF NOT EXISTS idx_case_law_subject ON case_law USING gin(subject_tags);
CREATE INDEX IF NOT EXISTS idx_case_law_citations_decision ON case_law_citations(decision_id);
CREATE INDEX IF NOT EXISTS idx_statutory_provisions_statute ON statutory_provisions(statute_name);
CREATE INDEX IF NOT EXISTS idx_transition_phrases_block ON transition_phrases USING gin(block_types);
CREATE INDEX IF NOT EXISTS idx_lessons_category ON lessons_learned(category);
CREATE INDEX IF NOT EXISTS idx_paragraph_embeddings_vec
ON paragraph_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
CREATE INDEX IF NOT EXISTS idx_case_law_embeddings_vec
ON case_law_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
"""
# ── Phase 4: Methodology alignment ──────────────────────────────
# Purely additive: adds columns to `claims` (claim_type, claim_handling,
# bundle_group, handling_reason), `cases` (standard_of_review,
# subject_categories), `case_law` (precedent_level, is_binding, creac_role)
# and `decisions` (issue_order, claim_handling), plus three indexes.
# The SQL comment under each ALTER lists the values intended for that column.
# Requires the tables created by SCHEMA_V2_SQL to exist.
SCHEMA_V4_SQL = """
-- ═══════════════════════════════════════════════════════════════════
-- V4: Methodology alignment (decision-methodology.md)
-- ═══════════════════════════════════════════════════════════════════
-- claims: טיפול בטענות (bundle/skip) + סוג טענה
ALTER TABLE claims ADD COLUMN IF NOT EXISTS claim_type TEXT DEFAULT 'claim';
-- claim / response / reply
ALTER TABLE claims ADD COLUMN IF NOT EXISTS claim_handling TEXT DEFAULT 'address';
-- address (דיון מלא) / bundle (קיבוץ) / skip (דילוג)
ALTER TABLE claims ADD COLUMN IF NOT EXISTS bundle_group TEXT DEFAULT '';
-- שם הקבוצה לקיבוץ (למשל "פגמים פרוצדורליים")
ALTER TABLE claims ADD COLUMN IF NOT EXISTS handling_reason TEXT DEFAULT '';
-- נימוק לדילוג/קיבוץ (למשל "נבחנה ולא מצאנו ממש")
-- cases: תקן ביקורת + קטגוריות נושא
ALTER TABLE cases ADD COLUMN IF NOT EXISTS standard_of_review TEXT DEFAULT '';
-- "שיקול דעת תכנוני עצמאי" / "בחינת שומה מכרעת" / ...
ALTER TABLE cases ADD COLUMN IF NOT EXISTS subject_categories JSONB DEFAULT '[]';
-- ["חניה", "קווי בניין", "גובה", "שימוש חורג", ...]
-- case_law: רמת תקדים + מעמד
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS precedent_level TEXT DEFAULT '';
-- עליון / מנהלי / ועדת ערר ארצית / ועדת ערר מחוזית
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS is_binding BOOLEAN DEFAULT TRUE;
-- הלכה מחייבת (true) / אמרת אגב (false)
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS creac_role TEXT DEFAULT '';
-- rule (הנחה עליונה) / explanation (הרחבה) / analogy (אנלוגיה)
-- decisions: סדר סוגיות + תקן ביקורת
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS issue_order JSONB DEFAULT '[]';
-- סדר הסוגיות שנקבע ע"י המנצח: [{"title": "...", "type": "threshold/dispositive/secondary"}]
ALTER TABLE decisions ADD COLUMN IF NOT EXISTS claim_handling JSONB DEFAULT '{}';
-- {"overrides": [{"claim_id": "...", "handling": "bundle", "group": "..."}]}
-- indexes
CREATE INDEX IF NOT EXISTS idx_claims_handling ON claims(claim_handling);
CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type);
CREATE INDEX IF NOT EXISTS idx_case_law_level ON case_law(precedent_level);
"""
# ── Phase 5: Interim draft (appraiser facts + post-hearing flag) ───
# Creates `appraiser_facts` (one row per plan/permit fact, enforced by the
# CHECK on `fact_type`), adds the V5.1 `appraiser_side` column, and creates
# three indexes. The post-hearing flag needs no DDL: per the SQL comments at
# the end of the script it lives in the existing `documents.metadata` JSONB.
SCHEMA_V5_SQL = """
-- appraiser_facts: תכניות והיתרים שצוינו ע"י כל שמאי בנפרד.
-- בשונה מ-claims (שהוא טענה משפטית), כאן מאוחסנת עובדה עניינית מתוך השומה.
-- שימוש ראשי: זיהוי סתירות בין שמאים על איזו תכנית או היתר חל בנכס.
CREATE TABLE IF NOT EXISTS appraiser_facts (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE,
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
appraiser_name TEXT NOT NULL,
fact_type TEXT NOT NULL CHECK (fact_type IN ('plan', 'permit')),
identifier TEXT NOT NULL,
details JSONB NOT NULL DEFAULT '{}',
page_number INTEGER,
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_appraiser_facts_case ON appraiser_facts(case_id, fact_type);
CREATE INDEX IF NOT EXISTS idx_appraiser_facts_identifier ON appraiser_facts(case_id, identifier);
-- V5.1: appraiser_side — which party this appraiser represents.
-- Values: 'committee' (הוועדה), 'appellant' (העורר), 'deciding' (מכריע).
-- Required by extract_appraiser_facts; the chair tags it via the UI before extraction.
-- Set via documents.metadata.appraiser_side at upload/edit time, then propagated here
-- so that conflict rendering in block-tet can label each entry with its side.
ALTER TABLE appraiser_facts ADD COLUMN IF NOT EXISTS appraiser_side TEXT DEFAULT '';
CREATE INDEX IF NOT EXISTS idx_appraiser_facts_side ON appraiser_facts(case_id, appraiser_side);
-- documents.metadata.is_post_hearing: flag for materials submitted after the hearing
-- (השלמות טיעון, הצעות פשרה). Used by block-chet to include them in the proceedings narrative.
-- documents.metadata.appraiser_side: which side the appraiser represents (see above).
-- No schema change needed — uses existing JSONB metadata column.
"""
# ── V6: Case archiving ────────────────────────────────────────────
# Adds the nullable `cases.archived_at` timestamp and a partial index that
# covers only rows where it is set. Both statements are IF NOT EXISTS.
SCHEMA_V6_SQL = """
-- archived_at: timestamp when the case was moved to the archive screen.
-- NULL = active (default). Set via POST /api/cases/{case_number}/archive.
-- Cleared via POST /api/cases/{case_number}/restore.
-- The /api/cases endpoint filters out archived cases by default;
-- pass ?include_archived=true (or use /api/cases/archived) to see them.
ALTER TABLE cases ADD COLUMN IF NOT EXISTS archived_at TIMESTAMPTZ;
CREATE INDEX IF NOT EXISTS idx_cases_archived ON cases(archived_at) WHERE archived_at IS NOT NULL;
"""
# ── V7: External Precedent Library + halacha extraction ──────────
# Chair-uploaded external court rulings and other appeals committee decisions
# become an authoritative law corpus. Distinct from style_corpus (Daphna's
# style) and case_precedents (chair-attached quotes scoped to a single case).
#
# The script extends `case_law` with source/status/classification columns,
# adds a CHECK constraint on `case_law.practice_area`, and creates the
# `precedent_chunks` and `halachot` tables with their indexes.
# ADD CONSTRAINT has no IF NOT EXISTS form, so it is wrapped in a DO block
# that swallows `duplicate_object`; that keeps the script re-runnable.
# The final UNIQUE index on (case_law_id, halacha_index) fails if duplicate
# pairs already exist, hence the "requires clean data first" SQL comment.
SCHEMA_V7_SQL = """
-- case_law extensions: distinguish chair-uploaded full rulings from
-- auto-extracted citation stubs, and track ingestion progress.
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_kind TEXT DEFAULT 'cited_only';
-- 'external_upload' (chair uploaded full ruling) | 'cited_only' (stub from
-- references_extractor) | 'nevo_seed' (future: auto-fetched from Nevo).
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS document_id UUID REFERENCES documents(id) ON DELETE SET NULL;
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS extraction_status TEXT DEFAULT 'pending';
-- 'pending' | 'processing' | 'completed' | 'failed'
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_status TEXT DEFAULT 'pending';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS metadata_extraction_status TEXT DEFAULT 'pending';
-- 'pending' | 'processing' | 'completed' | 'failed'. Mirrors the
-- text/halacha status columns so the UI can show a live badge while the
-- local-MCP worker drains the metadata queue (previously only the
-- metadata_extraction_requested_at timestamp existed — no 'processing').
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT '';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT '';
-- chair-editable abstract shown in search results.
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT '';
-- 'court_ruling' | 'appeals_committee'
-- practice_area is closed to the three appeals committee domains.
DO $$ BEGIN
ALTER TABLE case_law ADD CONSTRAINT case_law_practice_area_check
CHECK (practice_area IN ('', 'rishuy_uvniya', 'betterment_levy', 'compensation_197'));
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind);
CREATE INDEX IF NOT EXISTS idx_case_law_practice ON case_law(practice_area, appeal_subtype);
-- precedent_chunks: full-text chunks of an uploaded ruling, with embeddings.
-- Analog of document_chunks for case_law rows where source_kind='external_upload'.
CREATE TABLE IF NOT EXISTS precedent_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_type TEXT DEFAULT 'other',
-- intro | facts | legal_analysis | ruling | conclusion | other
page_number INTEGER,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_case_law ON precedent_chunks(case_law_id);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_section ON precedent_chunks(case_law_id, section_type);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_vec
ON precedent_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
-- halachot: extracted binding rules. One halacha = one rule + verbatim quote.
-- Embedded separately for rule-precision semantic match (chunks centroid is
-- dominated by surrounding context). All halachot start as pending_review;
-- only approved/published rows are visible to search_precedent_library.
CREATE TABLE IF NOT EXISTS halachot (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
halacha_index INTEGER NOT NULL,
rule_statement TEXT NOT NULL,
rule_type TEXT DEFAULT 'binding',
-- binding | interpretive | procedural | obiter
reasoning_summary TEXT DEFAULT '',
supporting_quote TEXT NOT NULL,
page_reference TEXT DEFAULT '',
practice_areas TEXT[] DEFAULT '{}',
subject_tags TEXT[] DEFAULT '{}',
cites TEXT[] DEFAULT '{}',
confidence NUMERIC(3,2) DEFAULT 0.0,
quote_verified BOOLEAN DEFAULT FALSE,
review_status TEXT DEFAULT 'pending_review',
-- pending_review | approved | rejected | published | deferred (#84 snooze)
reviewer TEXT DEFAULT '',
reviewed_at TIMESTAMPTZ,
quality_flags TEXT[] DEFAULT '{}',
-- non_decision | truncated_quote | thin_restatement | quote_unverified
-- (any flag blocks auto-approve → routes to pending_review)
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
ALTER TABLE halachot ADD COLUMN IF NOT EXISTS quality_flags TEXT[] DEFAULT '{}';
CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id);
CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status);
CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas);
CREATE INDEX IF NOT EXISTS idx_halachot_tags ON halachot USING gin(subject_tags);
CREATE INDEX IF NOT EXISTS idx_halachot_vec
ON halachot USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
-- #83: halacha_index must be unique per precedent. The extractor assigns it as
-- MAX(halacha_index)+1 under an in-process store-lock + a cross-process advisory
-- lock, so collisions shouldn't occur — but per FireHydrant/OneUptime the
-- constraint is the actual correctness guarantee (the lock is the optimization).
-- A racing/double run now fails LOUDLY instead of silently appending duplicates
-- (the 2026-05/06 over-extraction root cause). Requires clean data first (see
-- scripts: the 6 colliding precedents were renumbered 2026-06-03).
CREATE UNIQUE INDEX IF NOT EXISTS idx_halachot_unique_index
ON halachot(case_law_id, halacha_index);
"""
# ── V8: Extraction request queue ─────────────────────────────────
# Web UI buttons ("Sparkles" = request metadata extraction; "Refresh" =
# request halacha extraction) run inside the FastAPI container, which has
# no `claude` CLI, so they cannot run the LLM extractor directly. Instead
# they stamp a request timestamp here, and the chair (or a maintainer) runs
# the MCP tool `precedent_process_pending_extractions` from local Claude
# Code, where the CLI is available, to drain the queue. See
# claude_session.py for the rule.
#
# The script adds the two nullable `*_requested_at` timestamps to `case_law`
# and a partial index on each, covering only rows with a pending request.
SCHEMA_V8_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS metadata_extraction_requested_at TIMESTAMPTZ;
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_requested_at TIMESTAMPTZ;
CREATE INDEX IF NOT EXISTS idx_case_law_metadata_requested
ON case_law(metadata_extraction_requested_at)
WHERE metadata_extraction_requested_at IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_case_law_halacha_requested
ON case_law(halacha_extraction_requested_at)
WHERE halacha_extraction_requested_at IS NOT NULL;
"""
# ── V9: Multimodal page-image embeddings ─────────────────────────
# voyage-multimodal-3 (1024-dim) embeds the whole page as an image:
# captures table layout, scanned content, signatures, plans — content
# that text-OCR loses. Ingestion is gated by config.MULTIMODAL_ENABLED;
# search_*_hybrid() merge text-cosine + image-cosine when present.
# image_thumbnail_path is a relative path under DATA_DIR/cases/{case}/
# thumbnails/ or DATA_DIR/precedent-library/thumbnails/ — a small JPEG
# rendered at config.MULTIMODAL_THUMB_DPI for UI preview, distinct from
# the higher-DPI render fed to the embedder (which is not persisted).
#
# The script creates two parallel tables, `document_image_embeddings` and
# `precedent_image_embeddings`, each holding one embedding per page (UNIQUE
# on owner id + page_number) with an ivfflat cosine index on the vector.
SCHEMA_V9_SQL = """
CREATE TABLE IF NOT EXISTS document_image_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
page_number INTEGER NOT NULL,
image_thumbnail_path TEXT,
embedding vector(1024),
model_name TEXT DEFAULT 'voyage-multimodal-3',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(document_id, page_number)
);
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_vec
ON document_image_embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 50);
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_doc
ON document_image_embeddings(document_id);
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_case
ON document_image_embeddings(case_id);
CREATE TABLE IF NOT EXISTS precedent_image_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
page_number INTEGER NOT NULL,
image_thumbnail_path TEXT,
embedding vector(1024),
model_name TEXT DEFAULT 'voyage-multimodal-3',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(case_law_id, page_number)
);
CREATE INDEX IF NOT EXISTS idx_prec_img_emb_vec
ON precedent_image_embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 50);
CREATE INDEX IF NOT EXISTS idx_prec_img_emb_case_law
ON precedent_image_embeddings(case_law_id);
"""
# ── V10: chair_name / district columns ────────────────────────────
# Adds chair_name and district (TEXT, default '') to case_law and chair_name
# to cases, plus lookup indexes on case_law: a plain index on source_kind and
# partial indexes on chair_name / district that skip rows still holding the
# empty-string default.
SCHEMA_V10_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS chair_name TEXT DEFAULT '';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS district TEXT DEFAULT '';
ALTER TABLE cases ADD COLUMN IF NOT EXISTS chair_name TEXT DEFAULT '';
CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind);
CREATE INDEX IF NOT EXISTS idx_case_law_chair ON case_law(chair_name) WHERE chair_name <> '';
CREATE INDEX IF NOT EXISTS idx_case_law_district ON case_law(district) WHERE district <> '';
"""
# ── V11: case_law ↔ case_law relations ────────────────────────────
# Link table between two case_law rows; relation_type defaults to
# 'same_case_chain'. Both sides cascade on delete, a row may not relate to
# itself (CHECK), and the UNIQUE constraint is on the ordered pair
# (case_law_id, related_id) — so (A, B) and (B, A) count as distinct rows.
# Each side of the pair has its own index for lookups in either direction.
SCHEMA_V11_SQL = """
CREATE TABLE IF NOT EXISTS case_law_relations (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE,
related_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE,
relation_type TEXT NOT NULL DEFAULT 'same_case_chain',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(case_law_id, related_id),
CHECK (case_law_id <> related_id)
);
CREATE INDEX IF NOT EXISTS idx_clr_a ON case_law_relations(case_law_id);
CREATE INDEX IF NOT EXISTS idx_clr_b ON case_law_relations(related_id);
"""
# ── V12: BM25/lexical search via tsvector ─────────────────────────
# PostgreSQL doesn't ship a Hebrew stemmer; the 'simple' configuration
# lowercases + tokenises on whitespace without stemming — exactly what
# we want for Hebrew. It also preserves alphanumeric tokens like
# "1461/20" (case numbers) which are the prime motivator for adding a
# lexical layer on top of the semantic cosine index.
# Both columns are GENERATED STORED so they stay in sync with the
# source rows for free, and GIN-indexed for ts_rank_cd lookups.
# precedent_chunks.content_tsv is derived from `content` alone;
# halachot.rule_tsv concatenates rule_statement, supporting_quote and
# reasoning_summary, with NULLs coalesced to ''.
SCHEMA_V12_SQL = """
ALTER TABLE precedent_chunks
ADD COLUMN IF NOT EXISTS content_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED;
ALTER TABLE halachot
ADD COLUMN IF NOT EXISTS rule_tsv tsvector
GENERATED ALWAYS AS (
to_tsvector('simple',
coalesce(rule_statement,'') || ' ' ||
coalesce(supporting_quote,'') || ' ' ||
coalesce(reasoning_summary,'')
)
) STORED;
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_tsv
ON precedent_chunks USING GIN(content_tsv);
CREATE INDEX IF NOT EXISTS idx_halachot_tsv
ON halachot USING GIN(rule_tsv);
"""
# ── V13: Missing precedents log ───────────────────────────────────
# Track citations that the parties brought up but which are NOT yet in
# the precedent_library. Created by the researcher (auto or chair)
# whenever a citation can't be found in the corpus; closed by uploading
# the actual decision via internal_decision_upload or
# precedent_library_upload, at which point linked_case_law_id points to
# the new case_law row and status flips to 'closed'.
# cited_by_party and status are restricted by CHECK constraints to the
# values listed in the DDL; status defaults to 'open'. Deleting the citing
# case removes the row (CASCADE), while deleting the citing document or the
# linked case_law row only nulls the reference (SET NULL).
SCHEMA_V13_SQL = """
CREATE TABLE IF NOT EXISTS missing_precedents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
citation TEXT NOT NULL,
case_name TEXT,
cited_in_case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
cited_in_document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
cited_by_party TEXT CHECK (cited_by_party IN (
'appellant', 'respondent', 'committee', 'permit_applicant', 'unknown'
)),
cited_by_party_name TEXT,
legal_topic TEXT,
legal_issue TEXT,
claim_quote TEXT,
status TEXT DEFAULT 'open' CHECK (status IN (
'open', 'uploaded', 'closed', 'irrelevant'
)),
linked_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL,
closed_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
notes TEXT
);
CREATE INDEX IF NOT EXISTS idx_missing_precedents_case
ON missing_precedents(cited_in_case_id);
CREATE INDEX IF NOT EXISTS idx_missing_precedents_status
ON missing_precedents(status);
CREATE INDEX IF NOT EXISTS idx_missing_precedents_citation
ON missing_precedents(citation);
"""
# ── V14: Legal arguments (aggregated propositions) ────────────────
# After ``claims_extractor`` extracts raw propositions (rows in ``claims``)
# the LLM-driven aggregator groups them into ~6-12 distinct legal arguments
# per party. ``legal_arguments`` holds the consolidated argument; the M:M
# join table ``legal_argument_propositions`` links back to the source
# propositions for traceability ("which raw claims feed this argument?").
# ``party`` and ``priority`` are CHECK-constrained to the values listed in
# the DDL (priority defaults to 'substantive'). Join rows are removed
# automatically when either the argument or the claim is deleted.
SCHEMA_V14_SQL = """
CREATE TABLE IF NOT EXISTS legal_arguments (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE,
party TEXT NOT NULL CHECK (party IN (
'appellant', 'respondent', 'committee', 'permit_applicant', 'unknown'
)),
argument_index INTEGER NOT NULL,
argument_title TEXT NOT NULL,
argument_body TEXT NOT NULL,
legal_topic TEXT,
priority TEXT DEFAULT 'substantive' CHECK (priority IN (
'threshold', 'substantive', 'procedural', 'relief'
)),
cited_precedents TEXT[],
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_legal_arguments_case
ON legal_arguments(case_id);
CREATE INDEX IF NOT EXISTS idx_legal_arguments_party
ON legal_arguments(case_id, party);
-- M:M back to ``claims`` (raw propositions).
CREATE TABLE IF NOT EXISTS legal_argument_propositions (
argument_id UUID NOT NULL REFERENCES legal_arguments(id) ON DELETE CASCADE,
claim_id UUID NOT NULL REFERENCES claims(id) ON DELETE CASCADE,
PRIMARY KEY (argument_id, claim_id)
);
"""
# ── V15: proceeding_type ──────────────────────────────────────────
# proceeding_type distinguishes a main appeal proceeding ('ערר') from a
# request for an extension of time ('בל"מ'). It applies both to case_law
# (the corpus) and to cases (live cases). The two proceeding kinds may share
# the same case_number, so uniqueness moves to (case_number,
# proceeding_type). In the corpus only internal_committee rows get a
# populated value; external case law keeps ''.
# The script drops and re-adds its CHECK constraints and re-runs the
# backfill UPDATEs each time it executes; the UPDATEs only touch rows that
# still hold the default value.
SCHEMA_V15_SQL = """
-- ------- case_law (קורפוס) -------
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS proceeding_type TEXT NOT NULL DEFAULT '';
ALTER TABLE case_law DROP CONSTRAINT IF EXISTS case_law_proceeding_type_check;
ALTER TABLE case_law ADD CONSTRAINT case_law_proceeding_type_check
CHECK (proceeding_type IN ('', 'ערר', 'בל"מ'));
-- Backfill לפי appeal_subtype הקיים
UPDATE case_law SET proceeding_type = 'בל"מ'
WHERE source_kind = 'internal_committee' AND proceeding_type = ''
AND appeal_subtype LIKE 'extension_request_%';
UPDATE case_law SET proceeding_type = 'ערר'
WHERE source_kind = 'internal_committee' AND proceeding_type = '';
ALTER TABLE case_law DROP CONSTRAINT IF EXISTS case_law_internal_proceeding_check;
ALTER TABLE case_law ADD CONSTRAINT case_law_internal_proceeding_check
CHECK (source_kind != 'internal_committee' OR proceeding_type IN ('ערר', 'בל"מ'));
-- החלפת UNIQUE(case_number) ב-partial unique לפי source_kind
ALTER TABLE case_law DROP CONSTRAINT IF EXISTS case_law_case_number_key;
DROP INDEX IF EXISTS case_law_case_number_key;
CREATE UNIQUE INDEX IF NOT EXISTS uq_case_law_internal_number_proc
ON case_law (case_number, proceeding_type)
WHERE source_kind = 'internal_committee';
CREATE UNIQUE INDEX IF NOT EXISTS uq_case_law_external_number
ON case_law (case_number)
WHERE source_kind <> 'internal_committee';
-- ------- cases (תיקים חיים) -------
ALTER TABLE cases ADD COLUMN IF NOT EXISTS proceeding_type TEXT NOT NULL DEFAULT 'ערר';
ALTER TABLE cases DROP CONSTRAINT IF EXISTS cases_proceeding_type_check;
ALTER TABLE cases ADD CONSTRAINT cases_proceeding_type_check
CHECK (proceeding_type IN ('ערר', 'בל"מ'));
UPDATE cases SET proceeding_type = 'בל"מ'
WHERE proceeding_type = 'ערר' AND appeal_subtype LIKE 'extension_request_%';
ALTER TABLE cases DROP CONSTRAINT IF EXISTS cases_case_number_key;
DROP INDEX IF EXISTS cases_case_number_key;
CREATE UNIQUE INDEX IF NOT EXISTS uq_cases_number_proc
ON cases (case_number, proceeding_type);
"""
# ── V16: Internal citations graph (TaskMaster #34) ────────────────
# Auto-extracted citation graph between Daphna's (and other internal_committee)
# decisions. When an internal decision cites another committee decision in a
# patterned way ("ונפנה ל…", "כפי שקבעתי…", "ראה החלטתי…" — Hebrew
# cross-reference phrases), the citation extractor records the link here.
# ``cited_case_law_id`` is populated when the cited case_number resolves to a
# row in ``case_law``; otherwise it stays NULL and shows up in
# ``idx_pic_unlinked`` so the chair can decide whether to upload the missing
# decision.
# At most one row exists per (source_case_law_id, cited_case_number);
# ``confidence`` defaults to 0.85.
SCHEMA_V16_SQL = """
CREATE TABLE IF NOT EXISTS precedent_internal_citations (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
source_case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE,
cited_case_number TEXT NOT NULL,
cited_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL,
match_context TEXT,
match_pattern TEXT,
confidence NUMERIC(3,2) DEFAULT 0.85,
created_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE (source_case_law_id, cited_case_number)
);
CREATE INDEX IF NOT EXISTS idx_pic_source
ON precedent_internal_citations(source_case_law_id);
CREATE INDEX IF NOT EXISTS idx_pic_target
ON precedent_internal_citations(cited_case_law_id);
CREATE INDEX IF NOT EXISTS idx_pic_unlinked
ON precedent_internal_citations(cited_case_number)
WHERE cited_case_law_id IS NULL;
"""
# ── V17: Parent-doc retrieval (TaskMaster #48) ─────────────────────
# Hierarchical chunking: tiny "child" chunks (~300 tokens) are indexed
# and matched at search time for high recall on focused phrases, but
# every child links upward to a larger "parent" chunk (~1500 tokens)
# that supplies broader context to the LLM. The retrieval step swaps
# the child hit for its parent before returning rows to callers — so
# rule statements, multi-paragraph quotes, and "אשר על כן…" (the Hebrew
# operative-conclusion formula) passages come back whole instead of
# clipped mid-sentence.
#
# Schema layout:
# parent_chunk_id — self-FK on precedent_chunks. NULL for legacy
# rows (single-tier chunking) and for parent
# rows themselves. ON DELETE SET NULL, so deleting
# a parent keeps its children and only clears
# their link.
# chunk_role — 'child' | 'parent'. Defaults to 'child' so any
# row created by the pre-V17 ingestion path is
# treated as a child without a parent (i.e. the
# parent-doc swap is a no-op and the legacy chunk
# continues to surface as-is).
#
# The role CHECK constraint is added inside a DO block that ignores
# duplicate_object, so re-running the script does not fail once the
# constraint exists.
#
# Activation is gated by ``config.PARENT_DOC_RETRIEVAL_ENABLED``. Even
# after the schema is in place, search keeps the legacy behaviour
# until both the chunker emits hierarchical chunks *and* the flag is
# flipped on — so this migration is safe to apply ahead of time.
SCHEMA_V17_SQL = """
ALTER TABLE precedent_chunks
ADD COLUMN IF NOT EXISTS parent_chunk_id UUID
REFERENCES precedent_chunks(id) ON DELETE SET NULL;
ALTER TABLE precedent_chunks
ADD COLUMN IF NOT EXISTS chunk_role TEXT DEFAULT 'child';
DO $$ BEGIN
ALTER TABLE precedent_chunks ADD CONSTRAINT precedent_chunks_role_check
CHECK (chunk_role IN ('child', 'parent'));
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_parent
ON precedent_chunks(parent_chunk_id);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_role
ON precedent_chunks(chunk_role);
"""
# ── V18: RAG telemetry — closed-loop retrieval feedback (TaskMaster #50)
#
# Captures every semantic search call (query, agent, top results,
# latency) so we can compute nDCG@10 over time and surface drift before
# it bites. Relevance signal comes from two places:
# 1. ``cited_in_decision`` — auto-inferred. If a precedent cited in a
# final draft's ``decision_paragraphs.citations`` also appears in
# the ``top_case_law_ids`` of a search log for the same case, that
# hit is treated as highly relevant (score=3).
# 2. ``chair_marked`` — explicit feedback (future hook for the UI).
#
# ``top_case_law_ids`` is intentionally nullable: ``search_decisions``
# returns document chunks from active cases (not case_law rows), so its
# rows log the query but leave the array empty. nDCG aggregation skips
# those.
#
# ``search_relevance_feedback`` rows are keyed by (search_log_id,
# case_law_id, feedback_source) and are removed when either the log row or
# the case_law row is deleted; ``relevance_score`` is CHECK-limited to 0-3.
SCHEMA_V18_SQL = """
CREATE TABLE IF NOT EXISTS search_logs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
search_type TEXT NOT NULL,
-- 'precedent_library' / 'internal_decisions'
-- / 'decisions' / 'case_documents' / 'similar_cases'
query TEXT NOT NULL,
practice_area TEXT,
case_id UUID REFERENCES cases(id) ON DELETE SET NULL,
user_agent TEXT,
-- 'writer' / 'researcher' / 'analyst' / 'manual' / 'unknown'
result_count INTEGER,
top_case_law_ids UUID[],
-- nullable: empty for search_decisions/search_case_documents
-- which return document chunks not case_law rows
duration_ms INTEGER,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_search_logs_type ON search_logs(search_type);
CREATE INDEX IF NOT EXISTS idx_search_logs_case ON search_logs(case_id);
CREATE INDEX IF NOT EXISTS idx_search_logs_date ON search_logs(created_at DESC);
CREATE TABLE IF NOT EXISTS search_relevance_feedback (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
search_log_id UUID REFERENCES search_logs(id) ON DELETE CASCADE,
case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE,
rank INTEGER NOT NULL,
-- 1-based position in the original results (1 = top hit)
relevance_score INTEGER NOT NULL
CHECK (relevance_score IN (0, 1, 2, 3)),
-- 0=irrelevant, 1=marginal, 2=relevant, 3=highly relevant
feedback_source TEXT,
-- 'cited_in_decision' / 'chair_marked' / 'auto_inferred'
created_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(search_log_id, case_law_id, feedback_source)
);
CREATE INDEX IF NOT EXISTS idx_relevance_log
ON search_relevance_feedback(search_log_id);
CREATE INDEX IF NOT EXISTS idx_relevance_case_law
ON search_relevance_feedback(case_law_id);
"""
# ── V19: case_law.citation_formatted ───────────────────────────────
# Full formal citation per the Israeli unified citation rules ("כללי
# הציטוט האחיד"). Stored as Markdown: parties wrapped in **…** so the
# copy-to-clipboard helper can render bold for Word/Docs while keeping
# the plain-text form readable. The column is TEXT with default ''.
#
# Example:
# ערר (ועדות ערר - תכנון ובנייה ת"א-יפו) 81002-01-21 **אברהם אגסי
# נ' הועדה המקומית לתכנון ובנייה תל אביב** (נבו 25.9.2025)
SCHEMA_V19_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS citation_formatted TEXT DEFAULT '';
"""
# ── V20: case-name / case-number lexical match ────────────────────
# RC-A fix: the V12 tsvectors cover only chunk *content* + halacha
# text, so a bare case-name query ("אגסי") matched decisions that
# *cite* the case rather than the case itself. case_name and
# case_number live on the parent case_law row, so we add a dedicated
# meta tsvector there and OR it into the lexical search — a name/number
# hit then surfaces all of that case's chunks + halachot. 'simple'
# config (no stemmer) preserves Hebrew names + alphanumeric case
# numbers like "81002-01-21" exactly as V12 does for content.
# meta_tsv is a GENERATED STORED column over case_name and case_number
# (NULLs coalesced to ''), backed by a GIN index.
SCHEMA_V20_SQL = """
ALTER TABLE case_law
ADD COLUMN IF NOT EXISTS meta_tsv tsvector
GENERATED ALWAYS AS (
to_tsvector('simple',
coalesce(case_name,'') || ' ' || coalesce(case_number,'')
)
) STORED;
CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
ON case_law USING GIN(meta_tsv);
"""
# ── V21: explicit `searchable` flag (GAP-13 / INV-DM1) ─────────────
# Materialized completeness flag — a case_law row is exposed to search only
# when it satisfies the completeness contract (02-data-model §2a). Recomputed
# on ingest/metadata completion via recompute_searchable(); not inferred at
# query time. Default false so a freshly-inserted row is excluded until proven
# complete. Health-check surfaces count(*) FILTER (WHERE NOT searchable).
# Because the column is added as NOT NULL DEFAULT false, rows that already
# exist when the column is first added also start out as not searchable.
SCHEMA_V21_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS searchable boolean NOT NULL DEFAULT false;
CREATE INDEX IF NOT EXISTS idx_case_law_searchable ON case_law (searchable);
"""
# ── V22: cases.blocks_stale — DOCX↔blocks drift flag (GAP-17 / INV-EX1) ──
# Set true when revise_draft/apply_user_edit make active_draft_path the live
# source-of-truth without re-syncing decision_blocks; cleared when blocks are
# re-exported or re-saved. Surfaced by health-check. Source-of-truth remains
# decision_blocks — this only flags known drift (no fragile DOCX→blocks reparse).
# The flag is written through mark_blocks_stale() in this module.
SCHEMA_V22_SQL = """
ALTER TABLE cases ADD COLUMN IF NOT EXISTS blocks_stale boolean NOT NULL DEFAULT false;
"""
# ── V23: case_law content/indexed hashes — re-index on content change (GAP-09) ──
# content_hash = SHA-256 of current full_text (written at the create boundary).
# indexed_hash = the content_hash the CURRENT chunks/embeddings were built from
# (set by mark_indexed after a successful store). Stale ⇔ content_hash IS
# DISTINCT FROM indexed_hash. embedding can't be a GENERATED column (needs an
# API call), so freshness is enforced by detection + reindex_case_law + health-check.
# content_hash is NOT NULL with default ''; indexed_hash is nullable, so a row
# that has never been indexed carries NULL there.
SCHEMA_V23_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS content_hash text NOT NULL DEFAULT '';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS indexed_hash text;
"""
# ── V24: citation corroboration (X11) ─────────────────────────────
# Adds a ``treatment`` column (TEXT, default '') to the V16
# precedent_internal_citations table, and a halacha-level link table,
# halacha_citation_corroboration, that ties a halacha to the citation that
# corroborates it. One row per (halacha_id, source_citation_id).
# NOTE(review): source_citation_id is NOT NULL but carries no FOREIGN KEY in
# this DDL — presumably it refers to precedent_internal_citations.id; confirm.
SCHEMA_V24_SQL = """
-- X11: citation corroboration (treatment + halacha-level link)
ALTER TABLE precedent_internal_citations
ADD COLUMN IF NOT EXISTS treatment TEXT DEFAULT '';
CREATE TABLE IF NOT EXISTS halacha_citation_corroboration (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE,
citing_case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
citing_decision_id UUID REFERENCES decisions(id) ON DELETE SET NULL,
source_citation_id UUID NOT NULL,
treatment TEXT NOT NULL,
match_score NUMERIC(4,3) DEFAULT 0,
match_context TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE (halacha_id, source_citation_id)
);
CREATE INDEX IF NOT EXISTS idx_hcc_halacha ON halacha_citation_corroboration(halacha_id);
"""
# ── V25: per-chunk halacha extraction checkpoint ──────────────────
# Adds a nullable timestamp to precedent_chunks; per the SQL comment below,
# a non-NULL value marks a chunk as already processed so a resumed
# extraction run can skip it.
SCHEMA_V25_SQL = """
-- Crash-safe halacha extraction: per-chunk checkpoint enables incremental store
-- + resume. A chunk with halacha_extracted_at set has been processed; a resumed
-- run skips it (so a crash never loses completed chunks or re-pays for them).
ALTER TABLE precedent_chunks
ADD COLUMN IF NOT EXISTS halacha_extracted_at TIMESTAMPTZ;
"""
# ── V26: draft_final_pairs — draft vs. signed-final ledger (T5 / INV-LRN4) ──
# One row pairs a snapshot of the AI draft text with the chair's final for a
# case. ``status`` starts at 'final_received'; the SQL comment below lists the
# intended progression (final_received → analyzed → lessons_folded), which is
# not enforced by a CHECK constraint. diff_stats / analysis are JSONB and
# default to NULL. Rows are removed when the owning case is deleted.
SCHEMA_V26_SQL = """
-- draft_final_pairs (T5 / INV-LRN4): the reconciliation ledger.
-- Every decision is "closed" only after it is compared against the chair's signed
-- final. Captures an immutable snapshot of the AI draft at mark-final time (before
-- it can be overwritten), paired with the final. The LLM distillation (curator)
-- fills final_text + diff_stats + analysis later and advances status.
CREATE TABLE IF NOT EXISTS draft_final_pairs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE,
draft_text TEXT NOT NULL DEFAULT '',
final_path TEXT DEFAULT '',
final_text TEXT DEFAULT '',
diff_stats JSONB DEFAULT NULL,
analysis JSONB DEFAULT NULL,
-- final_received → analyzed → lessons_folded
status TEXT NOT NULL DEFAULT 'final_received',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_case ON draft_final_pairs(case_id);
CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_status ON draft_final_pairs(status);
"""
# ── V27: style_exemplars — paragraph-level style corpus (T1-T3) ────
# Stores individual paragraphs with a 1024-dim embedding plus the labels used
# to filter them (source, practice_area, outcome, section). The allowed label
# values are listed only in the inline SQL comments — no CHECK constraint
# enforces them. Indexed by section and by (decision_number, source); this
# script creates no vector index on ``embedding``.
SCHEMA_V27_SQL = """
-- style_exemplars (T1-T3): block-level paragraphs from Dafna's OWN decisions
-- (style_corpus + internal_committee finals), embedded for retrieval as
-- style exemplars at write-time. Purpose-built so we DON'T fabricate synthetic
-- cases just to reuse decision_paragraphs. INV-LRN5: style material only — the
-- writer is told to adapt structure/voice, copy only boilerplate, never substance.
CREATE TABLE IF NOT EXISTS style_exemplars (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
decision_number TEXT DEFAULT '',
source TEXT DEFAULT '', -- style_corpus | internal_committee
practice_area TEXT DEFAULT '',
outcome TEXT DEFAULT '', -- rejection | partial_acceptance | full_acceptance | ''
section TEXT DEFAULT 'other', -- background | claims | discussion | summary | other
paragraph_text TEXT NOT NULL,
word_count INTEGER DEFAULT 0,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_style_exemplars_section ON style_exemplars(section);
CREATE INDEX IF NOT EXISTS idx_style_exemplars_decision ON style_exemplars(decision_number, source);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
    """Execute the base schema and every versioned migration script in order.

    All scripts run one after another on a single connection borrowed from
    *pool*. Order matters: later scripts alter tables that earlier ones
    create (e.g. V24 alters the table created by V16).
    """
    ordered_scripts = (
        SCHEMA_SQL,
        MIGRATIONS_SQL,
        SCHEMA_V2_SQL,
        SCHEMA_V3_SQL,
        SCHEMA_V4_SQL,
        SCHEMA_V5_SQL,
        SCHEMA_V6_SQL,
        SCHEMA_V7_SQL,
        SCHEMA_V8_SQL,
        SCHEMA_V9_SQL,
        SCHEMA_V10_SQL,
        SCHEMA_V11_SQL,
        SCHEMA_V12_SQL,
        SCHEMA_V13_SQL,
        SCHEMA_V14_SQL,
        SCHEMA_V15_SQL,
        SCHEMA_V16_SQL,
        SCHEMA_V17_SQL,
        SCHEMA_V18_SQL,
        SCHEMA_V19_SQL,
        SCHEMA_V20_SQL,
        SCHEMA_V21_SQL,
        SCHEMA_V22_SQL,
        SCHEMA_V23_SQL,
        SCHEMA_V24_SQL,
        SCHEMA_V25_SQL,
        SCHEMA_V26_SQL,
        SCHEMA_V27_SQL,
    )
    async with pool.acquire() as conn:
        for script in ordered_scripts:
            await conn.execute(script)
        logger.info("Database schema initialized (v1-v27)")
async def init_schema() -> None:
    """Ensure the database schema exists (kept for backward compatibility).

    Schema initialisation now happens lazily inside get_pool(); this wrapper
    only triggers that call and discards the returned pool.
    """
    await get_pool()
    return None
# ── Case CRUD ───────────────────────────────────────────────────────
async def create_case(
    case_number: str,
    title: str,
    appellants: list[str] | None = None,
    respondents: list[str] | None = None,
    subject: str = "",
    property_address: str = "",
    permit_number: str = "",
    committee_type: str = "ועדה מקומית",
    hearing_date: date | None = None,
    notes: str = "",
    expected_outcome: str = "",
    # Defaults to "": the DB CHECK constraint accepts an empty value, and the
    # upstream tool (cases.case_create) is responsible for deriving the
    # domain value from the case_number prefix before calling here.
    practice_area: str = "",
    appeal_subtype: str = "",
    proceeding_type: str = "ערר",
) -> dict:
    """Insert a new row into ``cases`` and return it as a dict.

    The case number is passed through _canonical_case_number() before it is
    stored; ``appellants`` / ``respondents`` are serialised to JSON text
    (``None`` becomes an empty list). A fresh UUID is generated client-side
    and the stored row is then re-read via get_case().
    """
    new_id = uuid4()
    insert_sql = """INSERT INTO cases (id, case_number, title, appellants, respondents,
subject, property_address, permit_number, committee_type,
hearing_date, notes, expected_outcome,
practice_area, appeal_subtype, proceeding_type)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)"""
    params = (
        new_id,
        _canonical_case_number(case_number),
        title,
        json.dumps(appellants or []),
        json.dumps(respondents or []),
        subject,
        property_address,
        permit_number,
        committee_type,
        hearing_date,
        notes,
        expected_outcome,
        practice_area,
        appeal_subtype,
        proceeding_type,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(insert_sql, *params)
    return await get_case(new_id)
async def get_case(case_id: UUID) -> dict | None:
    """Fetch a single case by primary key.

    Returns the row converted by _row_to_case(), or None when no row has
    that id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id)
    return None if record is None else _row_to_case(record)
async def set_active_draft_path(case_id: UUID, path: str | None) -> None:
    """Store *path* as the case's active_draft_path and bump updated_at.

    The active draft is the DOCX treated as the source of truth; passing
    ``None`` writes NULL to the column.
    """
    update_sql = "UPDATE cases SET active_draft_path = $1, updated_at = now() WHERE id = $2"
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(update_sql, path, case_id)
async def get_active_draft_path(case_id: UUID) -> str | None:
    """Return the case's active_draft_path.

    Returns None both when the case does not exist and when the stored
    column value is NULL.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            "SELECT active_draft_path FROM cases WHERE id = $1", case_id,
        )
    if not record:
        return None
    return record["active_draft_path"]
async def mark_blocks_stale(case_id: UUID, stale: bool) -> None:
    """Set or clear the DOCX↔blocks drift flag of a case (GAP-17).

    Writes *stale* to ``cases.blocks_stale`` and bumps ``updated_at``.
    """
    update_sql = "UPDATE cases SET blocks_stale = $1, updated_at = now() WHERE id = $2"
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(update_sql, stale, case_id)
async def resolve_citation_case_law_ids(ids) -> dict:
    """Structural citation→corpus resolution (GAP-20 / INV-AUD3).

    Given case_law_id values referenced by a decision's citations/provenance,
    split them into resolvable (a case_law row with that id exists) and
    unresolvable.

    Args:
        ids: iterable of UUIDs or UUID strings. Any value that cannot be
            parsed as a UUID is classified as unresolved without touching
            the database.

    Returns:
        ``{"resolved": [...], "unresolved": [...]}`` holding the original
        input values, in input order (duplicates are kept).

    All valid ids are checked with a single query instead of one round-trip
    per id. If that query fails, the error is logged and every id is
    reported as unresolved — the function stays best-effort, but the
    failure is no longer silent.
    """
    candidates = list(ids)

    # Parse locally so malformed ids never reach the driver.
    parsed: list[UUID | None] = []
    for cid in candidates:
        try:
            parsed.append(cid if isinstance(cid, UUID) else UUID(str(cid)))
        except ValueError:
            parsed.append(None)

    lookup = list({pid for pid in parsed if pid is not None})
    found: set = set()
    if lookup:
        pool = await get_pool()
        try:
            async with pool.acquire() as conn:
                rows = await conn.fetch(
                    "SELECT id FROM case_law WHERE id = ANY($1::uuid[])", lookup)
            found = {row["id"] for row in rows}
        except Exception:
            # Best-effort contract: report as unresolved, but leave a trace.
            logger.exception(
                "resolve_citation_case_law_ids: case_law lookup failed; "
                "treating %d id(s) as unresolved", len(lookup))

    resolved, unresolved = [], []
    for cid, pid in zip(candidates, parsed):
        (resolved if pid is not None and pid in found else unresolved).append(cid)
    return {"resolved": resolved, "unresolved": unresolved}
def _normalize_case_number(s: str) -> str:
    """Canonicalise a case number for tolerant lookup.

    Agents receive the number in many shapes — from a Paperclip issue
    title ("ערר 8137/24"), with a slash instead of a dash, padded, or with
    surrounding whitespace — while stored values are bare ("8137-24").
    Without this step get_case_by_number's exact match silently fails and
    the agent concludes the case has no documents (see #58).

    Strategy: trim, drop any leading proceeding-type prefix (everything
    before the first digit; a string without digits is left as is), trim
    again, and unify '/' → '-'. ``None`` or empty input yields "".
    """
    text = (s or "").strip()
    first_digit = re.search(r"\d", text)
    start = first_digit.start() if first_digit else 0
    return text[start:].strip().replace("/", "-")
def _canonical_case_number(s: str) -> str:
    """Return the canonical write-time form of a case number (X1 §1).

    Steps: trim · strip any prefix before the first digit · '/' → '-'.
    The transformation is deterministic and format-only — it does NOT add or
    remove a month segment. It is used at the write boundary for
    identifier-keyed corpora (internal committee decisions, active cases),
    NOT for external precedents, whose canonical identifier is the full
    citation. ``None`` or empty input yields "".
    """
    value = (s or "").strip()
    hit = re.search(r"\d", value)
    without_prefix = value if hit is None else value[hit.start():]
    return without_prefix.strip().replace("/", "-")
def _content_hash(text: str) -> str:
    """Return the SHA-256 hex digest of *text* encoded as UTF-8 (FU-3/GAP-09).

    This is the deterministic content fingerprint. Empty or ``None`` input
    returns "" — a row without text has no fingerprint.
    """
    if text:
        return hashlib.sha256(text.encode("utf-8")).hexdigest()
    return ""
async def get_case_by_number(case_number: str) -> dict | None:
    """Look a case up by its number, tolerating common formatting variants.

    The query accepts either an exact match on the raw input or a match
    between the normalised input (see _normalize_case_number) and the stored
    value after trimming and replacing '/' with '-'. Exact matches sort
    first, then the oldest row by created_at; only one row is returned.
    Returns None when nothing matches. See #58.
    """
    lookup_sql = """SELECT * FROM cases
WHERE case_number = $1
OR replace(btrim(case_number), '/', '-') = $2
ORDER BY (case_number = $1) DESC, created_at
LIMIT 1"""
    pool = await get_pool()
    normalised = _normalize_case_number(case_number)
    async with pool.acquire() as conn:
        record = await conn.fetchrow(lookup_sql, case_number, normalised)
    return None if record is None else _row_to_case(record)
async def list_cases(
    status: str | None = None,
    limit: int = 50,
    include_archived: bool = False,
    archived_only: bool = False,
) -> list[dict]:
    """List cases, most recently updated first.

    Args:
        status: when truthy, keep only cases with this status.
        limit: maximum number of rows to return.
        include_archived: when False (default), archived cases are hidden.
        archived_only: when True, return only archived cases; this takes
            precedence over ``include_archived``.
    """
    conditions: list[str] = []
    params: list = []
    if status:
        params.append(status)
        conditions.append(f"status = ${len(params)}")
    if archived_only:
        conditions.append("archived_at IS NOT NULL")
    elif not include_archived:
        conditions.append("archived_at IS NULL")

    where_clause = ("WHERE " + " AND ".join(conditions)) if conditions else ""
    # LIMIT is always the last positional parameter.
    params.append(limit)
    sql = f"SELECT * FROM cases {where_clause} ORDER BY updated_at DESC LIMIT ${len(params)}"

    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *params)
    return [_row_to_case(record) for record in records]
async def update_case(case_id: UUID, **fields) -> dict | None:
    """Update the given columns of a case and return the refreshed row.

    Args:
        case_id: primary key of the case.
        **fields: column name → new value. ``appellants``, ``respondents``
            and ``tags`` are serialised to JSON text before binding.

    Returns:
        The case as returned by get_case() (None if it does not exist).
        With no fields, the case is returned unchanged.

    Raises:
        ValueError: if a field name is not a plain ASCII identifier. Column
            names cannot be sent as bind parameters and are interpolated
            into the statement, so anything else is rejected up front to
            rule out SQL injection through the keys.
    """
    if not fields:
        return await get_case(case_id)

    invalid = [name for name in fields if not (name.isascii() and name.isidentifier())]
    if invalid:
        raise ValueError(f"update_case: invalid column name(s): {invalid!r}")

    assignments = []
    params = []
    # $1 is reserved for the id in the WHERE clause, so values start at $2.
    for placeholder, (column, value) in enumerate(fields.items(), start=2):
        if column in ("appellants", "respondents", "tags"):
            value = json.dumps(value)
        assignments.append(f"{column} = ${placeholder}")
        params.append(value)
    assignments.append("updated_at = now()")
    sql = f"UPDATE cases SET {', '.join(assignments)} WHERE id = $1"

    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(sql, case_id, *params)
    return await get_case(case_id)
def _row_to_case(row: asyncpg.Record) -> dict:
    """Convert a ``cases`` row into a plain dict.

    ``appellants``, ``respondents`` and ``tags`` are JSON-decoded when they
    arrive as strings (other values are left as they are), and ``id`` is
    converted to ``str``.
    """
    case = dict(row)
    for name in ("appellants", "respondents", "tags"):
        raw = case.get(name)
        if isinstance(raw, str):
            case[name] = json.loads(raw)
    case["id"] = str(case["id"])
    return case
async def archive_case(case_id: UUID) -> dict | None:
    """Archive a case by stamping ``archived_at`` (and ``updated_at``) with now().

    Returns the updated row as a dict, or None when the case does not exist.
    """
    archive_sql = (
        "UPDATE cases SET archived_at = now(), updated_at = now() "
        "WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(archive_sql, case_id)
    if not record:
        return None
    return _row_to_case(record)
async def restore_case(case_id: UUID) -> dict | None:
    """Un-archive a case by resetting ``archived_at`` to NULL.

    ``updated_at`` is bumped as well. Returns the updated row as a dict, or
    None when the case does not exist.
    """
    restore_sql = (
        "UPDATE cases SET archived_at = NULL, updated_at = now() "
        "WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(restore_sql, case_id)
    if not record:
        return None
    return _row_to_case(record)
async def delete_case(case_id: UUID) -> bool:
    """Delete a case row; return True only if a row was actually removed.

    Dependent rows are handled by the schema's FK constraints (declared
    elsewhere in this module):
    • CASCADE: documents, document_chunks, claims, appraiser_facts,
      decisions, qa_results, case_precedents
    • SET NULL: audit_log.case_id, chair_feedback.case_id

    NOTE: only the legal-ai database is touched. The Paperclip project
    (issues, comments, runs) and the Gitea repo for the case live in other
    systems and are NOT cleaned up here — call sites that need a full reset
    must handle those separately.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status_tag = await conn.execute("DELETE FROM cases WHERE id = $1", case_id)
    # asyncpg's execute() returns the command tag "DELETE <n>"; the last
    # token is the number of deleted rows.
    deleted_rows = int(status_tag.rsplit(maxsplit=1)[-1])
    return deleted_rows > 0
# ── Document CRUD ───────────────────────────────────────────────────
async def create_document(
    case_id: UUID,
    doc_type: str,
    title: str,
    file_path: str,
    page_count: int | None = None,
    content_hash: str = "",
) -> dict:
    """Insert a document row for a case and return the stored row as a dict.

    A fresh UUID is generated client-side; the row is read back on the same
    connection right after the insert.
    """
    new_id = uuid4()
    insert_sql = """INSERT INTO documents (id, case_id, doc_type, title, file_path, page_count, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7)"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            insert_sql,
            new_id, case_id, doc_type, title, file_path, page_count, content_hash,
        )
        stored = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", new_id)
    return _row_to_doc(stored)
async def get_document_by_hash(case_id: UUID, content_hash: str) -> dict | None:
    """Find a document of this case that has the given file hash.

    INV-TOOL3 / GAP-52: the hash is the deterministic key that makes uploads
    idempotent. An empty hash never matches anything (legacy rows carry an
    empty hash), in which case None is returned without querying the
    database. Also returns None when no document matches.
    """
    if content_hash:
        pool = await get_pool()
        async with pool.acquire() as conn:
            record = await conn.fetchrow(
                "SELECT * FROM documents WHERE case_id = $1 AND content_hash = $2 LIMIT 1",
                case_id, content_hash,
            )
        if record:
            return _row_to_doc(record)
    return None
async def update_document(doc_id: UUID, **fields) -> None:
    """Update the given columns of a document row.

    Args:
        doc_id: primary key of the document.
        **fields: column name → new value. ``metadata`` is serialised to
            JSON text before binding. With no fields the call is a no-op.

    Raises:
        ValueError: if a field name is not a plain ASCII identifier. Column
            names cannot be sent as bind parameters and are interpolated
            into the statement, so anything else is rejected up front to
            rule out SQL injection through the keys.
    """
    if not fields:
        return

    invalid = [name for name in fields if not (name.isascii() and name.isidentifier())]
    if invalid:
        raise ValueError(f"update_document: invalid column name(s): {invalid!r}")

    assignments = []
    params = []
    # $1 is reserved for the id in the WHERE clause, so values start at $2.
    for placeholder, (column, value) in enumerate(fields.items(), start=2):
        if column == "metadata":
            value = json.dumps(value)
        assignments.append(f"{column} = ${placeholder}")
        params.append(value)
    sql = f"UPDATE documents SET {', '.join(assignments)} WHERE id = $1"

    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(sql, doc_id, *params)
async def get_document(doc_id: UUID) -> dict | None:
    """Fetch one document by primary key; None when it does not exist."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id)
    if not record:
        return None
    return _row_to_doc(record)
async def list_documents(case_id: UUID) -> list[dict]:
    """Return every document of a case, ordered by ``created_at`` ascending."""
    query = "SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at"
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, case_id)
    return list(map(_row_to_doc, records))
async def get_document_text(doc_id: UUID) -> str:
    """Return the extracted text of a document.

    Always returns a ``str``: the empty string is returned both when the
    document does not exist and when its ``extracted_text`` column is NULL.
    Previously a NULL column leaked ``None`` to callers despite the ``str``
    annotation.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # fetchval yields None for "no row" as well as for a NULL column.
        text = await conn.fetchval(
            "SELECT extracted_text FROM documents WHERE id = $1", doc_id
        )
    return text or ""
def _row_to_doc(row: asyncpg.Record) -> dict:
    """Convert a ``documents`` record into a plain dict.

    ``id`` and ``case_id`` are stringified. ``metadata`` is decoded with
    ``json.loads`` when it arrives as a string; otherwise it is left as-is.
    """
    doc = dict(row)
    doc.update(id=str(doc["id"]), case_id=str(doc["case_id"]))
    meta = doc.get("metadata")
    if isinstance(meta, str):
        doc["metadata"] = json.loads(meta)
    return doc
# ── Claims ─────────────────────────────────────────────────────────
async def store_claims(case_id: UUID, claims: list[dict], source_document: str = "") -> int:
    """Store extracted claims, replacing existing claims from the same source.

    Each claim dict needs ``party_role`` and ``claim_text``. Optional keys are
    ``party_name`` (default ``""``), ``claim_index`` (default ``0``) and
    ``claim_type`` (default ``"claim"``).

    When ``source_document`` is non-empty, claims previously stored for that
    (case, source) pair are deleted first. The delete and all inserts run in
    one transaction, so a failed insert no longer leaves the case with its old
    claims gone and only some of the new ones written.

    Returns the number of claims passed in.

    Raises:
        KeyError: if a claim lacks a required key (raised before any DB write).
    """
    # Build every parameter tuple up front so malformed input fails early.
    rows = [
        (
            case_id,
            claim["party_role"],
            claim.get("party_name", ""),
            claim["claim_text"],
            claim.get("claim_index", 0),
            source_document,
            claim.get("claim_type", "claim"),
        )
        for claim in claims
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            if source_document:
                await conn.execute(
                    "DELETE FROM claims WHERE case_id = $1 AND source_document = $2",
                    case_id, source_document,
                )
            if rows:
                await conn.executemany(
                    "INSERT INTO claims (case_id, party_role, party_name, claim_text, "
                    "claim_index, source_document, claim_type) "
                    "VALUES ($1, $2, $3, $4, $5, $6, $7)",
                    rows,
                )
    return len(claims)
async def get_claims(case_id: UUID, party_role: str | None = None) -> list[dict]:
    """Return the claims of a case as dicts.

    With a truthy ``party_role`` only that party's claims are returned,
    ordered by ``claim_index``. Otherwise all claims are returned, ordered by
    ``party_role`` then ``claim_index``.
    """
    if party_role:
        query = "SELECT * FROM claims WHERE case_id = $1 AND party_role = $2 ORDER BY claim_index"
        args = (case_id, party_role)
    else:
        query = "SELECT * FROM claims WHERE case_id = $1 ORDER BY party_role, claim_index"
        args = (case_id,)
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *args)
    return [dict(rec) for rec in records]
# ── Decisions ──────────────────────────────────────────────────────
async def create_decision(
    case_id: UUID,
    outcome: str = "",
    outcome_summary: str = "",
    outcome_reasoning: str = "",
    direction_doc: dict | None = None,
) -> dict:
    """Create a new decision version for a case and return the stored record.

    The version is one more than the highest existing version for the case,
    or 1 when the case has no decision yet. A truthy ``direction_doc`` is
    stored as a JSON string; a falsy one is stored as NULL. The result is
    loaded through ``get_decision``.
    """
    insert_sql = (
        "INSERT INTO decisions (id, case_id, version, outcome, outcome_summary,\n"
        "outcome_reasoning, direction_doc)\n"
        "VALUES ($1, $2, $3, $4, $5, $6, $7)"
    )
    direction_json = json.dumps(direction_doc) if direction_doc else None
    pool = await get_pool()
    new_id = uuid4()
    async with pool.acquire() as conn:
        # Look up the most recent version of this case's decision, if any.
        latest = await conn.fetchrow(
            "SELECT id, version FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1",
            case_id,
        )
        if latest:
            next_version = latest["version"] + 1
        else:
            next_version = 1
        await conn.execute(
            insert_sql,
            new_id, case_id, next_version, outcome, outcome_summary,
            outcome_reasoning, direction_json,
        )
    return await get_decision(new_id)
async def get_decision(decision_id: UUID) -> dict | None:
    """Fetch a decision by id as a plain dict, or ``None`` if it does not exist.

    ``id`` and ``case_id`` are stringified. ``direction_doc`` and
    ``panel_members`` are JSON-decoded when they arrive as strings.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM decisions WHERE id = $1", decision_id)
    if record:
        decision = dict(record)
        for key in ("id", "case_id"):
            decision[key] = str(decision[key])
        for key in ("direction_doc", "panel_members"):
            raw = decision.get(key)
            if isinstance(raw, str):
                decision[key] = json.loads(raw)
        return decision
    return None
async def get_decision_by_case(case_id: UUID) -> dict | None:
    """Return the highest-version decision of a case, or ``None`` if it has none.

    ``id`` and ``case_id`` are stringified. ``direction_doc`` and
    ``panel_members`` are JSON-decoded when they arrive as strings.
    """
    query = "SELECT * FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, case_id)
    if record:
        decision = dict(record)
        for key in ("id", "case_id"):
            decision[key] = str(decision[key])
        for key in ("direction_doc", "panel_members"):
            raw = decision.get(key)
            if isinstance(raw, str):
                decision[key] = json.loads(raw)
        return decision
    return None
async def get_critical_qa_failures(case_id: UUID) -> list[dict]:
    """Return the failed critical-severity QA checks recorded for a case.

    ``qa_results`` is cleared and rewritten on every ``validate_decision``
    run, so the rows currently stored for a ``case_id`` are the latest run.
    Only rows with ``severity='critical' AND passed=false`` are returned,
    ordered by ``check_name``. An empty list does not by itself mean QA ran;
    callers use ``qa_run_exists`` to tell "no failures" from "no run yet".
    """
    query = (
        "SELECT check_name, severity, passed, errors\n"
        "FROM qa_results\n"
        "WHERE case_id = $1 AND severity = 'critical' AND passed = false\n"
        "ORDER BY check_name"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        failures = await conn.fetch(query, case_id)
    return list(map(dict, failures))
async def qa_run_exists(case_id: UUID) -> bool:
    """Report whether any ``qa_results`` row exists for this case."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row_count = await conn.fetchval(
            "SELECT count(*) FROM qa_results WHERE case_id = $1",
            case_id,
        )
    return True if row_count else False
async def update_decision(decision_id: UUID, **fields) -> None:
    """Update the given columns of one ``decisions`` row and bump ``updated_at``.

    Each keyword is a column name and its value the new column value.
    ``direction_doc`` and ``panel_members`` are serialised with ``json.dumps``
    when given as a dict or list. Calling with no keywords is a no-op, and
    ``updated_at`` is not touched in that case.

    Raises:
        ValueError: if a keyword is not a plain identifier. Column names are
            interpolated into the SQL text (only values are bound as
            parameters), so anything else is rejected before touching the DB.
    """
    if not fields:
        return
    invalid = [name for name in fields if not name.isidentifier()]
    if invalid:
        raise ValueError(f"invalid column name(s) for decisions: {invalid!r}")

    assignments = []
    values = []
    # $1 is reserved for the row id, so column placeholders start at $2.
    for position, (column, value) in enumerate(fields.items(), start=2):
        if column in ("direction_doc", "panel_members") and isinstance(value, (dict, list)):
            value = json.dumps(value)
        assignments.append(f"{column} = ${position}")
        values.append(value)
    assignments.append("updated_at = now()")
    sql = f"UPDATE decisions SET {', '.join(assignments)} WHERE id = $1"

    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(sql, decision_id, *values)
# ── Document deletion ──────────────────────────────────────────────
async def delete_document(doc_id: UUID) -> bool:
    """Delete a document together with its chunks in a single transaction.

    Returns ``True`` when a ``documents`` row was actually removed.
    """
    pool = await get_pool()
    async with pool.acquire() as conn, conn.transaction():
        await conn.execute(
            "DELETE FROM document_chunks WHERE document_id = $1", doc_id
        )
        status = await conn.execute(
            "DELETE FROM documents WHERE id = $1", doc_id
        )
    # The status tag has the form "DELETE <n>"; the last token is the count.
    removed = int(status.split()[-1])
    return removed > 0
# ── Chunks & Vectors ───────────────────────────────────────────────
async def delete_document_chunks(document_id: UUID) -> int:
    """Remove every chunk of a document (used before reprocessing).

    Returns the number of rows deleted, parsed from the status tag.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM document_chunks WHERE document_id = $1", document_id
        )
    # Status tag looks like "DELETE 5"; the trailing token is the count.
    tokens = status.split()
    return int(tokens[-1])
async def store_chunks(
    document_id: UUID,
    case_id: UUID | None,
    chunks: list[dict],
) -> int:
    """Replace the stored chunks of a document with ``chunks``.

    Each chunk dict needs ``chunk_index``, ``content`` and ``embedding``
    (``list[float]``). Optional keys are ``section_type`` (default
    ``"other"``) and ``page_number`` (default ``None``).

    The delete of the old chunks and the insert of the new ones run in one
    transaction, so a failure part-way through rolls back to the previous
    chunk set instead of leaving the document with partial or no chunks.

    Returns the number of chunks passed in.

    Raises:
        KeyError: if a chunk lacks a required key (raised before any DB write).
    """
    # Build every parameter tuple up front so malformed input fails early.
    rows = [
        (
            document_id,
            case_id,
            chunk["chunk_index"],
            chunk["content"],
            chunk.get("section_type", "other"),
            chunk["embedding"],
            chunk.get("page_number"),
        )
        for chunk in chunks
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM document_chunks WHERE document_id = $1", document_id
            )
            if rows:
                await conn.executemany(
                    "INSERT INTO document_chunks "
                    "(document_id, case_id, chunk_index, content, section_type, "
                    "embedding, page_number) "
                    "VALUES ($1, $2, $3, $4, $5, $6, $7)",
                    rows,
                )
    return len(chunks)
async def search_similar(
    query_embedding: list[float],
    limit: int = 10,
    case_id: UUID | None = None,
    section_type: str | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Rank document chunks by cosine similarity to ``query_embedding``.

    Every truthy filter adds an equality condition; all conditions are ANDed.
    ``score`` is ``1 - (embedding <=> query)``. Chunks are inner-joined to
    ``documents`` and ``cases``, so a chunk without a matching case row is
    never returned.
    """
    pool = await get_pool()
    optional_filters = (
        ("dc.case_id", case_id),
        ("dc.section_type", section_type),
        ("c.practice_area", practice_area),
        ("c.appeal_subtype", appeal_subtype),
    )
    # $1 = query embedding, $2 = limit; filters take $3 onwards.
    params: list = [query_embedding, limit]
    conditions = []
    for column, wanted in optional_filters:
        if wanted:
            params.append(wanted)
            conditions.append(f"{column} = ${len(params)}")
    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    sql = (
        "\n"
        "SELECT dc.content, dc.section_type, dc.page_number,\n"
        "dc.document_id, dc.case_id,\n"
        "d.title AS document_title,\n"
        "c.case_number,\n"
        "1 - (dc.embedding <=> $1) AS score\n"
        "FROM document_chunks dc\n"
        "JOIN documents d ON d.id = dc.document_id\n"
        "JOIN cases c ON c.id = dc.case_id\n"
        f"{where}\n"
        "ORDER BY dc.embedding <=> $1\n"
        "LIMIT $2\n"
    )
    async with pool.acquire() as conn:
        hits = await conn.fetch(sql, *params)
    return [dict(hit) for hit in hits]
# ── Style corpus ────────────────────────────────────────────────────
async def add_to_style_corpus(
    document_id: UUID | None,
    decision_number: str,
    decision_date: date | None,
    subject_categories: list[str],
    full_text: str,
    summary: str = "",
    outcome: str = "",
    key_principles: list[str] | None = None,
    practice_area: str = "appeals_committee",
    appeal_subtype: str = "",
) -> UUID:
    """Insert a decision into ``style_corpus`` and return the new row id.

    ``subject_categories`` and ``key_principles`` are stored as JSON strings;
    a missing ``key_principles`` is stored as an empty JSON list.
    """
    insert_sql = (
        "INSERT INTO style_corpus\n"
        "(id, document_id, decision_number, decision_date,\n"
        "subject_categories, full_text, summary, outcome, key_principles,\n"
        "practice_area, appeal_subtype)\n"
        "VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)"
    )
    pool = await get_pool()
    new_id = uuid4()
    categories_json = json.dumps(subject_categories)
    principles_json = json.dumps(key_principles or [])
    async with pool.acquire() as conn:
        await conn.execute(
            insert_sql,
            new_id, document_id, decision_number, decision_date,
            categories_json, full_text, summary, outcome,
            principles_json,
            practice_area, appeal_subtype,
        )
    return new_id
async def delete_from_style_corpus(corpus_id: UUID) -> dict:
    """Remove a decision from ``style_corpus`` and its associated document.

    Everything runs in one transaction. If the corpus row links a
    ``document_id``, that document is deleted. Otherwise, because the current
    training pipeline inserts ``style_corpus`` rows with ``document_id=NULL``,
    a best-effort lookup is made for a case-less ([קורפוס]) document whose
    title contains the decision number; it is deleted only when exactly one
    document matches.

    Returns ``{"deleted": False, "reason": "not found"}`` when the corpus row
    does not exist, else ``{"deleted": True, "decision_number", "docs_deleted"}``.
    NOTE(review): chunk removal presumably relies on an FK cascade from
    ``documents`` - confirm against the schema.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            removed = await conn.fetchrow(
                "DELETE FROM style_corpus WHERE id = $1 "
                "RETURNING decision_number, document_id",
                corpus_id,
            )
            if not removed:
                return {"deleted": False, "reason": "not found"}

            decision_number = removed["decision_number"]
            target_doc = None
            if removed["document_id"]:
                target_doc = removed["document_id"]
            elif decision_number:
                # Title match is a heuristic; act only on an unambiguous hit.
                candidates = await conn.fetch(
                    "SELECT id FROM documents "
                    "WHERE case_id IS NULL AND title LIKE $1",
                    f"%{decision_number}%",
                )
                if len(candidates) == 1:
                    target_doc = candidates[0]["id"]

            docs_deleted = 0
            if target_doc is not None:
                await conn.execute(
                    "DELETE FROM documents WHERE id = $1", target_doc
                )
                docs_deleted = 1
    return {
        "deleted": True,
        "decision_number": decision_number,
        "docs_deleted": docs_deleted,
    }
async def get_style_corpus_row(corpus_id: UUID) -> dict | None:
    """Load one ``style_corpus`` row by id; ``None`` when it does not exist."""
    query = (
        "\n"
        "SELECT id, document_id, decision_number, decision_date,\n"
        "subject_categories, full_text, summary, outcome,\n"
        "key_principles, practice_area, appeal_subtype, created_at\n"
        "FROM style_corpus WHERE id = $1\n"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, corpus_id)
    if record:
        return dict(record)
    return None
async def update_style_corpus_metadata(
    corpus_id: UUID,
    *,
    summary: str | None = None,
    outcome: str | None = None,
    key_principles: list[str] | None = None,
    appeal_subtype: str | None = None,
    practice_area: str | None = None,
    overwrite: bool = False,
) -> dict:
    """Patch the enriched-metadata columns of a ``style_corpus`` row.

    Arguments left as ``None`` are ignored. By default a column is written
    only when its stored value is empty (blank text, or an empty/undecodable
    ``key_principles`` list). ``overwrite=True`` is the caller's explicit
    signal to replace existing values (used by the manual re-extract flow).

    Returns ``{"updated": True, "fields": [...]}`` on a write,
    ``{"updated": False, "reason": "not found"}`` for an unknown id, or
    ``{"updated": False, "reason": "nothing to update", "fields": []}``.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        stored = await conn.fetchrow(
            "SELECT summary, outcome, key_principles, appeal_subtype, practice_area "
            "FROM style_corpus WHERE id = $1",
            corpus_id,
        )
        if not stored:
            return {"updated": False, "reason": "not found"}

        def may_write_text(column: str) -> bool:
            # Writable when forced, or when the stored text is NULL/blank.
            return overwrite or not (stored[column] or "").strip()

        changes: dict = {}
        if summary is not None and may_write_text("summary"):
            changes["summary"] = summary
        if outcome is not None and may_write_text("outcome"):
            changes["outcome"] = outcome
        if key_principles is not None:
            existing_principles = stored["key_principles"]
            if isinstance(existing_principles, str):
                try:
                    existing_principles = json.loads(existing_principles)
                except json.JSONDecodeError:
                    existing_principles = []
            if overwrite or not (existing_principles or []):
                changes["key_principles"] = json.dumps(key_principles)
        if appeal_subtype is not None and may_write_text("appeal_subtype"):
            changes["appeal_subtype"] = appeal_subtype
        if practice_area is not None and may_write_text("practice_area"):
            changes["practice_area"] = practice_area

        if not changes:
            return {"updated": False, "reason": "nothing to update", "fields": []}

        columns = list(changes)
        # $1 is the row id; column values are bound from $2 onwards.
        assignments = ", ".join(
            f"{column} = ${position}" for position, column in enumerate(columns, start=2)
        )
        await conn.execute(
            f"UPDATE style_corpus SET {assignments} WHERE id = $1",
            corpus_id, *changes.values(),
        )
    return {"updated": True, "fields": columns}
# ── decision_lessons (per-corpus row notes) ────────────────────────
async def list_decision_lessons(corpus_id: UUID) -> list[dict]:
    """Return the lessons attached to one ``style_corpus`` row, newest first."""
    query = (
        "SELECT id, style_corpus_id, lesson_text, category, source, "
        " applied_to_skill, created_by, created_at, updated_at "
        "FROM decision_lessons WHERE style_corpus_id = $1 "
        "ORDER BY created_at DESC"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        lessons = await conn.fetch(query, corpus_id)
    return list(map(dict, lessons))
async def add_decision_lesson(
    corpus_id: UUID,
    *,
    lesson_text: str,
    category: str = "general",
    source: str = "manual",
    created_by: str = "chaim",
) -> dict:
    """Attach a lesson to a ``style_corpus`` row and return the inserted row.

    Returns an empty dict if the INSERT yields no row.
    """
    insert_sql = (
        "INSERT INTO decision_lessons "
        "(style_corpus_id, lesson_text, category, source, created_by) "
        "VALUES ($1, $2, $3, $4, $5) "
        "RETURNING id, style_corpus_id, lesson_text, category, source, "
        " applied_to_skill, created_by, created_at, updated_at"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        inserted = await conn.fetchrow(
            insert_sql,
            corpus_id, lesson_text, category, source, created_by,
        )
    if not inserted:
        return {}
    return dict(inserted)
async def update_decision_lesson(
    lesson_id: UUID,
    *,
    lesson_text: str | None = None,
    category: str | None = None,
    applied_to_skill: bool | None = None,
) -> dict:
    """Update selected fields of a decision lesson and refresh ``updated_at``.

    Arguments left as ``None`` are not modified. When nothing is supplied the
    database is not touched.

    Returns:
        ``{"updated": False, "reason": "nothing to update"}`` when no field
        was given, ``{"updated": False, "reason": "not found"}`` when no row
        has this id, otherwise ``{"updated": True, ...}`` merged with the
        columns returned by the UPDATE.
    """
    requested = {
        "lesson_text": lesson_text,
        "category": category,
        "applied_to_skill": applied_to_skill,
    }
    sets = {column: value for column, value in requested.items() if value is not None}
    if not sets:
        return {"updated": False, "reason": "nothing to update"}

    cols = list(sets)
    # $1 is the lesson id; values bind from $2. updated_at is set inline by
    # the server clock rather than bound as a parameter.
    set_clause = ", ".join(f"{c} = ${i}" for i, c in enumerate(cols, start=2))
    set_clause += ", updated_at = now()"
    values = [sets[c] for c in cols]

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            f"UPDATE decision_lessons SET {set_clause} WHERE id = $1 "
            f"RETURNING id, style_corpus_id, lesson_text, category, source, "
            f" applied_to_skill, updated_at",
            lesson_id, *values,
        )
    if not row:
        return {"updated": False, "reason": "not found"}
    return {"updated": True, **dict(row)}
async def delete_decision_lesson(lesson_id: UUID) -> dict:
    """Delete one decision lesson; returns ``{"deleted": bool}``."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM decision_lessons WHERE id = $1", lesson_id,
        )
    # The status tag is "DELETE n"; everything after the first space is n.
    _, separator, remainder = status.partition(" ")
    count_text = remainder.strip() if separator else "0"
    return {"deleted": count_text != "0"}
async def count_decision_lessons_per_corpus() -> dict[str, int]:
    """Map each ``style_corpus`` id (as str) to its number of lessons.

    Used for badge display in the list. Corpus rows without lessons do not
    appear in the result.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        grouped = await conn.fetch(
            "SELECT style_corpus_id, count(*) AS n "
            "FROM decision_lessons GROUP BY style_corpus_id"
        )
    counts: dict[str, int] = {}
    for rec in grouped:
        counts[str(rec["style_corpus_id"])] = rec["n"]
    return counts
# ── chat (style agent conversations) ───────────────────────────────
async def create_chat_conversation(
    *,
    title: str = "שיחה חדשה",
    style_corpus_id: UUID | None = None,
    system_prompt_version: str = "v1",
) -> dict:
    """Create a style-agent chat conversation and return the inserted row.

    Returns an empty dict if the INSERT yields no row.
    """
    insert_sql = (
        "INSERT INTO chat_conversations "
        "(title, style_corpus_id, system_prompt_version) "
        "VALUES ($1, $2, $3) "
        "RETURNING id, title, style_corpus_id, claude_session_id, "
        " system_prompt_version, created_at, last_message_at"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        created = await conn.fetchrow(
            insert_sql,
            title, style_corpus_id, system_prompt_version,
        )
    if not created:
        return {}
    return dict(created)
async def list_chat_conversations(limit: int = 50) -> list[dict]:
    """List conversations, most recently active first.

    Each row carries the linked corpus ``decision_number`` (NULL when there
    is no link) and a ``message_count``. Conversations with a NULL
    ``last_message_at`` sort last. At most ``limit`` rows are returned.
    """
    query = (
        "\n"
        "SELECT c.id, c.title, c.style_corpus_id, c.claude_session_id,\n"
        "c.created_at, c.last_message_at,\n"
        "sc.decision_number,\n"
        "(SELECT count(*) FROM chat_messages m WHERE m.conversation_id = c.id) AS message_count\n"
        "FROM chat_conversations c\n"
        "LEFT JOIN style_corpus sc ON sc.id = c.style_corpus_id\n"
        "ORDER BY c.last_message_at DESC NULLS LAST\n"
        "LIMIT $1\n"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        conversations = await conn.fetch(query, limit)
    return list(map(dict, conversations))
async def get_chat_conversation(conv_id: UUID) -> dict | None:
    """Fetch one chat conversation by id; ``None`` when it does not exist."""
    query = (
        "SELECT id, title, style_corpus_id, claude_session_id, "
        " system_prompt_version, created_at, last_message_at "
        "FROM chat_conversations WHERE id = $1"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, conv_id)
    if record:
        return dict(record)
    return None
async def delete_chat_conversation(conv_id: UUID) -> dict:
    """Delete one chat conversation; returns ``{"deleted": bool}``."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM chat_conversations WHERE id = $1", conv_id,
        )
    # The status tag is "DELETE n"; everything after the first space is n.
    _, separator, remainder = status.partition(" ")
    count_text = remainder.strip() if separator else "0"
    return {"deleted": count_text != "0"}
async def update_chat_conversation_session_id(
    conv_id: UUID, claude_session_id: str,
) -> None:
    """Store the Claude session id on a conversation and bump ``last_message_at``."""
    statement = (
        "UPDATE chat_conversations SET claude_session_id = $1, "
        " last_message_at = now() "
        "WHERE id = $2"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(statement, claude_session_id, conv_id)
async def add_chat_message(
    conv_id: UUID,
    *,
    role: str,
    content: str,
    raw_events: list | None = None,
) -> dict:
    """Append a message to a conversation and bump its ``last_message_at``.

    ``raw_events`` is stored as a JSON string (an empty list when omitted).
    Returns the inserted message row without ``raw_events``, or an empty dict
    if the INSERT yields no row.
    """
    insert_sql = (
        "INSERT INTO chat_messages "
        "(conversation_id, role, content, raw_events) "
        "VALUES ($1, $2, $3, $4) "
        "RETURNING id, conversation_id, role, content, created_at"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        events_json = json.dumps(raw_events or [])
        message = await conn.fetchrow(insert_sql, conv_id, role, content, events_json)
        await conn.execute(
            "UPDATE chat_conversations SET last_message_at = now() WHERE id = $1",
            conv_id,
        )
    if not message:
        return {}
    return dict(message)
async def list_chat_messages(conv_id: UUID) -> list[dict]:
    """Return the messages of a conversation in chronological order.

    Only ``id``, ``role``, ``content`` and ``created_at`` are selected.
    """
    query = (
        "SELECT id, role, content, created_at "
        "FROM chat_messages WHERE conversation_id = $1 "
        "ORDER BY created_at ASC"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        messages = await conn.fetch(query, conv_id)
    return list(map(dict, messages))
async def get_style_patterns(pattern_type: str | None = None) -> list[dict]:
    """Return style patterns, most frequent first.

    With a truthy ``pattern_type`` only that type is returned; otherwise all
    patterns are returned, grouped by ``pattern_type``.
    """
    if pattern_type:
        query = "SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC"
        args = (pattern_type,)
    else:
        query = "SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC"
        args = ()
    pool = await get_pool()
    async with pool.acquire() as conn:
        patterns = await conn.fetch(query, *args)
    return list(map(dict, patterns))
async def get_methodology_overrides(category: str) -> dict:
    """Return the chair's /methodology edits for one rule category.

    Categories are e.g. golden_ratios / discussion_rules / content_checklists.
    The result maps ``rule_key`` to its parsed value, read from the
    ``_global`` rows of ``appeal_type_rules``. These values OVERRIDE the
    hardcoded lessons.py defaults and the writer must consume them (T15 /
    INV-LRN4); this mirrors the merge in GET /api/methodology/{category}.

    String values are JSON-decoded; a string that is not valid JSON is kept
    verbatim, and non-string values are passed through unchanged.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        rules = await conn.fetch(
            "SELECT rule_key, rule_value FROM appeal_type_rules "
            "WHERE appeal_type = '_global' AND rule_category = $1",
            category,
        )

    def _parse(value):
        if not isinstance(value, str):
            return value
        try:
            return json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return value

    return {rule["rule_key"]: _parse(rule["rule_value"]) for rule in rules}
async def get_recent_decision_lessons(limit: int = 15, practice_area: str = "") -> list[dict]:
    """Return the most recent per-decision lessons, for the writer to consume (T15).

    These are the learnings the chair/curator attached in /training
    (``decision_lessons``), joined to their ``style_corpus`` row. An empty
    ``practice_area`` disables the practice-area filter. Rows are ordered by
    lesson creation time, newest first, capped at ``limit``; the query itself
    applies no category preference.
    """
    query = (
        "SELECT dl.lesson_text, dl.category, dl.source,\n"
        "sc.decision_number, sc.practice_area\n"
        "FROM decision_lessons dl\n"
        "JOIN style_corpus sc ON sc.id = dl.style_corpus_id\n"
        "WHERE ($2 = '' OR sc.practice_area = $2)\n"
        "ORDER BY dl.created_at DESC\n"
        "LIMIT $1"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        lessons = await conn.fetch(query, limit, practice_area)
    return list(map(dict, lessons))
async def create_draft_final_pair(case_id: UUID, draft_text: str, final_path: str = "") -> str:
    """Record the draft/final pairing at mark-final time (T5 / INV-LRN4).

    Stores the draft snapshot with status ``'final_received'`` and returns
    the new row id as a string. ``final_text`` / ``diff_stats`` / ``analysis``
    are filled in later by the curator distillation.
    """
    insert_sql = (
        "INSERT INTO draft_final_pairs (case_id, draft_text, final_path, status)\n"
        "VALUES ($1, $2, $3, 'final_received') RETURNING id"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        created = await conn.fetchrow(insert_sql, case_id, draft_text, final_path)
    pair_id = created["id"]
    return str(pair_id)
async def update_draft_final_pair(
    pair_id: UUID,
    final_text: str | None = None,
    diff_stats: dict | None = None,
    analysis: dict | None = None,
    status: str | None = None,
) -> None:
    """Advance a pairing row during curator distillation.

    Only arguments that are not ``None`` are written. ``diff_stats`` and
    ``analysis`` are serialised to JSON (non-ASCII kept) and cast to jsonb.
    ``updated_at`` is refreshed on every write. With nothing to write the
    call returns without touching the database.
    """
    assignments: list = []
    params: list = []

    def _bind(column: str, value, cast: str = "") -> None:
        # Placeholder number equals the value's 1-based position in params.
        params.append(value)
        assignments.append(f"{column} = ${len(params)}{cast}")

    if final_text is not None:
        _bind("final_text", final_text)
    if diff_stats is not None:
        _bind("diff_stats", json.dumps(diff_stats, ensure_ascii=False), "::jsonb")
    if analysis is not None:
        _bind("analysis", json.dumps(analysis, ensure_ascii=False), "::jsonb")
    if status is not None:
        _bind("status", status)
    if not assignments:
        return

    assignments.append("updated_at = now()")
    params.append(pair_id)
    id_placeholder = len(params)
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            f"UPDATE draft_final_pairs SET {', '.join(assignments)} WHERE id = ${id_placeholder}",
            *params,
        )
async def list_draft_final_pairs(status: str | None = None, limit: int = 200) -> list[dict]:
    """Reconciliation ledger: draft/final pairs with their case info and status.

    Rows are newest first and capped at ``limit``. A truthy ``status``
    restricts the list to pairs in that status. Case columns come from a
    LEFT JOIN, so they are NULL when no case row matches.
    """
    select_from = (
        "SELECT p.id, p.case_id, c.case_number, c.title, p.status,\n"
        "p.diff_stats, p.created_at, p.updated_at\n"
        "FROM draft_final_pairs p LEFT JOIN cases c ON c.id = p.case_id\n"
    )
    if status:
        query = select_from + "WHERE p.status = $1 ORDER BY p.created_at DESC LIMIT $2"
        args = (status, limit)
    else:
        query = select_from + "ORDER BY p.created_at DESC LIMIT $1"
        args = (limit,)
    pool = await get_pool()
    async with pool.acquire() as conn:
        pairs = await conn.fetch(query, *args)
    return list(map(dict, pairs))
async def insert_style_exemplar(
    decision_number: str, source: str, practice_area: str, outcome: str,
    section: str, paragraph_text: str, word_count: int, embedding: list[float],
) -> None:
    """Insert one block-level style exemplar (T1 backfill).

    ``embedding`` is bound as-is (a list of floats), not stringified.
    """
    insert_sql = (
        "INSERT INTO style_exemplars\n"
        "(decision_number, source, practice_area, outcome, section,\n"
        "paragraph_text, word_count, embedding)\n"
        "VALUES ($1, $2, $3, $4, $5, $6, $7, $8)"
    )
    values = (
        decision_number, source, practice_area, outcome, section,
        paragraph_text, word_count, embedding,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(insert_sql, *values)
async def delete_style_exemplars(decision_number: str, source: str) -> int:
    """Clear one decision's exemplars so a backfill can re-insert them idempotently.

    Returns the number of rows deleted, or 0 when the status tag cannot be
    parsed.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM style_exemplars WHERE decision_number = $1 AND source = $2",
            decision_number, source,
        )
    tokens = status.split()
    try:
        return int(tokens[-1])
    except (ValueError, IndexError):
        return 0
async def search_style_exemplars(
    query_embedding: list[float],
    section: str | None = None,
    outcome: str | None = None,
    practice_area: str | None = None,
    limit: int = 6,
) -> list[dict]:
    """Retrieve Dafna's own block-level paragraphs as STYLE exemplars (T2).

    Results are ranked by cosine similarity to ``query_embedding``;
    ``score`` is ``1 - distance``. ``section`` is an exact-match filter.
    ``outcome`` and ``practice_area`` are looser: a row passes when its value
    equals the requested one or is the empty string. All supplied filters
    are ANDed.
    """
    pool = await get_pool()
    # $1 = query embedding, $2 = limit; filters take $3 onwards.
    params: list = [query_embedding, limit]
    conditions = []
    if section:
        params.append(section)
        conditions.append(f"section = ${len(params)}")
    if outcome:
        params.append(outcome)
        conditions.append(f"(outcome = ${len(params)} OR outcome = '')")
    if practice_area:
        params.append(practice_area)
        conditions.append(f"(practice_area = ${len(params)} OR practice_area = '')")
    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    sql = (
        "\n"
        "SELECT decision_number, source, section, outcome, practice_area,\n"
        "paragraph_text, word_count,\n"
        "1 - (embedding <=> $1) AS score\n"
        "FROM style_exemplars\n"
        f"{where}\n"
        "ORDER BY embedding <=> $1\n"
        "LIMIT $2\n"
    )
    async with pool.acquire() as conn:
        exemplars = await conn.fetch(sql, *params)
    return [dict(exemplar) for exemplar in exemplars]
async def count_style_exemplars() -> dict:
    """Coverage check for the exemplar backfill.

    Returns ``total`` rows, the number of distinct ``decisions``, and a
    ``by_section`` list of ``{section, n}`` dicts sorted by count descending.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        total_rows = await conn.fetchval("SELECT count(*) FROM style_exemplars")
        section_rows = await conn.fetch(
            "SELECT section, count(*) AS n FROM style_exemplars GROUP BY section ORDER BY n DESC"
        )
        decision_count = await conn.fetchval(
            "SELECT count(DISTINCT decision_number) FROM style_exemplars"
        )
    summary = {"total": total_rows, "decisions": decision_count}
    summary["by_section"] = list(map(dict, section_rows))
    return summary
async def upsert_style_pattern(
    pattern_type: str,
    pattern_text: str,
    context: str = "",
    examples: list[str] | None = None,
    appeal_subtype: str = "",
) -> None:
    """Record one occurrence of a style pattern.

    A pattern is identified by (``pattern_type``, ``pattern_text``,
    ``appeal_subtype``). If it already exists its ``frequency`` is
    incremented and ``context`` / ``examples`` are left untouched; otherwise
    a new row is inserted with ``examples`` stored as a JSON string.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        known = await conn.fetchrow(
            "SELECT id, frequency FROM style_patterns "
            "WHERE pattern_type = $1 AND pattern_text = $2 AND appeal_subtype = $3",
            pattern_type, pattern_text, appeal_subtype,
        )
        if not known:
            insert_sql = (
                "INSERT INTO style_patterns (pattern_type, pattern_text, context, examples, appeal_subtype)\n"
                "VALUES ($1, $2, $3, $4, $5)"
            )
            await conn.execute(
                insert_sql,
                pattern_type, pattern_text, context,
                json.dumps(examples or []),
                appeal_subtype,
            )
            return
        await conn.execute(
            "UPDATE style_patterns SET frequency = frequency + 1 WHERE id = $1",
            known["id"],
        )
async def clear_style_patterns(appeal_subtype: str = "") -> None:
    """Delete style patterns, optionally only those of one ``appeal_subtype``.

    An empty ``appeal_subtype`` deletes ALL patterns.
    """
    if appeal_subtype:
        statement = "DELETE FROM style_patterns WHERE appeal_subtype = $1"
        args = (appeal_subtype,)
    else:
        statement = "DELETE FROM style_patterns"
        args = ()
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(statement, *args)
# ── Semantic Search (V2 — decision blocks & case law) ─────────────
async def search_similar_paragraphs(
    query_embedding: list[float],
    limit: int = 10,
    block_type: str | None = None,
) -> list[dict]:
    """Rank decision paragraphs by cosine similarity to ``query_embedding``.

    Each hit carries the paragraph, its block, and the owning decision and
    case. ``score`` is ``1 - distance``. A truthy ``block_type`` restricts
    results to rows whose ``decision_blocks.block_id`` equals it.
    """
    pool = await get_pool()
    # $1 = query embedding, $2 = limit; the optional block filter is $3.
    params: list = [query_embedding, limit]
    where = ""
    if block_type:
        params.append(block_type)
        where = f"WHERE db.block_id = ${len(params)}"
    sql = (
        "\n"
        "SELECT dp.content, dp.word_count, dp.paragraph_number,\n"
        "db.block_id AS block_type, db.title AS block_title,\n"
        "c.case_number, c.title AS case_title,\n"
        "d.outcome, d.author,\n"
        "1 - (pe.embedding <=> $1) AS score\n"
        "FROM paragraph_embeddings pe\n"
        "JOIN decision_paragraphs dp ON dp.id = pe.paragraph_id\n"
        "JOIN decision_blocks db ON db.id = dp.block_id\n"
        "JOIN decisions d ON d.id = db.decision_id\n"
        "JOIN cases c ON c.id = d.case_id\n"
        f"{where}\n"
        "ORDER BY pe.embedding <=> $1\n"
        "LIMIT $2\n"
    )
    async with pool.acquire() as conn:
        hits = await conn.fetch(sql, *params)
    return [dict(hit) for hit in hits]
async def search_similar_case_law(
    query_embedding: list[float],
    limit: int = 5,
) -> list[dict]:
    """Rank case-law chunks by cosine similarity to ``query_embedding``.

    Each hit combines the matched chunk text with its parent ``case_law``
    row. ``score`` is ``1 - distance``. ``subject_tags`` is JSON-decoded
    when it arrives as a string.
    """
    pool = await get_pool()
    sql = (
        "\n"
        "SELECT cl.case_number, cl.case_name, cl.court, cl.summary,\n"
        "cl.key_quote, cl.subject_tags,\n"
        "cle.chunk_text,\n"
        "1 - (cle.embedding <=> $1) AS score\n"
        "FROM case_law_embeddings cle\n"
        "JOIN case_law cl ON cl.id = cle.case_law_id\n"
        "ORDER BY cle.embedding <=> $1\n"
        "LIMIT $2\n"
    )
    async with pool.acquire() as conn:
        hits = await conn.fetch(sql, query_embedding, limit)

    def _decode(hit) -> dict:
        item = dict(hit)
        tags = item.get("subject_tags")
        if isinstance(tags, str):
            item["subject_tags"] = json.loads(tags)
        return item

    return [_decode(hit) for hit in hits]
async def search_precedents(
    query_embedding: list[float],
    limit: int = 10,
) -> list[dict]:
    """Combined search over decision paragraphs and case law.

    Both sources are queried with the same ``limit``, mapped to a common
    shape tagged by ``type``, merged, sorted by ``score`` descending and
    truncated to ``limit``. Paragraph content is cut to 500 characters;
    case-law entries use the case summary as their content.
    """
    paragraph_hits = await search_similar_paragraphs(query_embedding, limit=limit)
    case_law_hits = await search_similar_case_law(query_embedding, limit=limit)

    merged = [
        {
            "type": "decision_paragraph",
            "score": float(hit["score"]),
            "case_number": hit["case_number"],
            "case_title": hit["case_title"],
            "block_type": hit["block_type"],
            "content": hit["content"][:500],
            "author": hit["author"],
        }
        for hit in paragraph_hits
    ]
    merged.extend(
        {
            "type": "case_law",
            "score": float(hit["score"]),
            "case_number": hit["case_number"],
            "case_name": hit["case_name"],
            "court": hit["court"],
            "content": hit["summary"],
        }
        for hit in case_law_hits
    )
    ranked = sorted(merged, key=lambda entry: entry["score"], reverse=True)
    return ranked[:limit]
# ── Case precedents (CRUD) ────────────────────────────────────────
async def create_case_precedent(
    case_id: UUID,
    quote: str,
    citation: str,
    section_id: str | None = None,
    chair_note: str = "",
    pdf_document_id: UUID | None = None,
    practice_area: str | None = None,
) -> dict:
    """Attach a new precedent to a case and return the full inserted row."""
    insert_sql = (
        "\n"
        "INSERT INTO case_precedents\n"
        "(case_id, section_id, quote, citation, chair_note, pdf_document_id, practice_area)\n"
        "VALUES ($1, $2, $3, $4, $5, $6, $7)\n"
        "RETURNING *\n"
    )
    values = (case_id, section_id, quote, citation, chair_note, pdf_document_id, practice_area)
    pool = await get_pool()
    inserted = await pool.fetchrow(insert_sql, *values)
    return dict(inserted)
async def list_case_precedents(case_id: UUID) -> list[dict]:
    """List the precedents attached to a case.

    Ordered by ``section_id`` (rows without a section last), then by
    creation time.
    """
    query = (
        "\n"
        "SELECT id, case_id, section_id, quote, citation, chair_note,\n"
        "pdf_document_id, practice_area, created_at, updated_at\n"
        "FROM case_precedents\n"
        "WHERE case_id = $1\n"
        "ORDER BY section_id NULLS LAST, created_at\n"
    )
    pool = await get_pool()
    precedents = await pool.fetch(query, case_id)
    return list(map(dict, precedents))
async def delete_case_precedent(precedent_id: UUID) -> bool:
    """Delete a precedent attachment by id.

    Returns ``True`` only when the status tag reports exactly one deleted row.
    """
    pool = await get_pool()
    status = await pool.execute(
        "DELETE FROM case_precedents WHERE id = $1", precedent_id
    )
    exactly_one_removed = status == "DELETE 1"
    return exactly_one_removed
async def search_precedent_library(
    query: str, practice_area: str = "", limit: int = 10,
) -> list[dict]:
    """Search all precedents across cases by citation or quote text.

    ``query`` is matched case-insensitively as a literal substring of
    ``citation`` or ``quote``. The LIKE metacharacters ``%`` and ``_`` (and
    the backslash escape character) in the user's text are escaped, so they
    match themselves instead of acting as wildcards. A non-empty
    ``practice_area`` additionally restricts results to that area. Rows are
    newest first, capped at ``limit``.
    """
    # Backslash is PostgreSQL's default LIKE escape character; escape it too
    # so literal backslashes in the query survive.
    like_escapes = str.maketrans({"\\": "\\\\", "%": "\\%", "_": "\\_"})
    pattern = f"%{query.translate(like_escapes)}%"

    conditions = ["(citation ILIKE $1 OR quote ILIKE $1)"]
    params: list = [pattern]
    if practice_area:
        params.append(practice_area)
        conditions.append(f"practice_area = ${len(params)}")
    params.append(limit)
    sql = (
        "SELECT id, case_id, section_id, quote, citation, chair_note, "
        "practice_area, created_at "
        "FROM case_precedents "
        f"WHERE {' AND '.join(conditions)} "
        "ORDER BY created_at DESC "
        f"LIMIT ${len(params)}"
    )
    pool = await get_pool()
    rows = await pool.fetch(sql, *params)
    return [dict(r) for r in rows]
# ── Chair feedback ────────────────────────────────────────────────
async def record_chair_feedback(
    case_id: UUID | None,
    block_id: str,
    feedback_text: str,
    category: str = "other",
    lesson_extracted: str = "",
) -> UUID:
    """Store feedback from the chair (Dafna) on a draft block.

    Returns the id generated for the new ``chair_feedback`` row.
    """
    insert_sql = (
        "INSERT INTO chair_feedback\n"
        "(id, case_id, block_id, feedback_text, category, lesson_extracted)\n"
        "VALUES ($1, $2, $3, $4, $5, $6)"
    )
    pool = await get_pool()
    new_id = uuid4()
    async with pool.acquire() as conn:
        await conn.execute(
            insert_sql,
            new_id, case_id, block_id, feedback_text, category,
            lesson_extracted,
        )
    return new_id
async def list_chair_feedback(
    case_id: UUID | None = None,
    category: str | None = None,
    unresolved_only: bool = False,
    limit: int = 100,
) -> list[dict]:
    """Return chair feedback rows, newest first, with optional filters.

    ``case_id`` and ``category`` are applied only when truthy;
    ``unresolved_only`` adds ``resolved = FALSE``.  The result size is always
    capped: ``limit`` is coerced to an int and raised to at least 1
    (INV-TOOL5 / GAP-53).
    """
    db = await get_pool()

    # Equality filters that need a bound argument, in placeholder order.
    equality_filters = []
    if case_id:
        equality_filters.append(("case_id", case_id))
    if category:
        equality_filters.append(("category", category))

    clauses = [
        f"{column} = ${position}"
        for position, (column, _value) in enumerate(equality_filters, start=1)
    ]
    args: list = [value for _column, value in equality_filters]
    if unresolved_only:
        clauses.append("resolved = FALSE")

    where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
    args.append(max(1, int(limit)))
    limit_position = len(args)  # the LIMIT argument is always the last one

    async with db.acquire() as connection:
        records = await connection.fetch(
            f"SELECT * FROM chair_feedback {where} ORDER BY created_at DESC LIMIT ${limit_position}",
            *args,
        )
    return [dict(rec) for rec in records]
async def get_chair_feedback(feedback_id: UUID) -> dict | None:
    """Fetch one ``chair_feedback`` row by id, or None when it does not exist.

    The row is LEFT JOINed to ``cases`` so the result also carries
    ``case_number`` and ``case_appeal_type`` (both NULL when there is no
    matching case).
    """
    lookup_sql = """SELECT cf.*, c.case_number, c.appeal_type AS case_appeal_type
        FROM chair_feedback cf
        LEFT JOIN cases c ON c.id = cf.case_id
        WHERE cf.id = $1"""
    db = await get_pool()
    async with db.acquire() as connection:
        record = await connection.fetchrow(lookup_sql, feedback_id)
    if not record:
        return None
    return dict(record)
async def resolve_chair_feedback(
    feedback_id: UUID,
    applied_to: list[str],
) -> None:
    """Flag a feedback row as resolved and store where it was applied.

    Sets ``resolved = TRUE`` and overwrites ``applied_to`` with the given list.
    """
    update_sql = """UPDATE chair_feedback
        SET resolved = TRUE, applied_to = $2
        WHERE id = $1"""
    db = await get_pool()
    async with db.acquire() as connection:
        await connection.execute(update_sql, feedback_id, applied_to)
# ── Appraiser facts (V5 — for interim drafts) ─────────────────────
async def replace_appraiser_facts(
    case_id: UUID,
    document_id: UUID,
    facts: list[dict],
) -> int:
    """Swap out all ``appraiser_facts`` rows of one document for ``facts``.

    Each fact dict must provide ``appraiser_name``, ``fact_type``
    ('plan'|'permit') and ``identifier``; ``appraiser_side`` (default ``""``),
    ``details`` (default ``{}``, stored JSON-encoded) and ``page_number``
    (default None) are optional.

    The delete and all inserts share one transaction.  Returns the number of
    facts supplied.
    """
    insert_sql = """INSERT INTO appraiser_facts
            (case_id, document_id, appraiser_name, appraiser_side,
             fact_type, identifier, details, page_number)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8)"""
    db = await get_pool()
    async with db.acquire() as connection, connection.transaction():
        await connection.execute(
            "DELETE FROM appraiser_facts WHERE document_id = $1", document_id,
        )
        for fact in facts:
            details_json = None  # placeholder; real value is assigned below
            name = fact["appraiser_name"]
            side = fact.get("appraiser_side", "")
            kind = fact["fact_type"]
            identifier = fact["identifier"]
            details_json = json.dumps(fact.get("details", {}), ensure_ascii=False)
            page = fact.get("page_number")
            await connection.execute(
                insert_sql,
                case_id, document_id, name, side, kind, identifier,
                details_json, page,
            )
    return len(facts)
async def list_appraiser_facts(
    case_id: UUID,
    fact_type: str | None = None,
) -> list[dict]:
    """Return the appraiser facts of a case, optionally limited to one type.

    ``id``, ``case_id`` and ``document_id`` are converted to strings, and a
    ``details`` value that arrives as a JSON string is decoded.
    """
    db = await get_pool()
    async with db.acquire() as connection:
        if not fact_type:
            records = await connection.fetch(
                """SELECT * FROM appraiser_facts
                   WHERE case_id = $1
                   ORDER BY fact_type, identifier, appraiser_name""",
                case_id,
            )
        else:
            records = await connection.fetch(
                """SELECT * FROM appraiser_facts
                   WHERE case_id = $1 AND fact_type = $2
                   ORDER BY identifier, appraiser_name""",
                case_id, fact_type,
            )

    def as_plain(record) -> dict:
        item = dict(record)
        for key in ("id", "case_id", "document_id"):
            item[key] = str(item[key])
        if isinstance(item.get("details"), str):
            item["details"] = json.loads(item["details"])
        return item

    return [as_plain(rec) for rec in records]
async def detect_appraiser_conflicts(case_id: UUID) -> list[dict]:
    """Return identifier groups cited by more than one appraiser in a case.

    Facts are grouped by ``(identifier, fact_type)`` — e.g. the same national
    outline plan cited in two appraisals — and only groups with more than one
    distinct ``appraiser_name`` are kept.  The query does not compare the
    ``details`` themselves; deciding whether the entries actually disagree is
    left to the caller.

    Each result has ``identifier``, ``fact_type``, ``n_appraisers`` and
    ``entries``.  Entries carry ``appraiser_name``, ``appraiser_side``,
    ``details``, ``page_number`` and ``document_id``, ordered committee →
    appellant → deciding → anything else, then by appraiser name.
    """
    grouping_sql = """SELECT identifier, fact_type,
               json_agg(jsonb_build_object(
                   'appraiser_name', appraiser_name,
                   'appraiser_side', appraiser_side,
                   'details', details,
                   'page_number', page_number,
                   'document_id', document_id
               ) ORDER BY
                   CASE appraiser_side
                       WHEN 'committee' THEN 1
                       WHEN 'appellant' THEN 2
                       WHEN 'deciding' THEN 3
                       ELSE 4
                   END,
                   appraiser_name
               ) AS entries,
               COUNT(DISTINCT appraiser_name) AS n_appraisers
        FROM appraiser_facts
        WHERE case_id = $1
        GROUP BY identifier, fact_type
        HAVING COUNT(DISTINCT appraiser_name) > 1"""
    db = await get_pool()
    async with db.acquire() as connection:
        records = await connection.fetch(grouping_sql, case_id)

    groups = []
    for rec in records:
        group_entries = rec["entries"]
        # The aggregate may arrive as raw JSON text depending on the codecs.
        if isinstance(group_entries, str):
            group_entries = json.loads(group_entries)
        for entry in group_entries:
            # Nested details can still be a JSON string; decode it in place.
            if isinstance(entry.get("details"), str):
                entry["details"] = json.loads(entry["details"])
        groups.append(
            {
                "identifier": rec["identifier"],
                "fact_type": rec["fact_type"],
                "n_appraisers": rec["n_appraisers"],
                "entries": group_entries,
            }
        )
    return groups
# ── V7: External precedent library + halachot ─────────────────────
def _row_to_case_law(row: asyncpg.Record | dict) -> dict:
    """Normalize a ``case_law`` row into a JSON-friendly dict.

    Accepts an ``asyncpg.Record`` or any mapping (callers in this module also
    pass a plain dict).  The input is copied, never mutated.

    * ``subject_tags``: a JSON string is decoded; text that is not valid JSON,
      or that decodes to anything other than a list, becomes ``[]`` so the
      field is always a list once it has been through this function.
    * ``date``: a date/datetime value is rendered with ``isoformat()``.  A
      value that is already a string is left untouched, which makes the
      function safe to apply twice to the same row.
    """
    d = dict(row)

    tags = d.get("subject_tags")
    if isinstance(tags, str):
        try:
            parsed = json.loads(tags)
        except (TypeError, ValueError):
            parsed = []
        d["subject_tags"] = parsed if isinstance(parsed, list) else []

    when = d.get("date")
    if when is not None and not isinstance(when, str):
        d["date"] = when.isoformat()
    return d
async def get_case_law(case_law_id: UUID) -> dict | None:
    """Fetch one ``case_law`` row by id, normalized via ``_row_to_case_law``.

    Returns None when no row has that id.
    """
    db = await get_pool()
    record = await db.fetchrow(
        "SELECT * FROM case_law WHERE id = $1", case_law_id,
    )
    if not record:
        return None
    return _row_to_case_law(record)
async def get_external_case_law_by_citation(citation: str) -> dict | None:
    """Look up an uploaded external precedent by its exact ``case_number``.

    Only rows with ``source_kind = 'external_upload'`` are considered.  Returns
    a normalized subset of columns for the first match, or None if there is
    none.
    """
    lookup_sql = """
        SELECT id, case_number, case_name, court, date,
               halacha_extraction_status, source_kind, created_at
        FROM case_law
        WHERE case_number = $1
          AND source_kind = 'external_upload'
        LIMIT 1
    """
    db = await get_pool()
    record = await db.fetchrow(lookup_sql, citation)
    if not record:
        return None
    return _row_to_case_law(record)
async def mark_indexed(case_law_id: UUID) -> None:
    """Record that a row's embeddings match its current content (FU-3).

    Copies ``content_hash`` into ``indexed_hash``.  Intended to be called only
    AFTER a successful chunk + embed + store cycle.
    """
    stamp_sql = "UPDATE case_law SET indexed_hash = content_hash WHERE id = $1"
    db = await get_pool()
    async with db.acquire() as connection:
        await connection.execute(stamp_sql, case_law_id)
async def list_stale_case_law(limit: int = 500) -> list[dict]:
    """List rows whose embeddings lag behind their content (GAP-09/INV-G6).

    A row is stale when it has non-empty ``full_text`` and its
    ``content_hash`` differs from ``indexed_hash`` (NULL-safe comparison).
    Returns ``id``, ``case_number`` and ``source_kind`` for up to ``limit``
    rows, oldest first.
    """
    stale_sql = """SELECT id, case_number, source_kind
        FROM case_law
        WHERE coalesce(full_text, '') <> ''
          AND content_hash IS DISTINCT FROM indexed_hash
        ORDER BY created_at LIMIT $1"""
    db = await get_pool()
    async with db.acquire() as connection:
        records = await connection.fetch(stale_sql, limit)
    return list(map(dict, records))
async def recompute_content_hashes() -> dict:
    """Backfill ``content_hash`` for every ``case_law`` row (FU-3).

    For each row the hash of ``full_text`` (empty string when NULL) is written
    to ``content_hash``.  ``indexed_hash`` is set to the same value only when
    the row already has at least one ``precedent_chunks`` row; otherwise
    ``indexed_hash`` keeps whatever value it had.  Nothing is re-embedded.

    Returns ``{"updated": <number of rows processed>}``.
    """
    has_chunks_sql = (
        "SELECT EXISTS(SELECT 1 FROM precedent_chunks WHERE case_law_id = $1)"
    )
    update_sql = (
        "UPDATE case_law SET content_hash = $2, "
        "indexed_hash = CASE WHEN $3 THEN $2 ELSE indexed_hash END WHERE id = $1"
    )
    db = await get_pool()
    processed = 0
    async with db.acquire() as connection:
        for rec in await connection.fetch("SELECT id, full_text FROM case_law"):
            row_id = rec["id"]
            digest = _content_hash(rec["full_text"] or "")
            chunked = await connection.fetchval(has_chunks_sql, row_id)
            await connection.execute(update_sql, row_id, digest, bool(chunked))
            processed += 1
    return {"updated": processed}
async def add_case_law_relation(
    a_id: UUID, b_id: UUID, relation_type: str = "same_case_chain"
) -> None:
    """Link two ``case_law`` rows in both directions.

    Inserts ``a → b`` and ``b → a`` with the same ``relation_type``.  Pairs
    that already exist are left untouched (``ON CONFLICT DO NOTHING``), so the
    call is safe to repeat.
    """
    link_sql = """
        INSERT INTO case_law_relations(case_law_id, related_id, relation_type)
        VALUES($1, $2, $3)
        ON CONFLICT (case_law_id, related_id) DO NOTHING
    """
    both_directions = [
        (a_id, b_id, relation_type),
        (b_id, a_id, relation_type),
    ]
    db = await get_pool()
    async with db.acquire() as connection:
        await connection.executemany(link_sql, both_directions)
async def remove_case_law_relation(a_id: UUID, b_id: UUID) -> None:
    """Delete the link between two ``case_law`` rows in both directions."""
    unlink_sql = """
        DELETE FROM case_law_relations
        WHERE (case_law_id = $1 AND related_id = $2)
           OR (case_law_id = $2 AND related_id = $1)
    """
    db = await get_pool()
    await db.execute(unlink_sql, a_id, b_id)
async def get_case_law_relations(case_law_id: UUID) -> list[dict]:
    """Return the ``case_law`` rows linked to ``case_law_id``.

    Each result is the normalized related row plus a ``relation_type`` key.
    Results are ordered by decision date ascending, undated rows last.
    """
    related_sql = """
        SELECT cl.*, r.relation_type
        FROM case_law_relations r
        JOIN case_law cl ON cl.id = r.related_id
        WHERE r.case_law_id = $1
        ORDER BY cl.date ASC NULLS LAST
    """
    db = await get_pool()
    records = await db.fetch(related_sql, case_law_id)

    def shape(record) -> dict:
        # Split the join-only column off before normalizing, then re-attach it.
        fields = dict(record)
        kind = fields.pop("relation_type")
        shaped = _row_to_case_law(fields)
        shaped["relation_type"] = kind
        return shaped

    return [shape(rec) for rec in records]
async def get_case_law_by_citation(case_number: str) -> dict | None:
    """Fetch a ``case_law`` row by exact ``case_number``, or None.

    No ``source_kind`` filter or ordering is applied, so if several rows share
    the number, whichever one the database returns first is used.
    """
    db = await get_pool()
    record = await db.fetchrow(
        "SELECT * FROM case_law WHERE case_number = $1", case_number,
    )
    if not record:
        return None
    return _row_to_case_law(record)
async def create_external_case_law(
    case_number: str,
    case_name: str,
    full_text: str,
    court: str = "",
    decision_date: date | None = None,
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    summary: str = "",
    headnote: str = "",
    key_quote: str = "",
    source_url: str = "",
    source_type: str = "",
    precedent_level: str = "",
    is_binding: bool = True,
    document_id: UUID | None = None,
) -> dict:
    """Upsert a chair-uploaded external precedent into ``case_law``.

    The statement is a single atomic ``INSERT ... ON CONFLICT DO UPDATE``
    keyed on ``case_number`` for rows whose ``source_kind`` is not
    ``'internal_committee'``.  A pre-existing non-internal row with the same
    number (for instance an auto-discovered ``cited_only`` row) is therefore
    promoted to ``'external_upload'``, while an internal row with the same
    number is never touched.

    On update, ``court``, ``summary``, ``key_quote``, ``source_url``,
    ``date`` and ``document_id`` keep their stored value when the new value is
    empty/NULL; all other supplied fields overwrite.  Both extraction statuses
    are reset (``'processing'`` / ``'pending'``) and ``content_hash`` is
    recomputed from ``full_text``.

    Returns the resulting row, normalized.
    """
    # Partial unique index: the ON CONFLICT target must repeat the index
    # predicate for PostgreSQL to pick it as the arbiter.
    upsert_sql = """
        INSERT INTO case_law (
            case_number, case_name, court, date, subject_tags,
            summary, key_quote, full_text, source_url,
            source_kind, document_id, extraction_status,
            halacha_extraction_status, practice_area, appeal_subtype,
            headnote, source_type, precedent_level, is_binding, content_hash
        ) VALUES (
            $1, $2, $3, $4, $5, $6, $7, $8, $9,
            'external_upload', $10, 'processing', 'pending',
            $11, $12, $13, $14, $15, $16, $17
        )
        ON CONFLICT (case_number) WHERE source_kind <> 'internal_committee'
        DO UPDATE SET
            case_name = EXCLUDED.case_name,
            court = COALESCE(NULLIF(EXCLUDED.court, ''), case_law.court),
            date = COALESCE(EXCLUDED.date, case_law.date),
            practice_area = EXCLUDED.practice_area,
            appeal_subtype = EXCLUDED.appeal_subtype,
            subject_tags = EXCLUDED.subject_tags,
            summary = COALESCE(NULLIF(EXCLUDED.summary, ''), case_law.summary),
            headnote = EXCLUDED.headnote,
            key_quote = COALESCE(NULLIF(EXCLUDED.key_quote, ''), case_law.key_quote),
            full_text = EXCLUDED.full_text,
            source_url = COALESCE(NULLIF(EXCLUDED.source_url, ''), case_law.source_url),
            source_type = EXCLUDED.source_type,
            precedent_level = EXCLUDED.precedent_level,
            is_binding = EXCLUDED.is_binding,
            document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
            source_kind = 'external_upload',
            extraction_status = 'processing',
            halacha_extraction_status = 'pending',
            content_hash = EXCLUDED.content_hash
        RETURNING *
    """
    db = await get_pool()
    encoded_tags = json.dumps(subject_tags or [], ensure_ascii=False)
    async with db.acquire() as connection:
        # Positional arguments $1..$17, in placeholder order.
        args = (
            case_number, case_name, court, decision_date, encoded_tags,
            summary, key_quote, full_text, source_url,
            document_id, practice_area, appeal_subtype, headnote,
            source_type, precedent_level, is_binding,
            _content_hash(full_text),
        )
        record = await connection.fetchrow(upsert_sql, *args)
    return _row_to_case_law(record)
async def create_internal_committee_decision(
    case_number: str,
    case_name: str,
    full_text: str,
    court: str = "",
    decision_date: date | None = None,
    chair_name: str = "",
    district: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    summary: str = "",
    is_binding: bool = True,
    document_id: UUID | None = None,
    proceeding_type: str = "ערר",
) -> dict:
    """Upsert an appeals-committee decision (``source_kind='internal_committee'``).

    ``case_number`` is first passed through ``_canonical_case_number``.  The
    idempotency key is ``(case_number, proceeding_type)``: the same number may
    exist once per proceeding type (e.g. as an appeal and as an
    extension-of-time request filed against that appeal).

    The write is one atomic ``INSERT ... ON CONFLICT DO UPDATE``.  On update,
    ``court``, ``chair_name``, ``district``, ``summary``, ``date`` and
    ``document_id`` keep their stored value when the new value is empty/NULL;
    the other supplied fields overwrite.  Both extraction statuses are reset
    and ``content_hash`` is recomputed from ``full_text``.

    Returns the resulting row, normalized.
    """
    # Partial unique index on (case_number, proceeding_type): the predicate
    # must be repeated in the ON CONFLICT target.
    upsert_sql = """
        INSERT INTO case_law (
            case_number, case_name, court, date, chair_name, district,
            subject_tags, summary, full_text,
            source_kind, source_type, document_id,
            extraction_status, halacha_extraction_status,
            practice_area, appeal_subtype, is_binding, proceeding_type, content_hash
        ) VALUES (
            $1, $2, $3, $4, $5, $6,
            $7, $8, $9,
            'internal_committee', 'appeals_committee', $10,
            'processing', 'pending',
            $11, $12, $13, $14, $15
        )
        ON CONFLICT (case_number, proceeding_type)
        WHERE source_kind = 'internal_committee'
        DO UPDATE SET
            case_name = EXCLUDED.case_name,
            court = COALESCE(NULLIF(EXCLUDED.court, ''), case_law.court),
            date = COALESCE(EXCLUDED.date, case_law.date),
            chair_name = COALESCE(NULLIF(EXCLUDED.chair_name, ''), case_law.chair_name),
            district = COALESCE(NULLIF(EXCLUDED.district, ''), case_law.district),
            practice_area = EXCLUDED.practice_area,
            appeal_subtype = EXCLUDED.appeal_subtype,
            subject_tags = EXCLUDED.subject_tags,
            summary = COALESCE(NULLIF(EXCLUDED.summary, ''), case_law.summary),
            full_text = EXCLUDED.full_text,
            source_type = 'appeals_committee',
            source_kind = 'internal_committee',
            is_binding = EXCLUDED.is_binding,
            document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
            extraction_status = 'processing',
            halacha_extraction_status = 'pending',
            content_hash = EXCLUDED.content_hash
        RETURNING *
    """
    db = await get_pool()
    canonical_number = _canonical_case_number(case_number)
    encoded_tags = json.dumps(subject_tags or [], ensure_ascii=False)
    async with db.acquire() as connection:
        # Positional arguments $1..$15, in placeholder order.
        args = (
            canonical_number, case_name, court, decision_date, chair_name, district,
            encoded_tags, summary, full_text,
            document_id, practice_area, appeal_subtype, is_binding,
            proceeding_type, _content_hash(full_text),
        )
        record = await connection.fetchrow(upsert_sql, *args)
    return _row_to_case_law(record)
def _compute_searchable(row: dict, has_embedded_chunk: bool) -> bool:
    """Decide whether a ``case_law`` row may be surfaced by search.

    Completeness contract (INV-DM1 / 02-data-model §2a).  All of the following
    must hold:

    * ``has_embedded_chunk`` is true (cross-table fact supplied by the caller);
    * ``extraction_status`` equals ``"completed"``;
    * ``case_number``, ``case_name`` and ``source_kind`` are non-blank;
    * ``practice_area`` is non-blank, unless ``source_kind`` is
      ``"external_upload"`` (external precedents may be cross-domain and carry
      no single practice area);
    * at least one of ``headnote`` / ``summary`` is non-blank or
      ``subject_tags`` is non-empty.

    Pure function: reads only ``row`` and the flag, touches no database.
    """

    def filled(key: str) -> bool:
        # Missing, None and whitespace-only values all count as absent.
        return bool((row.get(key) or "").strip())

    if not (has_embedded_chunk and (row.get("extraction_status") or "") == "completed"):
        return False
    if not (filled("case_number") and filled("case_name")):
        return False

    area_required = (row.get("source_kind") or "") != "external_upload"
    if area_required and not filled("practice_area"):
        return False
    if not filled("source_kind"):
        return False

    tags = row.get("subject_tags") or []
    return filled("headnote") or filled("summary") or len(tags) > 0
async def recompute_searchable(case_law_id: "UUID | str | None" = None) -> int:
    """Recompute and store the ``searchable`` flag for one row or for all rows.

    With ``case_law_id`` given (UUID or anything whose ``str()`` parses as
    one) only that row is processed; with None every ``case_law`` row is.
    For each row the flag is derived by ``_compute_searchable`` from the row
    itself plus whether it has at least one chunk with a non-NULL embedding,
    and is written back unconditionally, so repeated runs converge to the same
    state.

    Returns the number of processed rows that ended up ``searchable = true``.
    """
    embedded_sql = (
        "SELECT EXISTS(SELECT 1 FROM precedent_chunks "
        "WHERE case_law_id = $1 AND embedding IS NOT NULL)"
    )
    db = await get_pool()
    searchable_total = 0
    async with db.acquire() as connection:
        if case_law_id is None:
            records = await connection.fetch("SELECT * FROM case_law")
        else:
            if isinstance(case_law_id, UUID):
                key = case_law_id
            else:
                key = UUID(str(case_law_id))
            records = await connection.fetch(
                "SELECT * FROM case_law WHERE id = $1", key)

        for rec in records:
            data = dict(rec)
            # subject_tags may arrive as JSON text; undecodable text counts as
            # "no tags".
            raw_tags = data.get("subject_tags")
            if isinstance(raw_tags, str):
                try:
                    raw_tags = json.loads(raw_tags)
                except (ValueError, TypeError):
                    raw_tags = []
            data["subject_tags"] = raw_tags or []

            embedded = await connection.fetchval(embedded_sql, data["id"])
            flag = _compute_searchable(data, bool(embedded))
            await connection.execute(
                "UPDATE case_law SET searchable = $2 WHERE id = $1", data["id"], flag)
            if flag:
                searchable_total += 1
    return searchable_total
async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
    """Patch metadata columns on a ``case_law`` row.

    Accepted keyword names: case_number, case_name, court, date,
    practice_area, appeal_subtype, subject_tags, summary, headnote, key_quote,
    source_url, source_type, precedent_level, is_binding, district,
    chair_name, proceeding_type, citation_formatted.  Any other keyword is
    silently ignored.

    ``subject_tags`` is stored JSON-encoded (a falsy value becomes ``[]``).
    When no accepted field is supplied nothing is written and the current row
    is returned.  Returns the updated row, normalized, or None when no row has
    that id.
    """
    editable = frozenset({
        "case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
        "subject_tags", "summary", "headnote", "key_quote", "source_url",
        "source_type", "precedent_level", "is_binding", "district", "chair_name",
        "proceeding_type", "citation_formatted",
    })
    changes = {name: value for name, value in fields.items() if name in editable}
    if not changes:
        return await get_case_law(case_law_id)

    db = await get_pool()
    # $1 is the row id; each changed column takes the next placeholder.
    # Column names come only from the whitelist above, never from user text.
    args: list = [case_law_id]
    assignments = []
    for name, value in changes.items():
        if name == "subject_tags":
            value = json.dumps(value or [], ensure_ascii=False)
        args.append(value)
        assignments.append(f"{name} = ${len(args)}")

    statement = f"UPDATE case_law SET {', '.join(assignments)} WHERE id = $1 RETURNING *"
    record = await db.fetchrow(statement, *args)
    if not record:
        return None
    return _row_to_case_law(record)
async def set_case_law_extraction_status(case_law_id: UUID, status: str) -> None:
    """Write ``extraction_status`` for a ``case_law`` row.

    For the terminal states ``'completed'`` and ``'failed'`` the same statement
    also sets ``metadata_extraction_requested_at`` to NULL, so that queue
    scanners keyed on ``*_requested_at IS NOT NULL`` (such as
    ``process_pending_extractions``) stop re-picking the row and the UI's
    ``isPrecedentActive`` check is not left blocked.
    """
    is_terminal = status in ("completed", "failed")
    statement = (
        "UPDATE case_law SET extraction_status = $2, "
        "metadata_extraction_requested_at = NULL WHERE id = $1"
    ) if is_terminal else (
        "UPDATE case_law SET extraction_status = $2 WHERE id = $1"
    )
    db = await get_pool()
    await db.execute(statement, case_law_id, status)
async def set_case_law_halacha_status(case_law_id: UUID, status: str) -> None:
    """Write ``halacha_extraction_status`` for a ``case_law`` row.

    For the terminal states ``'completed'`` and ``'failed'`` the same statement
    also sets ``halacha_extraction_requested_at`` to NULL, so neither the
    queue nor the UI sees a stale request flag.
    """
    is_terminal = status in ("completed", "failed")
    statement = (
        "UPDATE case_law SET halacha_extraction_status = $2, "
        "halacha_extraction_requested_at = NULL WHERE id = $1"
    ) if is_terminal else (
        "UPDATE case_law SET halacha_extraction_status = $2 WHERE id = $1"
    )
    db = await get_pool()
    await db.execute(statement, case_law_id, status)
async def set_case_law_metadata_status(case_law_id: UUID, status: str) -> None:
    """Write ``metadata_extraction_status`` for a ``case_law`` row.

    For the terminal states ``'completed'`` and ``'failed'`` the same statement
    also sets ``metadata_extraction_requested_at`` to NULL, so that the
    local-MCP queue (``process_pending_extractions``, which scans
    ``WHERE *_requested_at IS NOT NULL``) stops re-picking the row and the
    UI's ``isPrecedentActive`` check settles.
    """
    is_terminal = status in ("completed", "failed")
    statement = (
        "UPDATE case_law SET metadata_extraction_status = $2, "
        "metadata_extraction_requested_at = NULL WHERE id = $1"
    ) if is_terminal else (
        "UPDATE case_law SET metadata_extraction_status = $2 WHERE id = $1"
    )
    db = await get_pool()
    await db.execute(statement, case_law_id, status)
async def list_external_case_law(
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    source_type: str = "",
    search: str = "",
    limit: int = 100,
    offset: int = 0,
    source_kind: str = "external_upload",
) -> list[dict]:
    """List precedents of one ``source_kind`` with simple optional filters.

    ``source_kind`` selects the corpus (default: chair-uploaded external
    precedents).  The special value ``"all_committees"`` expands to
    ``source_kind='internal_committee' OR (source_kind='external_upload' AND
    source_type='appeals_committee')``.

    Filters are applied only when non-empty: ``practice_area``,
    ``precedent_level`` and ``source_type`` match exactly; ``court`` is a
    case-insensitive substring match; ``search`` is a case-insensitive
    substring match against case_number, case_name, summary or headnote.

    Results are newest first, paged with ``limit`` / ``offset``.  Each row also
    carries ``halachot_count`` and ``approved_count`` (halachot whose
    review_status is 'approved' or 'published'); the two
    ``*_extraction_requested_at`` timestamps are rendered as ISO strings.

    Every caller-supplied value, including ``source_kind``, is sent as a bound
    query argument and never spliced into the SQL text.
    """
    pool = await get_pool()
    conditions: list[str] = []
    params: list = []

    def bind(value) -> str:
        """Register ``value`` as a query argument and return its ``$n`` placeholder."""
        params.append(value)
        return f"${len(params)}"

    if source_kind == "all_committees":
        conditions.append(
            "(source_kind = 'internal_committee' OR "
            "(source_kind = 'external_upload' AND source_type = 'appeals_committee'))"
        )
    else:
        # Previously interpolated into the SQL string (injection risk).
        conditions.append(f"source_kind = {bind(source_kind)}")

    if practice_area:
        conditions.append(f"practice_area = {bind(practice_area)}")
    if court:
        court_pattern = f"%{court}%"
        conditions.append(f"court ILIKE {bind(court_pattern)}")
    if precedent_level:
        conditions.append(f"precedent_level = {bind(precedent_level)}")
    if source_type:
        conditions.append(f"source_type = {bind(source_type)}")
    if search:
        search_pattern = f"%{search}%"
        ph = bind(search_pattern)  # one argument, referenced four times
        conditions.append(
            f"(case_number ILIKE {ph} OR case_name ILIKE {ph} "
            f"OR summary ILIKE {ph} OR headnote ILIKE {ph})"
        )

    where_sql = " AND ".join(conditions)
    limit_ph = bind(limit)
    offset_ph = bind(offset)
    sql = f"""
        SELECT id, case_number, case_name, court, date, practice_area,
               appeal_subtype, source_type, precedent_level, is_binding,
               summary, headnote, subject_tags, source_kind,
               chair_name, district, citation_formatted,
               extraction_status, halacha_extraction_status,
               metadata_extraction_status,
               metadata_extraction_requested_at,
               halacha_extraction_requested_at,
               created_at,
               (SELECT COUNT(*) FROM halachot h WHERE h.case_law_id = case_law.id) AS halachot_count,
               (SELECT COUNT(*) FROM halachot h WHERE h.case_law_id = case_law.id
                   AND h.review_status IN ('approved', 'published')) AS approved_count
        FROM case_law
        WHERE {where_sql}
        ORDER BY created_at DESC
        LIMIT {limit_ph} OFFSET {offset_ph}
    """
    rows = await pool.fetch(sql, *params)

    out = []
    for r in rows:
        d = _row_to_case_law(r)
        # Render timestamps as ISO strings so the JSON layer stays simple.
        for k in ("metadata_extraction_requested_at", "halacha_extraction_requested_at"):
            if d.get(k) is not None:
                d[k] = d[k].isoformat()
        out.append(d)
    return out
async def delete_case_law(case_law_id: UUID) -> bool:
    """Delete a precedent row by id.

    Only ``case_law`` is deleted here; removal of its chunks and halachot is
    left to the schema (presumably ``ON DELETE CASCADE`` foreign keys — verify
    against the migrations).  Returns True only when the status tag is exactly
    ``"DELETE 1"``.
    """
    db = await get_pool()
    status_tag = await db.execute(
        "DELETE FROM case_law WHERE id = $1", case_law_id,
    )
    removed_one = status_tag == "DELETE 1"
    return removed_one
async def store_precedent_chunks(
    case_law_id: UUID, chunks: list[dict],
) -> int:
    """Replace the precedent chunks of a ``case_law`` row (single-tier).

    Each chunk dict must provide ``chunk_index`` and ``content``; optional keys
    are ``section_type`` (default ``"other"``), ``page_number`` and
    ``embedding`` (list[float] or None).

    The INSERT does not set ``chunk_role`` or ``parent_chunk_id``, so those
    take their column defaults (per the V17 schema notes: ``'child'`` and
    NULL).  For two-tier ingestion, see
    :func:`store_precedent_chunks_hierarchical`.

    The DELETE and all INSERTs run inside one transaction: if any insert
    fails, the previous chunks are restored instead of leaving the precedent
    with no chunks or a partial set.

    Returns the number of chunks supplied.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Atomic replace, matching store_precedent_chunks_hierarchical.
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM precedent_chunks WHERE case_law_id = $1",
                case_law_id,
            )
            for c in chunks:
                await conn.execute(
                    """INSERT INTO precedent_chunks
                           (case_law_id, chunk_index, content, section_type,
                            page_number, embedding)
                       VALUES ($1, $2, $3, $4, $5, $6)""",
                    case_law_id,
                    c["chunk_index"],
                    c["content"],
                    c.get("section_type", "other"),
                    c.get("page_number"),
                    c.get("embedding"),
                )
    return len(chunks)
async def store_precedent_chunks_hierarchical(
    case_law_id: UUID,
    chunks: list[dict],
) -> dict:
    """Replace the precedent chunks of a ``case_law`` row (two-tier).

    Input dicts are split by ``role`` (``'parent'`` / ``'child'``; any other
    role is ignored).  Expected keys:

    * ``local_id`` (parents): in-batch integer id used to wire children to the
      parent's database UUID;
    * ``parent_local_id`` (children): ``local_id`` of the parent in this same
      batch, or None / absent for no parent;
    * ``chunk_index``, ``content``, and optionally ``section_type`` (default
      ``"other"``) and ``page_number``;
    * ``embedding`` (children only; parents are always stored with NULL).

    Everything runs in a single transaction: existing chunks are deleted,
    parents are inserted first so their generated ids are known, then
    children are inserted with ``parent_chunk_id`` resolved.  A child whose
    ``parent_local_id`` matches no parent in the batch is stored with a NULL
    parent.

    Returns ``{"parents": N, "children": M, "total": N + M}``.  When the batch
    contains neither parents nor children nothing is touched (no delete) and
    all counts are 0.
    """
    parent_items = [item for item in chunks if item.get("role") == "parent"]
    child_items = [item for item in chunks if item.get("role") == "child"]
    if not (parent_items or child_items):
        return {"parents": 0, "children": 0, "total": 0}

    parent_sql = """INSERT INTO precedent_chunks
            (case_law_id, chunk_index, content, section_type,
             page_number, embedding, chunk_role, parent_chunk_id)
        VALUES ($1, $2, $3, $4, $5, NULL, 'parent', NULL)
        RETURNING id"""
    child_sql = """INSERT INTO precedent_chunks
            (case_law_id, chunk_index, content, section_type,
             page_number, embedding, chunk_role, parent_chunk_id)
        VALUES ($1, $2, $3, $4, $5, $6, 'child', $7)"""

    db = await get_pool()
    async with db.acquire() as connection, connection.transaction():
        await connection.execute(
            "DELETE FROM precedent_chunks WHERE case_law_id = $1",
            case_law_id,
        )

        # Pass 1 — parents.  Their embedding is deliberately NULL: parents are
        # not matched on, they only carry retrieval context.
        db_id_by_local: dict[int, UUID] = {}
        for parent in parent_items:
            inserted = await connection.fetchrow(
                parent_sql,
                case_law_id,
                parent["chunk_index"],
                parent["content"],
                parent.get("section_type", "other"),
                parent.get("page_number"),
            )
            db_id_by_local[int(parent["local_id"])] = inserted["id"]

        # Pass 2 — children, pointing at the parent ids captured above.
        for child in child_items:
            parent_ref = child.get("parent_local_id")
            if parent_ref is None:
                parent_db_id = None
            else:
                parent_db_id = db_id_by_local.get(int(parent_ref))
            await connection.execute(
                child_sql,
                case_law_id,
                child["chunk_index"],
                child["content"],
                child.get("section_type", "other"),
                child.get("page_number"),
                child.get("embedding"),
                parent_db_id,
            )

    n_parents = len(parent_items)
    n_children = len(child_items)
    return {
        "parents": n_parents,
        "children": n_children,
        "total": n_parents + n_children,
    }
async def list_precedent_chunks(
    case_law_id: UUID,
    section_types: tuple[str, ...] | None = None,
) -> list[dict]:
    """Return the chunks of a precedent in ``chunk_index`` order.

    When ``section_types`` is non-empty only chunks whose ``section_type`` is
    in it are returned.  Each dict has ``id``, ``chunk_index``, ``content``,
    ``section_type``, ``page_number`` and ``halacha_extracted_at``.
    """
    db = await get_pool()
    if not section_types:
        records = await db.fetch(
            """SELECT id, chunk_index, content, section_type, page_number,
                      halacha_extracted_at
               FROM precedent_chunks
               WHERE case_law_id = $1
               ORDER BY chunk_index""",
            case_law_id,
        )
    else:
        records = await db.fetch(
            """SELECT id, chunk_index, content, section_type, page_number,
                      halacha_extracted_at
               FROM precedent_chunks
               WHERE case_law_id = $1 AND section_type = ANY($2::text[])
               ORDER BY chunk_index""",
            case_law_id, list(section_types),
        )
    return list(map(dict, records))
async def delete_halachot(case_law_id: UUID) -> int:
    """Delete all halachot of a precedent and return how many were removed.

    The count is parsed from the command status tag (e.g. ``"DELETE 5"``); if
    the tag cannot be parsed, 0 is returned.
    """
    db = await get_pool()
    status_tag = await db.execute(
        "DELETE FROM halachot WHERE case_law_id = $1", case_law_id,
    )
    try:
        removed = int(status_tag.split()[-1])
    except (ValueError, IndexError):
        removed = 0
    return removed
async def store_halachot(case_law_id: UUID, halachot: list[dict]) -> int:
    """Bulk-insert extracted halachot for one precedent, atomically.

    Each halacha dict must provide ``rule_statement`` and ``supporting_quote``.
    Optional keys: ``rule_type`` (default ``"binding"``),
    ``reasoning_summary``, ``page_reference``, ``practice_areas``,
    ``subject_tags``, ``cites``, ``confidence`` (default 0.0),
    ``quote_verified`` (default False) and ``embedding``.

    ``halacha_index`` is the item's position in ``halachot`` (starting at 0).

    ``review_status`` is decided by extractor confidence against
    ``config.HALACHA_AUTO_APPROVE_THRESHOLD``:

    * confidence >= threshold → ``'approved'``; ``reviewer`` is recorded as
      ``"auto-approved (confidence ≥ <threshold>)"`` and ``reviewed_at`` is
      set to ``now()`` for traceability;
    * otherwise → ``'pending_review'`` with NULL ``reviewer`` /
      ``reviewed_at`` (the chair must approve manually).

    All inserts share one transaction, so a failure part-way through (for
    example a dict missing a required key) leaves no partial batch behind.

    Returns the number of halachot inserted (0 for an empty list, in which
    case the database is not touched).
    """
    if not halachot:
        return 0
    threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            for i, h in enumerate(halachot):
                confidence = float(h.get("confidence", 0.0))
                auto_approve = confidence >= threshold
                review_status = "approved" if auto_approve else "pending_review"
                reviewer = (
                    f"auto-approved (confidence ≥ {threshold:.2f})"
                    if auto_approve else None
                )
                # Only ever one of two fixed SQL fragments — never user input.
                reviewed_at_clause = "now()" if auto_approve else "NULL"
                await conn.execute(
                    f"""INSERT INTO halachot
                            (case_law_id, halacha_index, rule_statement, rule_type,
                             reasoning_summary, supporting_quote, page_reference,
                             practice_areas, subject_tags, cites, confidence,
                             quote_verified, embedding, review_status,
                             reviewer, reviewed_at)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
                                $12, $13, $14, $15, {reviewed_at_clause})""",
                    case_law_id,
                    i,
                    h["rule_statement"],
                    h.get("rule_type", "binding"),
                    h.get("reasoning_summary", ""),
                    h["supporting_quote"],
                    h.get("page_reference", ""),
                    h.get("practice_areas", []),
                    h.get("subject_tags", []),
                    h.get("cites", []),
                    confidence,
                    h.get("quote_verified", False),
                    h.get("embedding"),
                    review_status,
                    reviewer,
                )
    return len(halachot)
async def reset_halacha_extraction(case_law_id: UUID) -> None:
    """Prepare a precedent for a from-scratch halacha re-extraction.

    In one transaction: deletes all of its halachot and clears
    ``halacha_extracted_at`` on all of its chunks, so every chunk is processed
    again.  Meant for an explicit re-extract, not for resuming a run.
    """
    wipe_sql = "DELETE FROM halachot WHERE case_law_id = $1"
    uncheckpoint_sql = (
        "UPDATE precedent_chunks SET halacha_extracted_at = NULL "
        "WHERE case_law_id = $1"
    )
    db = await get_pool()
    async with db.acquire() as connection, connection.transaction():
        await connection.execute(wipe_sql, case_law_id)
        await connection.execute(uncheckpoint_sql, case_law_id)
async def mark_all_chunks_extracted(case_law_id: UUID) -> int:
    """Checkpoint every not-yet-marked chunk of a precedent as extracted.

    Sets ``halacha_extracted_at = now()`` on chunks where it is still NULL.
    Used to backfill pre-V25 precedents (halachot exist but no chunk was
    checkpointed) so that a resume run skips them rather than extracting, and
    thereby duplicating, them again.

    Returns the number of rows updated, parsed from the command status tag; 0
    if the tag cannot be parsed.
    """
    checkpoint_sql = (
        "UPDATE precedent_chunks SET halacha_extracted_at = now() "
        "WHERE case_law_id = $1 AND halacha_extracted_at IS NULL"
    )
    db = await get_pool()
    status_tag = await db.execute(checkpoint_sql, case_law_id)
    try:
        touched = int(status_tag.split()[-1])
    except (ValueError, IndexError):
        touched = 0
    return touched
async def store_halachot_for_chunk(
    case_law_id: UUID, chunk_id: UUID, halachot: list[dict],
) -> int:
    """Persist ONE chunk's halachot and mark the chunk done — atomically.

    Crash-safe + resumable: the inserts and the ``halacha_extracted_at`` stamp
    on the chunk share one transaction, so a resumed run skips the chunk only
    if its halachot actually landed. ``halacha_index`` continues from the
    precedent's current max so appends across chunks never collide. The chunk
    is marked even when ``halachot`` is empty (so resume skips genuinely-empty
    chunks too). The caller is expected to serialize calls (a single
    in-process store-lock) so the MAX read stays race-free.

    Two gates encode the strict rubric (docs/halacha-strict-rubric.md) so the
    corpus stays clean at the source instead of accumulating noise:

    * Auto-approve gate — a halacha auto-approves only if confidence ≥
      ``config.HALACHA_AUTO_APPROVE_THRESHOLD`` AND it carries no
      ``quality_flags``. Everything else is stored as ``pending_review``.
    * Dedup-on-insert — within the SAME precedent, a halacha is skipped if its
      normalized ``supporting_quote`` already exists (including earlier items
      of this batch), or its embedding has cosine similarity ≥
      ``config.HALACHA_DEDUP_COSINE`` against an already-stored halacha.

    Each item of ``halachot`` must carry ``rule_statement`` and
    ``supporting_quote``; the remaining keys are optional and defaulted.

    Returns the number of halachot actually INSERTED (after dedup skips).
    """
    threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
    dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE  # cosine sim → distance
    # A similarity threshold above 1.0 switches the semantic check off.
    semantic_dedup = config.HALACHA_DEDUP_COSINE <= 1.0
    pool = await get_pool()
    inserted = 0
    skipped = 0
    async with pool.acquire() as conn:
        async with conn.transaction():
            base = await conn.fetchval(
                "SELECT COALESCE(MAX(halacha_index), -1) + 1 FROM halachot "
                "WHERE case_law_id = $1", case_law_id,
            )
            # Existing normalized quotes for exact-dedup (incl. within-batch).
            existing_quotes = {
                halacha_quality.normalize_text(r["supporting_quote"])
                for r in await conn.fetch(
                    "SELECT supporting_quote FROM halachot WHERE case_law_id = $1",
                    case_law_id,
                )
            }
            for h in halachot:
                norm_quote = halacha_quality.normalize_text(h["supporting_quote"])
                # 1) exact normalized-quote duplicate within this precedent
                if norm_quote and norm_quote in existing_quotes:
                    skipped += 1
                    continue
                # 2) semantic near-duplicate (rule embedding cosine). Rows
                #    inserted earlier in this transaction are visible here,
                #    so the check also covers the current batch.
                emb = h.get("embedding")
                if emb is not None and semantic_dedup:
                    dup = await conn.fetchval(
                        "SELECT 1 FROM halachot WHERE case_law_id = $1 "
                        "AND embedding IS NOT NULL AND (embedding <=> $2) <= $3 "
                        "LIMIT 1",
                        case_law_id, emb, dedup_distance,
                    )
                    if dup:
                        skipped += 1
                        continue
                # ``or 0.0`` also covers an explicit ``None`` confidence,
                # which would otherwise make float() raise.
                confidence = float(h.get("confidence") or 0.0)
                flags = h.get("quality_flags") or []
                auto_approve = confidence >= threshold and not flags
                review_status = "approved" if auto_approve else "pending_review"
                reviewer = (
                    f"auto-approved (confidence ≥ {threshold:.2f})"
                    if auto_approve else None
                )
                # Only one of two fixed SQL literals is ever interpolated.
                reviewed_at_clause = "now()" if auto_approve else "NULL"
                await conn.execute(
                    f"""INSERT INTO halachot
                        (case_law_id, halacha_index, rule_statement, rule_type,
                         reasoning_summary, supporting_quote, page_reference,
                         practice_areas, subject_tags, cites, confidence,
                         quote_verified, quality_flags, embedding, review_status,
                         reviewer, reviewed_at)
                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
                            $12, $13, $14, $15, $16, {reviewed_at_clause})""",
                    case_law_id, base + inserted, h["rule_statement"],
                    h.get("rule_type", "binding"), h.get("reasoning_summary", ""),
                    h["supporting_quote"], h.get("page_reference", ""),
                    h.get("practice_areas", []), h.get("subject_tags", []),
                    h.get("cites", []), confidence, h.get("quote_verified", False),
                    flags, emb, review_status, reviewer,
                )
                existing_quotes.add(norm_quote)
                inserted += 1
            # Checkpoint the chunk inside the same transaction as the inserts.
            await conn.execute(
                "UPDATE precedent_chunks SET halacha_extracted_at = now() "
                "WHERE id = $1", chunk_id,
            )
    if skipped:
        logger.info(
            "store_halachot_for_chunk: case_law=%s chunk=%s: %d inserted, "
            "%d skipped as duplicates", case_law_id, chunk_id, inserted, skipped,
        )
    return inserted
async def list_halachot(
    case_law_id: UUID | None = None,
    review_status: str | None = None,
    practice_area: str | None = None,
    limit: int = 200,
    offset: int = 0,
) -> list[dict]:
    """List halachot with their parent precedent and corroboration summary.

    Optional filters (all AND-ed): ``case_law_id`` (exact), ``review_status``
    (exact) and ``practice_area`` (membership in ``h.practice_areas``). Rows
    are ordered by precedent then ``halacha_index`` and paged with
    ``limit`` / ``offset``.

    Each row carries ``corroboration_count`` (distinct citing sources whose
    treatment is 'followed' or 'explained') and ``corroboration_negative``
    (true if any treatment is distinguished / criticized / questioned /
    overruled). ``decision_date`` is returned as an ISO string when present.
    """
    pool = await get_pool()
    clauses: list[str] = []
    args: list = []

    def bind(template: str, value) -> None:
        # Placeholder number == position of the value just appended.
        args.append(value)
        clauses.append(template.format(p=f"${len(args)}"))

    if case_law_id is not None:
        bind("h.case_law_id = {p}", case_law_id)
    if review_status:
        bind("h.review_status = {p}", review_status)
    if practice_area:
        bind("{p} = ANY(h.practice_areas)", practice_area)

    where_sql = "WHERE " + " AND ".join(clauses) if clauses else ""
    limit_pos = len(args) + 1
    offset_pos = limit_pos + 1
    args += [limit, offset]

    sql = f"""
        SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement,
               h.rule_type, h.reasoning_summary, h.supporting_quote,
               h.page_reference, h.practice_areas, h.subject_tags,
               h.cites, h.confidence, h.quote_verified, h.quality_flags,
               h.review_status,
               h.reviewer, h.reviewed_at, h.created_at, h.updated_at,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level,
               COALESCE(cor.corroboration_count, 0)::int AS corroboration_count,
               COALESCE(cor.corroboration_negative, false) AS corroboration_negative
        FROM halachot h
        LEFT JOIN case_law cl ON cl.id = h.case_law_id
        LEFT JOIN (
            SELECT halacha_id,
                   count(DISTINCT COALESCE(citing_case_law_id::text,
                         citing_decision_id::text, source_citation_id::text))
                     FILTER (WHERE treatment IN ('followed','explained'))
                     AS corroboration_count,
                   bool_or(treatment IN
                     ('distinguished','criticized','questioned','overruled'))
                     AS corroboration_negative
            FROM halacha_citation_corroboration
            GROUP BY halacha_id
        ) cor ON cor.halacha_id = h.id
        {where_sql}
        ORDER BY h.case_law_id, h.halacha_index
        LIMIT ${limit_pos} OFFSET ${offset_pos}
    """
    records = [dict(row) for row in await pool.fetch(sql, *args)]
    for rec in records:
        decided = rec.get("decision_date")
        if decided is not None:
            rec["decision_date"] = decided.isoformat()
    return records
async def update_halacha(
    halacha_id: UUID,
    review_status: str | None = None,
    reviewer: str = "",
    rule_statement: str | None = None,
    reasoning_summary: str | None = None,
    subject_tags: list[str] | None = None,
    practice_areas: list[str] | None = None,
) -> dict | None:
    """Update a halacha — used by the chair to approve/reject/edit.

    Only arguments that are not ``None`` are written. Setting
    ``review_status`` to approved / rejected / published / deferred also
    stamps ``reviewed_at`` and stores ``reviewer``. ``updated_at`` is bumped
    on every effective update.

    Returns the updated row as a dict (without ``embedding``), or ``None``
    when there was nothing to update or no row matched ``halacha_id``.
    """
    pool = await get_pool()
    assignments: list[str] = []
    args: list = [halacha_id]  # $1 is always the row id

    def assign(column: str, value) -> None:
        # After the append, len(args) is the placeholder number of ``value``.
        args.append(value)
        assignments.append(f"{column} = ${len(args)}")

    if review_status is not None:
        assign("review_status", review_status)
        if review_status in ("approved", "rejected", "published", "deferred"):
            assignments.append("reviewed_at = now()")
            assign("reviewer", reviewer)

    editable_fields = (
        ("rule_statement", rule_statement),
        ("reasoning_summary", reasoning_summary),
        ("subject_tags", subject_tags),
        ("practice_areas", practice_areas),
    )
    for column, value in editable_fields:
        if value is not None:
            assign(column, value)

    if not assignments:
        return None
    assignments.append("updated_at = now()")

    # Exclude `embedding` — it's a numpy.ndarray of np.float32 that breaks
    # FastAPI's jsonable_encoder downstream (PATCH /api/halachot/{id}).
    # Callers that need it (none today) can re-fetch with get_halacha.
    sql = f"""
        UPDATE halachot SET {', '.join(assignments)} WHERE id = $1
        RETURNING id, case_law_id, halacha_index, rule_statement, rule_type,
                  reasoning_summary, supporting_quote, page_reference,
                  practice_areas, subject_tags, cites, confidence,
                  quote_verified, quality_flags, review_status, reviewer,
                  reviewed_at, created_at, updated_at
    """
    updated = await pool.fetchrow(sql, *args)
    if not updated:
        return None
    return dict(updated)
# Statuses the chair can set via review (batch or single). 'deferred' = snooze:
# stays out of the active library AND out of the default pending queue, without
# the finality of 'rejected'. #84 review-queue triage.
# update_halachot_batch rejects (returns 0 for) any status not listed here.
HALACHA_REVIEW_STATUSES = {
"pending_review", "approved", "rejected", "published", "deferred",
}
async def update_halachot_batch(
    halacha_ids: list[str], review_status: str, reviewer: str = "",
) -> int:
    """Bulk-set ``review_status`` for many halachot in one statement.

    Powers the #84 "approve/reject/defer the whole group" action — one
    request, one UPDATE (vs N PATCH round-trips). Only status, ``updated_at``
    and — for approved / rejected / published / deferred — ``reviewed_at`` +
    ``reviewer`` are touched; content is never edited in batch.

    Returns the number of rows updated; 0 when ``halacha_ids`` is empty or
    ``review_status`` is not in ``HALACHA_REVIEW_STATUSES``. An id that is not
    a valid UUID makes ``UUID()`` raise ``ValueError``.
    """
    if not halacha_ids:
        return 0
    if review_status not in HALACHA_REVIEW_STATUSES:
        return 0

    ids = [UUID(str(raw_id)) for raw_id in halacha_ids]
    stamp = review_status in ("approved", "rejected", "published", "deferred")

    # The reviewer placeholder ($3) exists only when the status stamps a review.
    args: list = [ids, review_status]
    stamp_sql = ""
    if stamp:
        stamp_sql = ", reviewed_at = now(), reviewer = $3"
        args.append(reviewer)

    pool = await get_pool()
    status_tag = await pool.execute(
        f"""UPDATE halachot
            SET review_status = $2,
                updated_at = now()
                {stamp_sql}
            WHERE id = ANY($1::uuid[])""",
        *args,
    )
    # The command tag looks like "UPDATE <n>"; the last token is the row count.
    try:
        updated = int(status_tag.split()[-1])
    except (ValueError, IndexError):
        updated = 0
    return updated
async def approve_halacha_by_corroboration(
    halacha_id: UUID, n_sources: int, min_cites: int,
) -> bool:
    """Approve a halacha on the strength of citation corroboration.

    The UPDATE is conditional on the row currently being ``pending_review``,
    so ``published`` / ``rejected`` / already-``approved`` rows are never
    touched (INV-COR5: the chair gate is preserved for everything else). The
    ``reviewer`` column records the corroboration basis as provenance
    (INV-COR6).

    Returns True iff a row actually transitioned.
    """
    provenance = f"corroborated ({n_sources} judicial citations ≥ {min_cites})"
    sql = (
        "UPDATE halachot SET review_status='approved', reviewer=$2, "
        "reviewed_at=now(), updated_at=now() "
        "WHERE id=$1 AND review_status='pending_review' RETURNING id"
    )
    pool = await get_pool()
    transitioned = await pool.fetchrow(sql, halacha_id, provenance)
    return transitioned is not None
async def demote_halacha_overruled(halacha_id: UUID) -> bool:
    """Send an APPROVED halacha back to the chair gate after an overruling.

    Used when a later citing court overruled the rule (INV-COR2). The UPDATE
    only matches rows whose status is ``approved`` and moves them to
    ``pending_review``; ``published`` / ``rejected`` / already-pending rows
    are left alone. ``reviewed_at`` is cleared and the reviewer note records
    why the row re-entered the queue.

    Returns True iff a row transitioned.
    """
    sql = (
        "UPDATE halachot SET review_status='pending_review', "
        "reviewer='flagged: overruled by later citation (X11)', "
        "reviewed_at=NULL, updated_at=now() "
        "WHERE id=$1 AND review_status='approved' RETURNING id"
    )
    pool = await get_pool()
    demoted = await pool.fetchrow(sql, halacha_id)
    return demoted is not None
async def list_corroboration_grouped(case_law_id: UUID) -> dict[str, list[dict]]:
    """Corroboration links of a cited precedent, grouped per halacha.

    Returns ``{halacha_id: [{"source_id", "treatment"}, ...]}`` — the shape
    ``aggregate()`` consumes. ``source_id`` is the citing case_law id, else
    the citing decision id, else the citation row id (so two anonymous rows
    are not collapsed into one source).
    """
    sql = (
        "SELECT hcc.halacha_id::text AS halacha_id, "
        "  COALESCE(hcc.citing_case_law_id::text, hcc.citing_decision_id::text, "
        "           hcc.source_citation_id::text) AS source_id, "
        "  hcc.treatment "
        "FROM halacha_citation_corroboration hcc "
        "JOIN halachot h ON h.id = hcc.halacha_id "
        "WHERE h.case_law_id = $1"
    )
    pool = await get_pool()
    grouped: dict[str, list[dict]] = {}
    for rec in await pool.fetch(sql, case_law_id):
        link = {"source_id": rec["source_id"], "treatment": rec["treatment"]}
        halacha_key = rec["halacha_id"]
        if halacha_key in grouped:
            grouped[halacha_key].append(link)
        else:
            grouped[halacha_key] = [link]
    return grouped
async def precedents_with_halachot_and_incoming_citations() -> list[str]:
    """Return the corroboration backfill target set.

    These are the ``case_law`` ids (as text) that have at least one halacha
    AND at least one incoming citation in either graph
    (``precedent_internal_citations`` or ``case_law_citations``).
    """
    sql = (
        "SELECT c.id::text FROM case_law c "
        "WHERE EXISTS (SELECT 1 FROM halachot h WHERE h.case_law_id=c.id) "
        "  AND (EXISTS (SELECT 1 FROM precedent_internal_citations p "
        "               WHERE p.cited_case_law_id=c.id) "
        "       OR EXISTS (SELECT 1 FROM case_law_citations cc "
        "                  WHERE cc.case_law_id=c.id))"
    )
    pool = await get_pool()
    targets: list[str] = []
    for rec in await pool.fetch(sql):
        targets.append(rec["id"])
    return targets
async def nearest_halacha_for_vector(case_law_id: UUID, vec: list[float]) -> tuple[str, float] | None:
    """Best-matching halacha of `case_law_id` for a context embedding (cosine).

    Only halachot with a stored embedding are considered. Returns
    ``(halacha_id, similarity)`` where similarity is ``1 - cosine distance``,
    or ``None`` when the precedent has no embedded halachot.
    """
    sql = (
        "SELECT id::text AS id, 1 - (embedding <=> $2) AS sim "
        "FROM halachot WHERE case_law_id = $1 AND embedding IS NOT NULL "
        "ORDER BY embedding <=> $2 LIMIT 1"
    )
    pool = await get_pool()
    best = await pool.fetchrow(sql, case_law_id, vec)
    if not best:
        return None
    return (best["id"], float(best["sim"]))
async def incoming_citations_for_precedent(case_law_id: UUID) -> list[dict]:
    """All incoming citations (both graphs) with their context + source id.

    Rows from ``precedent_internal_citations`` carry ``citing_case_law_id``
    (``citing_decision_id`` is NULL); rows from ``case_law_citations`` carry
    ``citing_decision_id`` (``citing_case_law_id`` is NULL). Every row has
    ``source_id`` (the citation row id as text) and ``context``.
    """
    sql = (
        "SELECT id::text AS source_id, source_case_law_id::text AS citing_case_law_id, "
        "       NULL::text AS citing_decision_id, match_context AS context "
        "FROM precedent_internal_citations WHERE cited_case_law_id = $1 "
        "UNION ALL "
        "SELECT id::text, NULL, decision_id::text, context_text "
        "FROM case_law_citations WHERE case_law_id = $1"
    )
    pool = await get_pool()
    citations = await pool.fetch(sql, case_law_id)
    return list(map(dict, citations))
async def store_corroboration(
    halacha_id: str,
    source_id: str,
    citing_case_law_id,
    citing_decision_id,
    treatment: str,
    score: float,
    context: str,
) -> None:
    """Upsert one halacha ↔ citing-source corroboration link.

    The row is keyed by ``(halacha_id, source_citation_id)``. On conflict only
    ``treatment`` and ``match_score`` are overwritten; the citing ids and
    ``match_context`` of the existing row are kept.

    String ids are converted to ``UUID`` because asyncpg requires UUID objects
    for uuid-typed columns. For the two optional citing ids, ``None`` and
    other falsy values are passed through unchanged.
    """

    def required_uuid(value):
        if isinstance(value, str):
            return UUID(value)
        return value

    def optional_uuid(value):
        # Falsy values (None, "") are deliberately passed through untouched.
        if value and isinstance(value, str):
            return UUID(value)
        return value

    pool = await get_pool()
    bound = (
        required_uuid(halacha_id),
        optional_uuid(citing_case_law_id),
        optional_uuid(citing_decision_id),
        required_uuid(source_id),
        treatment,
        score,
        context,
    )
    await pool.execute(
        "INSERT INTO halacha_citation_corroboration "
        "(halacha_id, citing_case_law_id, citing_decision_id, source_citation_id, treatment, match_score, match_context) "
        "VALUES ($1,$2,$3,$4,$5,$6,$7) "
        "ON CONFLICT (halacha_id, source_citation_id) DO UPDATE SET "
        "treatment=EXCLUDED.treatment, match_score=EXCLUDED.match_score",
        *bound,
    )
async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]:
    """Return all corroboration rows for one halacha, best match first.

    Rows are ordered by ``match_score`` descending. ``match_score`` is
    converted to ``float`` and ``created_at`` to an ISO string; both become
    ``None`` when the stored value is missing. The citing ids are returned as
    text.
    """
    sql = (
        "SELECT treatment, match_score, match_context, citing_case_law_id::text, "
        "       citing_decision_id::text, created_at "
        "FROM halacha_citation_corroboration WHERE halacha_id = $1 "
        "ORDER BY match_score DESC"
    )
    pool = await get_pool()
    links: list[dict] = []
    for rec in await pool.fetch(sql, halacha_id):
        raw_score = rec["match_score"]
        created = rec["created_at"]
        links.append({
            "treatment": rec["treatment"],
            "match_score": None if raw_score is None else float(raw_score),
            "match_context": rec["match_context"],
            "citing_case_law_id": rec["citing_case_law_id"],
            "citing_decision_id": rec["citing_decision_id"],
            "created_at": created.isoformat() if created else None,
        })
    return links
async def search_precedent_library_semantic(
    query_embedding: list[float],
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
    subject_tag: str = "",
    limit: int = 10,
    include_halachot: bool = True,
    source_kind: str = "external_upload",
    district: str = "",
    chair_name: str = "",
) -> list[dict]:
    """Semantic search over precedents filtered by source_kind.

    source_kind='external_upload'    → court rulings (default)
    source_kind='internal_committee' → appeals-committee decisions

    Returns merged halachot + chunks, sorted by score (descending) and capped
    at ``limit``. Halachot are pre-distilled rules, so they get a small
    confidence-scaled score boost. Only ``approved`` / ``published`` halachot
    are visible (per chair-review policy). Chunks are visible regardless of
    halacha review status. Only ``searchable`` precedents are considered.

    Empty-string filters (and ``is_binding=None``) are ignored.
    ``subject_tag`` constrains halachot only; ``practice_area`` matches
    ``h.practice_areas`` membership for halachot and ``cl.practice_area``
    equality for chunks. Rows are tagged ``type`` = "halacha" / "passage".
    """
    pool = await get_pool()

    # Fixed positional parameters shared by both queries:
    #   $1 = query embedding, $2 = limit, $3 = source_kind.
    # ``source_kind`` is bound as a parameter rather than interpolated into
    # the SQL text, so a caller-supplied value can never alter the statement.
    h_params: list = [query_embedding, limit, source_kind]
    c_params: list = [query_embedding, limit, source_kind]
    halacha_filters = [
        "h.review_status IN ('approved', 'published')",
        "cl.source_kind = $3",
        "cl.searchable = true",
    ]
    chunk_filters = ["cl.source_kind = $3", "cl.searchable = true"]

    def add_filter(value, halacha_tpl: str, chunk_tpl: str | None = None) -> None:
        # ``{p}`` becomes the placeholder of the value just appended; each
        # query numbers its own parameters independently.
        h_params.append(value)
        halacha_filters.append(halacha_tpl.format(p=f"${len(h_params)}"))
        if chunk_tpl is not None:
            c_params.append(value)
            chunk_filters.append(chunk_tpl.format(p=f"${len(c_params)}"))

    def add_case_law_equals(column: str, value) -> None:
        # Same equality predicate on the shared case_law row for both queries.
        predicate = f"cl.{column} = {{p}}"
        add_filter(value, predicate, predicate)

    if practice_area:
        add_filter(
            practice_area,
            "{p} = ANY(h.practice_areas)",
            "cl.practice_area = {p}",
        )
    if court:
        add_filter(f"%{court}%", "cl.court ILIKE {p}", "cl.court ILIKE {p}")
    if precedent_level:
        add_case_law_equals("precedent_level", precedent_level)
    if appeal_subtype:
        add_case_law_equals("appeal_subtype", appeal_subtype)
    if is_binding is not None:
        add_case_law_equals("is_binding", is_binding)
    if subject_tag:
        # Tags live on halachot only; chunks are not filtered by tag.
        add_filter(subject_tag, "{p} = ANY(h.subject_tags)")
    if district:
        add_case_law_equals("district", district)
    if chair_name:
        add_case_law_equals("chair_name", chair_name)

    halacha_sql = f"""
        SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
               h.reasoning_summary, h.supporting_quote, h.page_reference,
               h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.chair_name, cl.district,
               1 - (h.embedding <=> $1) AS score
        FROM halachot h
        JOIN case_law cl ON cl.id = h.case_law_id
        WHERE {' AND '.join(halacha_filters)}
          AND h.embedding IS NOT NULL
        ORDER BY h.embedding <=> $1
        LIMIT $2
    """
    # Parent-doc retrieval (V17 / TaskMaster #48): the LEFT JOIN
    # surfaces each chunk's parent_chunk's content alongside it. When
    # ``config.PARENT_DOC_RETRIEVAL_ENABLED`` is true *and* the row has
    # a non-null parent, the post-processing loop swaps in the parent's
    # content so the writer sees the broader passage instead of the
    # 300-token sliver that matched. Legacy rows (parent_chunk_id NULL)
    # are unaffected — the JOIN returns NULL parent_* and the swap is a
    # no-op. Index ``idx_precedent_chunks_role`` is not used here
    # intentionally: filtering on chunk_role='child' would exclude
    # legacy single-tier rows that default to 'child' but have no
    # parent; an embedding-IS-NOT-NULL filter is equivalent because
    # parents store NULL embeddings.
    chunk_sql = f"""
        SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
               pc.section_type, pc.page_number,
               pc.parent_chunk_id,
               parent.content AS parent_content,
               parent.section_type AS parent_section_type,
               parent.page_number AS parent_page_number,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
               1 - (pc.embedding <=> $1) AS score
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        LEFT JOIN precedent_chunks parent
               ON parent.id = pc.parent_chunk_id
        WHERE {' AND '.join(chunk_filters)}
          AND pc.embedding IS NOT NULL
          -- #55: exclude tiny fragment chunks (artifacts of pre-fix
          -- mid-sentence header splits) that carry no retrievable signal.
          AND length(trim(pc.content)) >= 50
        ORDER BY pc.embedding <=> $1
        LIMIT $2
    """
    results: list[dict] = []
    if include_halachot:
        rows = await pool.fetch(halacha_sql, *h_params)
        for r in rows:
            d = dict(r)
            if d.get("decision_date") is not None:
                d["decision_date"] = d["decision_date"].isoformat()
            # Dynamic rule-level boost: scales with extractor confidence
            # so high-conf halachot rank higher than low-conf ones.
            # conf=0.78 → +0.047, conf=0.90 → +0.054, conf=0.95 → +0.057
            # Calibrated so the average (≈0.85) stays at +0.05 (legacy value).
            _conf = float(d.get("confidence") or 0.0)
            d["score"] = float(d["score"]) + max(_conf * 0.06, 0.0)
            d["type"] = "halacha"
            results.append(d)
    rows = await pool.fetch(chunk_sql, *c_params)
    for r in rows:
        d = dict(r)
        if d.get("decision_date") is not None:
            d["decision_date"] = d["decision_date"].isoformat()
        d["score"] = float(d["score"])
        d["type"] = "passage"
        _maybe_swap_parent(d)
        results.append(d)
    results.sort(key=lambda x: x["score"], reverse=True)
    # Dedupe: when multiple child hits share the same parent, we'd
    # otherwise return duplicate parent content. Keep the highest-
    # scoring hit per parent (skip if parent swap disabled or row has
    # no parent — chunk_id alone remains unique).
    return _dedupe_by_parent(results, limit)
def _maybe_swap_parent(row: dict) -> None:
    """Promote the parent chunk's content into ``row`` (mutates in place).

    The ``parent_*`` keys produced by the LEFT JOIN are always removed —
    they are an implementation detail of the swap. The swap itself happens
    only when ``config.PARENT_DOC_RETRIEVAL_ENABLED`` is on, the row has a
    non-NULL ``parent_chunk_id`` and the parent content is non-empty. In that
    case the originally matched values are kept under ``child_content`` /
    ``child_section_type`` / ``child_page_number`` and ``parent_swap`` is set
    to True.
    """
    promoted_text = row.pop("parent_content", None)
    promoted_section = row.pop("parent_section_type", None)
    promoted_page = row.pop("parent_page_number", None)

    if not config.PARENT_DOC_RETRIEVAL_ENABLED:
        return
    if row.get("parent_chunk_id") is None or not promoted_text:
        return

    # Remember what actually matched before overwriting it.
    row["child_content"] = row.get("content")
    row["child_section_type"] = row.get("section_type")
    row["child_page_number"] = row.get("page_number")

    row["content"] = promoted_text
    # Parent's section_type is authoritative for the swapped row
    # (children inherit from their parent, but a parent that spans
    # a boundary uses its first section's type — same convention).
    if promoted_section:
        row["section_type"] = promoted_section
    if promoted_page is not None:
        row["page_number"] = promoted_page
    row["parent_swap"] = True
def _dedupe_by_parent(rows: list[dict], limit: int) -> list[dict]:
    """Collapse swapped children that share a parent, then cap at ``limit``.

    With parent-doc retrieval disabled this is a plain ``rows[:limit]``.
    Otherwise ``rows`` is walked in the given order and, for rows marked
    ``parent_swap`` with a ``parent_chunk_id``, only the first row per parent
    is kept (callers pass rows sorted best-first, so the highest-scored child
    wins). Rows without a swapped parent (legacy chunks, halachot) pass
    through unchanged. Iteration stops once ``limit`` rows were collected.
    """
    if not config.PARENT_DOC_RETRIEVAL_ENABLED:
        return rows[:limit]

    kept: list[dict] = []
    claimed_parents: set = set()
    for row in rows:
        parent_id = row.get("parent_chunk_id")
        is_swapped = bool(parent_id and row.get("parent_swap"))
        if is_swapped and parent_id in claimed_parents:
            continue
        if is_swapped:
            claimed_parents.add(parent_id)
        kept.append(row)
        if len(kept) >= limit:
            break
    return kept
async def search_precedent_library_lexical(
    *,
    query: str,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
    subject_tag: str = "",
    source_kind: str = "external_upload",
    district: str = "",
    chair_name: str = "",
    limit: int = 30,
    include_halachot: bool = True,
) -> list[dict]:
    """Lexical (BM25-like) search via ``ts_rank_cd`` over ``content_tsv``
    and ``rule_tsv`` (V12 columns).

    Mirrors the filter set of :func:`search_precedent_library_semantic`
    so the two layers can be fused 1:1 by rank in
    :mod:`hybrid_search` via RRF.

    Why ``plainto_tsquery``: it accepts free-text input, lowercases, and
    AND-joins the terms — matches the bi-encoder's "all words contribute"
    assumption better than ``websearch_to_tsquery`` (which inserts ORs).
    Empty / stopword-only queries return zero rows (no error).

    Why ``ts_rank_cd``: cover density variant — rewards documents where
    the query terms appear close together (e.g. "1461/20 אנטרים" matches
    the same paragraph). Higher is more relevant.

    A row also matches when the precedent's ``meta_tsv`` matches the query;
    such rows get +1.0 added to their score. A blank ``query`` returns ``[]``
    without touching the database. Rows are tagged ``type`` = "halacha" /
    "passage", sorted by score (descending) and capped at ``limit``.
    """
    if not (query or "").strip():
        return []
    pool = await get_pool()

    # Fixed positional parameters shared by both queries:
    #   $1 = query text, $2 = limit, $3 = source_kind.
    # ``source_kind`` is bound as a parameter rather than interpolated into
    # the SQL text, so a caller-supplied value can never alter the statement.
    h_params: list = [query, limit, source_kind]
    c_params: list = [query, limit, source_kind]
    halacha_filters = [
        "h.review_status IN ('approved', 'published')",
        "cl.source_kind = $3",
        "cl.searchable = true",
    ]
    chunk_filters = ["cl.source_kind = $3", "cl.searchable = true"]

    def add_filter(value, halacha_tpl: str, chunk_tpl: str | None = None) -> None:
        # ``{p}`` becomes the placeholder of the value just appended; each
        # query numbers its own parameters independently.
        h_params.append(value)
        halacha_filters.append(halacha_tpl.format(p=f"${len(h_params)}"))
        if chunk_tpl is not None:
            c_params.append(value)
            chunk_filters.append(chunk_tpl.format(p=f"${len(c_params)}"))

    def add_case_law_equals(column: str, value) -> None:
        # Same equality predicate on the shared case_law row for both queries.
        predicate = f"cl.{column} = {{p}}"
        add_filter(value, predicate, predicate)

    if practice_area:
        add_filter(
            practice_area,
            "{p} = ANY(h.practice_areas)",
            "cl.practice_area = {p}",
        )
    if court:
        add_filter(f"%{court}%", "cl.court ILIKE {p}", "cl.court ILIKE {p}")
    if precedent_level:
        add_case_law_equals("precedent_level", precedent_level)
    if appeal_subtype:
        add_case_law_equals("appeal_subtype", appeal_subtype)
    if is_binding is not None:
        add_case_law_equals("is_binding", is_binding)
    if subject_tag:
        # Tags live on halachot only; chunks are not filtered by tag.
        add_filter(subject_tag, "{p} = ANY(h.subject_tags)")
    if district:
        add_case_law_equals("district", district)
    if chair_name:
        add_case_law_equals("chair_name", chair_name)

    halacha_sql = f"""
        SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
               h.reasoning_summary, h.supporting_quote, h.page_reference,
               h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.chair_name, cl.district,
               GREATEST(
                   ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)),
                   ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
               )
               + CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
                      THEN 1.0 ELSE 0.0 END AS score
        FROM halachot h
        JOIN case_law cl ON cl.id = h.case_law_id
        WHERE {' AND '.join(halacha_filters)}
          AND (h.rule_tsv @@ plainto_tsquery('simple', $1)
               OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
        ORDER BY score DESC
        LIMIT $2
    """
    # Parent-doc retrieval (V17) — same LEFT JOIN strategy as the
    # semantic side. The tsvector match still runs over the child's
    # ``content_tsv``; only the *returned* content is promoted to the
    # parent when the flag is on and a parent exists. See
    # :func:`search_precedent_library_semantic` for the rationale.
    # We intentionally restrict matching to chunks with an embedding
    # (i.e. children + legacy single-tier rows). Hierarchical parents
    # store NULL embeddings, so even though their ``content_tsv`` is
    # populated they're excluded here — preventing a parent from
    # matching directly and then being "swapped" with itself.
    chunk_sql = f"""
        SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
               pc.section_type, pc.page_number,
               pc.parent_chunk_id,
               parent.content AS parent_content,
               parent.section_type AS parent_section_type,
               parent.page_number AS parent_page_number,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
               GREATEST(
                   ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)),
                   ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
               )
               + CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
                      THEN 1.0 ELSE 0.0 END AS score
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        LEFT JOIN precedent_chunks parent
               ON parent.id = pc.parent_chunk_id
        WHERE {' AND '.join(chunk_filters)}
          AND pc.embedding IS NOT NULL
          -- #55: exclude tiny fragment chunks (see semantic query above).
          AND length(trim(pc.content)) >= 50
          AND (pc.content_tsv @@ plainto_tsquery('simple', $1)
               OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
        ORDER BY score DESC
        LIMIT $2
    """
    results: list[dict] = []
    if include_halachot:
        rows = await pool.fetch(halacha_sql, *h_params)
        for r in rows:
            d = dict(r)
            if d.get("decision_date") is not None:
                d["decision_date"] = d["decision_date"].isoformat()
            d["score"] = float(d["score"])
            d["type"] = "halacha"
            results.append(d)
    rows = await pool.fetch(chunk_sql, *c_params)
    for r in rows:
        d = dict(r)
        if d.get("decision_date") is not None:
            d["decision_date"] = d["decision_date"].isoformat()
        d["score"] = float(d["score"])
        d["type"] = "passage"
        _maybe_swap_parent(d)
        results.append(d)
    results.sort(key=lambda x: x["score"], reverse=True)
    return _dedupe_by_parent(results, limit)
async def precedent_library_stats() -> dict:
    """Aggregate stats for the /precedents stats tab.

    Returns the total number of precedents, their breakdown by practice area
    and by precedent level (largest group first), and halachot counts: total,
    ``pending_review``, and approved (``approved`` + ``published``).
    """

    def breakdown(rows, column: str) -> list[dict]:
        return [{column: rec[column], "count": int(rec["n"])} for rec in rows]

    pool = await get_pool()
    async with pool.acquire() as conn:
        precedent_count = await conn.fetchval("SELECT COUNT(*) FROM case_law")
        practice_rows = await conn.fetch(
            """SELECT practice_area, COUNT(*) AS n
               FROM case_law
               GROUP BY practice_area
               ORDER BY n DESC"""
        )
        level_rows = await conn.fetch(
            """SELECT precedent_level, COUNT(*) AS n
               FROM case_law
               GROUP BY precedent_level
               ORDER BY n DESC"""
        )
        all_halachot = await conn.fetchval("SELECT COUNT(*) FROM halachot")
        pending_halachot = await conn.fetchval(
            "SELECT COUNT(*) FROM halachot WHERE review_status = 'pending_review'"
        )
        approved_halachot = await conn.fetchval(
            "SELECT COUNT(*) FROM halachot WHERE review_status IN ('approved', 'published')"
        )

    stats = {
        "precedents_total": int(precedent_count or 0),
        "by_practice_area": breakdown(practice_rows, "practice_area"),
        "by_precedent_level": breakdown(level_rows, "precedent_level"),
        "halachot_total": int(all_halachot or 0),
        "halachot_pending": int(pending_halachot or 0),
        "halachot_approved": int(approved_halachot or 0),
    }
    return stats
# ── V8: extraction request queue helpers ─────────────────────────
async def request_metadata_extraction(case_law_id: UUID) -> bool:
    """Queue a precedent for metadata extraction by the local MCP worker.

    Stamps ``metadata_extraction_requested_at`` with the current time and
    resets ``metadata_extraction_status`` to 'pending'. Returns False if the
    row is missing.

    Originally restricted to ``source_kind='external_upload'`` (see git
    blame). Opened to all source kinds 2026-05-06 — internal_committee rows
    can also need re-extraction (e.g. corrupted subject_tags from an early
    ingest pipeline). Per the original note, the extractor
    (``precedent_metadata_extractor.extract_and_apply``) only fills empty
    fields, so user values are preserved.
    """
    # The status is reset together with the timestamp so that a re-request
    # after an earlier 'completed'/'failed' run shows the queued badge again
    # in the UI rather than a stale terminal one.
    sql = (
        "UPDATE case_law SET metadata_extraction_requested_at = now(), "
        "metadata_extraction_status = 'pending' "
        "WHERE id = $1"
    )
    pool = await get_pool()
    status_tag = await pool.execute(sql, case_law_id)
    return status_tag == "UPDATE 1"
async def request_halacha_extraction(case_law_id: UUID) -> bool:
    """Queue a precedent for halacha extraction.

    Stamps ``halacha_extraction_requested_at`` with the current time; no
    status column is touched here. Returns False if the row is missing. See
    the note on :func:`request_metadata_extraction` re: opening to all source
    kinds.
    """
    sql = (
        "UPDATE case_law SET halacha_extraction_requested_at = now() "
        "WHERE id = $1"
    )
    pool = await get_pool()
    status_tag = await pool.execute(sql, case_law_id)
    return status_tag == "UPDATE 1"
async def list_pending_extraction_requests(
    kind: str = "metadata",  # 'metadata' | 'halacha'
    limit: int = 20,
) -> list[dict]:
    """Return rows requesting extraction, oldest request first.

    ``kind='metadata'`` reads ``metadata_extraction_requested_at``; any other
    value reads ``halacha_extraction_requested_at``. At most ``limit`` rows
    are returned, with ``date`` and ``requested_at`` as ISO strings.

    The MCP worker drains the queue in order: process → clear timestamp.
    """
    # The column name comes from this fixed two-way choice, never from input.
    if kind == "metadata":
        requested_col = "metadata_extraction_requested_at"
    else:
        requested_col = "halacha_extraction_requested_at"

    pool = await get_pool()
    # No ``source_kind = 'external_upload'`` filter on purpose: with it,
    # internal_committee rows could be stamped (request_metadata_extraction /
    # request_halacha_extraction accept them) yet stay invisible to the
    # worker forever.
    pending = await pool.fetch(
        f"""SELECT id, case_number, case_name, court, date,
                   practice_area, is_binding, {requested_col} AS requested_at
            FROM case_law
            WHERE {requested_col} IS NOT NULL
            ORDER BY {requested_col} ASC
            LIMIT $1""",
        limit,
    )
    queue: list[dict] = []
    for rec in pending:
        item = dict(rec)
        for temporal_key in ("date", "requested_at"):
            stamp = item.get(temporal_key)
            if stamp is not None:
                item[temporal_key] = stamp.isoformat()
        queue.append(item)
    return queue
async def extraction_queue_status() -> dict:
    """Pending-extraction queue depth per kind (INV-TOOL4 visibility / GAP-45).

    Surfaces the otherwise-hidden queue that ``process_pending_extractions``
    drains. For each of ``metadata`` and ``halacha`` it reports ``pending``
    (how many case_law rows still carry a request timestamp) and
    ``oldest_request`` (ISO timestamp of the oldest one, or ``None``).
    Read-only — does not drain.
    """

    def summarize(rec) -> dict:
        oldest = rec["oldest"]
        oldest_iso = oldest.isoformat() if oldest else None
        return {"pending": rec["n"], "oldest_request": oldest_iso}

    pool = await get_pool()
    async with pool.acquire() as conn:
        metadata_row = await conn.fetchrow(
            "SELECT COUNT(*) AS n, MIN(metadata_extraction_requested_at) AS oldest "
            "FROM case_law WHERE metadata_extraction_requested_at IS NOT NULL"
        )
        halacha_row = await conn.fetchrow(
            "SELECT COUNT(*) AS n, MIN(halacha_extraction_requested_at) AS oldest "
            "FROM case_law WHERE halacha_extraction_requested_at IS NOT NULL"
        )

    status = {"metadata": summarize(metadata_row)}
    status["halacha"] = summarize(halacha_row)
    return status
async def clear_extraction_request(
    case_law_id: UUID, kind: str = "metadata",
) -> None:
    """Remove a precedent from an extraction queue.

    Sets the request timestamp back to NULL: the metadata column when
    ``kind == "metadata"``, the halacha column for any other value.
    """
    # The column name comes from this fixed two-way choice, never from input.
    if kind == "metadata":
        requested_col = "metadata_extraction_requested_at"
    else:
        requested_col = "halacha_extraction_requested_at"
    sql = f"UPDATE case_law SET {requested_col} = NULL WHERE id = $1"
    pool = await get_pool()
    await pool.execute(sql, case_law_id)
# ── V9: Multimodal page image embeddings ─────────────────────────
async def store_document_image_embeddings(
    document_id: UUID,
    case_id: UUID | None,
    page_records: list[dict],
    model_name: str = "voyage-multimodal-3",
) -> int:
    """Atomically replace the per-page image embeddings of a document.

    Each ``page_records`` entry: ``{page_number, embedding, image_thumbnail_path}``.
    ``page_number`` is required; the other two keys are optional and stored
    as NULL when absent. Embeddings should already be 1024-dim lists (or
    None for skipped pages).

    The delete of the old rows and the insert of the new ones run inside a
    single transaction, so a failure on any page (bad record, DB error)
    rolls everything back and the previously stored embeddings survive.

    Returns the number of page records written (``len(page_records)``).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Without the transaction a mid-loop failure would leave the
        # document with no (or only some) embeddings.
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM document_image_embeddings WHERE document_id = $1",
                document_id,
            )
            for r in page_records:
                await conn.execute(
                    """INSERT INTO document_image_embeddings
                           (document_id, case_id, page_number, embedding,
                            image_thumbnail_path, model_name)
                       VALUES ($1, $2, $3, $4, $5, $6)""",
                    document_id, case_id,
                    r["page_number"],
                    r.get("embedding"),
                    r.get("image_thumbnail_path"),
                    model_name,
                )
    return len(page_records)
async def store_precedent_image_embeddings(
    case_law_id: UUID,
    page_records: list[dict],
    model_name: str = "voyage-multimodal-3",
) -> int:
    """Atomically replace the per-page image embeddings of a precedent.

    Each ``page_records`` entry: ``{page_number, embedding, image_thumbnail_path}``.
    ``page_number`` is required; the other two keys are optional and stored
    as NULL when absent.

    The delete of the old rows and the insert of the new ones run inside a
    single transaction, so a failure on any page rolls everything back and
    the previously stored embeddings survive.

    Returns the number of page records written (``len(page_records)``).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Without the transaction a mid-loop failure would leave the
        # precedent with no (or only some) embeddings.
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM precedent_image_embeddings WHERE case_law_id = $1",
                case_law_id,
            )
            for r in page_records:
                await conn.execute(
                    """INSERT INTO precedent_image_embeddings
                           (case_law_id, page_number, embedding,
                            image_thumbnail_path, model_name)
                       VALUES ($1, $2, $3, $4, $5)""",
                    case_law_id,
                    r["page_number"],
                    r.get("embedding"),
                    r.get("image_thumbnail_path"),
                    model_name,
                )
    return len(page_records)
async def search_document_images_similar(
    query_embedding: list[float],
    limit: int = 10,
    case_id: UUID | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Cosine search over per-page image embeddings of case documents.

    Optional filters (applied only when truthy): ``case_id``,
    ``practice_area``, ``appeal_subtype``. Returns at most ``limit`` dicts
    ordered by ascending cosine distance, each with ``document_id``,
    ``case_id``, ``page_number``, ``image_thumbnail_path``,
    ``document_title``, ``case_number`` and ``score`` (1 - distance).

    Pages stored without an embedding (skipped pages) are excluded: their
    distance is NULL, so they would otherwise come back with
    ``score = None`` and break callers that do ``float(row["score"])``.
    """
    pool = await get_pool()
    # $1 = query vector, $2 = limit; filter placeholders follow.
    conditions: list[str] = ["die.embedding IS NOT NULL"]
    params: list = [query_embedding, limit]
    if case_id:
        params.append(case_id)
        conditions.append(f"die.case_id = ${len(params)}")
    if practice_area:
        params.append(practice_area)
        conditions.append(f"c.practice_area = ${len(params)}")
    if appeal_subtype:
        params.append(appeal_subtype)
        conditions.append(f"c.appeal_subtype = ${len(params)}")
    where = " AND ".join(conditions)
    # NOTE(review): the inner JOIN on cases drops image rows whose case_id
    # is NULL — confirm that is intended.
    sql = f"""
        SELECT die.document_id, die.case_id, die.page_number,
               die.image_thumbnail_path,
               d.title AS document_title,
               c.case_number,
               1 - (die.embedding <=> $1) AS score
        FROM document_image_embeddings die
        JOIN documents d ON d.id = die.document_id
        JOIN cases c ON c.id = die.case_id
        WHERE {where}
        ORDER BY die.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)
    return [dict(r) for r in rows]
async def search_precedent_images_similar(
    query_embedding: list[float],
    limit: int = 10,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
) -> list[dict]:
    """Cosine search over per-page image embeddings of precedent rulings.

    Only ``case_law`` rows with ``source_kind = 'external_upload'`` are
    searched. Optional filters: ``practice_area``, ``precedent_level``,
    ``appeal_subtype`` (exact match when non-empty), ``court``
    (case-insensitive substring) and ``is_binding`` (applied when not None).

    Returns at most ``limit`` dicts ordered by ascending cosine distance,
    with ``score`` = 1 - distance and ``decision_date`` as an ISO string.

    Pages stored without an embedding are excluded: their distance is NULL,
    so they would otherwise come back with ``score = None`` and break
    callers that do ``float(row["score"])``.
    """
    pool = await get_pool()
    # $1 = query vector, $2 = limit; filter placeholders follow.
    conditions: list[str] = [
        "cl.source_kind = 'external_upload'",
        "pie.embedding IS NOT NULL",
    ]
    params: list = [query_embedding, limit]
    if practice_area:
        params.append(practice_area)
        conditions.append(f"cl.practice_area = ${len(params)}")
    if court:
        params.append(f"%{court}%")
        conditions.append(f"cl.court ILIKE ${len(params)}")
    if precedent_level:
        params.append(precedent_level)
        conditions.append(f"cl.precedent_level = ${len(params)}")
    if appeal_subtype:
        params.append(appeal_subtype)
        conditions.append(f"cl.appeal_subtype = ${len(params)}")
    if is_binding is not None:
        params.append(is_binding)
        conditions.append(f"cl.is_binding = ${len(params)}")
    where = " AND ".join(conditions)
    sql = f"""
        SELECT pie.case_law_id, pie.page_number, pie.image_thumbnail_path,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.practice_area,
               1 - (pie.embedding <=> $1) AS score
        FROM precedent_image_embeddings pie
        JOIN case_law cl ON cl.id = pie.case_law_id
        WHERE {where}
        ORDER BY pie.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)
    out = []
    for r in rows:
        d = dict(r)
        # Dates are returned as ISO strings.
        if d.get("decision_date") is not None:
            d["decision_date"] = d["decision_date"].isoformat()
        out.append(d)
    return out
async def search_similar_hybrid(
    query_text_embedding: list[float],
    query_image_embedding: list[float],
    limit: int = 10,
    fetch_k: int = 30,
    text_weight: float = 0.65,
    case_id: UUID | None = None,
    section_type: str | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Blend text-chunk search with per-page image search.

    Up to ``fetch_k`` candidates are fetched from each search. A text chunk
    whose (document_id, page_number) also has an image hit is scored
    ``text * text_weight + image * (1 - text_weight)`` and tagged
    ``match_type='text+image'``; other text chunks get an image score of 0
    and ``match_type='text'``. Image hits on pages without any text chunk
    are appended as ``match_type='image'`` so dense scanned content still
    surfaces. The top ``limit`` results by blended score are returned.
    """
    image_weight = 1.0 - text_weight

    text_hits = await search_similar(
        query_text_embedding, limit=fetch_k, case_id=case_id,
        section_type=section_type, practice_area=practice_area,
        appeal_subtype=appeal_subtype,
    )
    image_hits = await search_document_images_similar(
        query_image_embedding, limit=fetch_k, case_id=case_id,
        practice_area=practice_area, appeal_subtype=appeal_subtype,
    )

    # Index image hits by (document id, page) for lookup from text chunks.
    image_lookup: dict[tuple, dict] = {}
    for hit in image_hits:
        image_lookup[(str(hit["document_id"]), hit["page_number"])] = hit

    matched_pages: set = set()
    combined: list[dict] = []

    for hit in text_hits:
        page_no = hit.get("page_number")
        page_key = None
        page_image = None
        if page_no is not None:
            page_key = (str(hit["document_id"]), page_no)
            page_image = image_lookup.get(page_key)

        text_part = float(hit["score"])
        image_part = float(page_image["score"]) if page_image else 0.0

        entry = dict(hit)
        entry.update(
            text_score=text_part,
            image_score=image_part,
            score=text_part * text_weight + image_part * image_weight,
            match_type="text+image" if page_image else "text",
        )
        if page_image:
            entry["image_thumbnail_path"] = page_image.get("image_thumbnail_path")
        combined.append(entry)
        if page_key is not None:
            matched_pages.add(page_key)

    # Image-only pages: no text chunk landed on them.
    for hit in image_hits:
        if (str(hit["document_id"]), hit["page_number"]) in matched_pages:
            continue
        image_part = float(hit["score"])
        combined.append({
            **hit,
            "text_score": 0.0,
            "image_score": image_part,
            "score": image_part * image_weight,
            "match_type": "image",
            "content": "",
            "section_type": "image",
        })

    combined.sort(key=lambda item: item["score"], reverse=True)
    return combined[:limit]
async def search_precedent_library_hybrid(
    query_text_embedding: list[float],
    query_image_embedding: list[float],
    limit: int = 10,
    fetch_k: int = 30,
    text_weight: float = 0.65,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
    subject_tag: str = "",
    include_halachot: bool = True,
) -> list[dict]:
    """Hybrid (text + page-image) variant of search_precedent_library_semantic.

    Text results on a page that also has an image hit are boosted by that
    page's image score. Halachot have no ``page_number`` — they are boosted
    by the best image score of any page in the same case_law row. Image
    hits with no text result on the same page are appended as
    ``type='image_page'``. Blended score is
    ``text * text_weight + image * (1 - text_weight)``; the top ``limit``
    results are returned.
    """
    image_weight = 1.0 - text_weight

    text_hits = await search_precedent_library_semantic(
        query_text_embedding,
        practice_area=practice_area, court=court,
        precedent_level=precedent_level, appeal_subtype=appeal_subtype,
        is_binding=is_binding, subject_tag=subject_tag,
        limit=fetch_k, include_halachot=include_halachot,
    )
    image_hits = await search_precedent_images_similar(
        query_image_embedding, limit=fetch_k,
        practice_area=practice_area, court=court,
        precedent_level=precedent_level, appeal_subtype=appeal_subtype,
        is_binding=is_binding,
    )

    # Two indexes over the image hits: per page, and best score per ruling.
    page_lookup: dict[tuple, dict] = {}
    best_per_case: dict[str, float] = {}
    for hit in image_hits:
        case_key = str(hit["case_law_id"])
        page_lookup[(case_key, hit["page_number"])] = hit
        best_per_case[case_key] = max(
            best_per_case.get(case_key, 0.0), float(hit["score"])
        )

    matched_pages: set = set()
    combined: list[dict] = []

    for hit in text_hits:
        case_key = str(hit["case_law_id"])
        page_no = hit.get("page_number")
        page_key = (case_key, page_no) if page_no is not None else None
        page_image = page_lookup.get(page_key) if page_key is not None else None

        if page_image:
            image_part = float(page_image["score"])
        elif hit.get("type") == "halacha":
            # No page of its own: borrow the ruling's best page score.
            image_part = best_per_case.get(case_key, 0.0)
        else:
            image_part = 0.0
        text_part = float(hit["score"])

        entry = dict(hit)
        entry.update(
            text_score=text_part,
            image_score=image_part,
            score=text_part * text_weight + image_part * image_weight,
        )
        if page_image:
            entry["image_thumbnail_path"] = page_image.get("image_thumbnail_path")
        if page_key is not None:
            matched_pages.add(page_key)
        combined.append(entry)

    # Image-only pages: no text result landed on them.
    for hit in image_hits:
        if (str(hit["case_law_id"]), hit["page_number"]) in matched_pages:
            continue
        image_part = float(hit["score"])
        combined.append({
            **hit,
            "text_score": 0.0,
            "image_score": image_part,
            "score": image_part * image_weight,
            "type": "image_page",
            "content": "",
            "section_type": "image",
        })

    combined.sort(key=lambda item: item["score"], reverse=True)
    return combined[:limit]
# ── Missing precedents (V13) ───────────────────────────────────────
# Track citations from party briefs that aren't yet in the corpus.
# Lifecycle: researcher logs the gap (status='open') → chair uploads the
# decision (status='uploaded', file ingested) → the row is linked to a
# case_law row (status='closed'). 'irrelevant' = chair decided the
# citation isn't worth adding to the library.
# Accepted values for missing_precedents.cited_by_party.
ALLOWED_MP_PARTIES = {
    "appellant",
    "respondent",
    "committee",
    "permit_applicant",
    "unknown",
}

# Accepted values for missing_precedents.status.
ALLOWED_MP_STATUS = {
    "open",
    "uploaded",
    "closed",
    "irrelevant",
}
def _row_to_missing_precedent(row: asyncpg.Record) -> dict:
d = dict(row)
d["id"] = str(d["id"])
if d.get("cited_in_case_id") is not None:
d["cited_in_case_id"] = str(d["cited_in_case_id"])
if d.get("cited_in_document_id") is not None:
d["cited_in_document_id"] = str(d["cited_in_document_id"])
if d.get("linked_case_law_id") is not None:
d["linked_case_law_id"] = str(d["linked_case_law_id"])
return d
async def create_missing_precedent(
    citation: str,
    case_name: str | None = None,
    cited_in_case_id: UUID | None = None,
    cited_in_document_id: UUID | None = None,
    cited_by_party: str | None = None,
    cited_by_party_name: str | None = None,
    legal_topic: str | None = None,
    legal_issue: str | None = None,
    claim_quote: str | None = None,
    notes: str | None = None,
) -> dict:
    """Insert a missing-precedent row (status='open' by default) and return it.

    The citation is stored with surrounding whitespace stripped.

    Raises:
        ValueError: if ``citation`` is blank, or ``cited_by_party`` is set
            to a value outside ``ALLOWED_MP_PARTIES``.
    """
    normalized_citation = citation.strip()
    if not normalized_citation:
        raise ValueError("citation is required")
    if cited_by_party and cited_by_party not in ALLOWED_MP_PARTIES:
        raise ValueError(
            f"cited_by_party must be one of {sorted(ALLOWED_MP_PARTIES)}"
        )

    insert_sql = """INSERT INTO missing_precedents (
               citation, case_name, cited_in_case_id, cited_in_document_id,
               cited_by_party, cited_by_party_name, legal_topic, legal_issue,
               claim_quote, notes
           )
           VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
           RETURNING *"""
    # Order must match the column list above.
    values = (
        normalized_citation, case_name, cited_in_case_id, cited_in_document_id,
        cited_by_party, cited_by_party_name, legal_topic, legal_issue,
        claim_quote, notes,
    )

    pool = await get_pool()
    async with pool.acquire() as conn:
        inserted = await conn.fetchrow(insert_sql, *values)
    return _row_to_missing_precedent(inserted)
async def list_missing_precedents(
    status: str | None = None,
    case_id: UUID | None = None,
    legal_topic: str | None = None,
    limit: int = 200,
    offset: int = 0,
) -> list[dict]:
    """List missing precedents with the cited-in / linked case numbers joined in.

    Filters apply only when truthy: ``status`` and ``case_id`` match
    exactly, ``legal_topic`` is a case-insensitive substring match. Rows are
    ordered open → uploaded → closed → irrelevant, newest first within each
    status, and paged with ``limit`` / ``offset``.
    """
    # (SQL prefix, raw argument, bound value); falsy raw arguments are skipped.
    candidate_filters = (
        ("mp.status =", status, status),
        ("mp.cited_in_case_id =", case_id, case_id),
        ("mp.legal_topic ILIKE", legal_topic, f"%{legal_topic}%"),
    )
    clauses: list[str] = []
    args: list = []
    for sql_prefix, raw, bound in candidate_filters:
        if raw:
            args.append(bound)
            clauses.append(f"{sql_prefix} ${len(args)}")

    where = f"WHERE {' AND '.join(clauses)}" if clauses else ""

    # Paging parameters take the two placeholders after the filters.
    limit_slot = len(args) + 1
    offset_slot = limit_slot + 1
    args.extend((limit, offset))

    sql = f"""
        SELECT mp.*,
               c.case_number AS cited_in_case_number,
               cl.case_number AS linked_case_law_number,
               cl.case_name AS linked_case_law_name
        FROM missing_precedents mp
        LEFT JOIN cases c ON c.id = mp.cited_in_case_id
        LEFT JOIN case_law cl ON cl.id = mp.linked_case_law_id
        {where}
        ORDER BY
            CASE mp.status
                WHEN 'open' THEN 0
                WHEN 'uploaded' THEN 1
                WHEN 'closed' THEN 2
                WHEN 'irrelevant' THEN 3
            END,
            mp.created_at DESC
        LIMIT ${limit_slot} OFFSET ${offset_slot}
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *args)
    return [_row_to_missing_precedent(record) for record in records]
async def get_missing_precedent(mp_id: UUID) -> dict | None:
    """Fetch one missing-precedent row by id, or None if it does not exist.

    The result includes the cited-in case number and the linked case_law
    number and name (NULL when there is no such link).
    """
    lookup_sql = """
        SELECT mp.*,
               c.case_number AS cited_in_case_number,
               cl.case_number AS linked_case_law_number,
               cl.case_name AS linked_case_law_name
        FROM missing_precedents mp
        LEFT JOIN cases c ON c.id = mp.cited_in_case_id
        LEFT JOIN case_law cl ON cl.id = mp.linked_case_law_id
        WHERE mp.id = $1
        """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(lookup_sql, mp_id)
    if not record:
        return None
    return _row_to_missing_precedent(record)
async def update_missing_precedent(mp_id: UUID, **fields) -> dict | None:
    """Patch a missing-precedent row and return the updated row.

    Allowed fields: legal_topic, legal_issue, notes, cited_by_party,
    cited_by_party_name, case_name, status, linked_case_law_id, closed_at,
    claim_quote, citation. Any other keyword is silently ignored; if no
    allowed field remains, the row is returned unchanged. ``updated_at``
    is always set to ``now()`` when something is written.

    A string ``citation`` is stripped before being stored, matching
    ``create_missing_precedent`` and the exact-match lookup in
    ``find_missing_precedent_by_citation``.

    Returns None when no row with ``mp_id`` exists.

    Raises:
        ValueError: if ``status`` is not in ``ALLOWED_MP_STATUS``, if a
            non-empty ``cited_by_party`` is not in ``ALLOWED_MP_PARTIES``,
            or if ``citation`` is a blank string.
    """
    allowed = {
        "legal_topic", "legal_issue", "notes", "cited_by_party",
        "cited_by_party_name", "case_name", "status", "linked_case_law_id",
        "closed_at", "claim_quote", "citation",
    }
    clean = {k: v for k, v in fields.items() if k in allowed}
    if not clean:
        # Nothing to patch (no fields given, or none allowed).
        return await get_missing_precedent(mp_id)

    if "status" in clean and clean["status"] not in ALLOWED_MP_STATUS:
        raise ValueError(
            f"status must be one of {sorted(ALLOWED_MP_STATUS)}"
        )
    if clean.get("cited_by_party") and \
            clean["cited_by_party"] not in ALLOWED_MP_PARTIES:
        raise ValueError(
            f"cited_by_party must be one of {sorted(ALLOWED_MP_PARTIES)}"
        )
    citation = clean.get("citation")
    if isinstance(citation, str):
        # Same normalisation/validation as row creation; non-string values
        # are passed through for the database to accept or reject.
        citation = citation.strip()
        if not citation:
            raise ValueError("citation is required")
        clean["citation"] = citation

    # Column names come from the ``allowed`` whitelist above, so formatting
    # them into the statement is safe; values are bound from $2 onward.
    set_clauses = [
        f"{column} = ${position}"
        for position, column in enumerate(clean, start=2)
    ]
    set_clauses.append("updated_at = now()")
    sql = (
        f"UPDATE missing_precedents SET {', '.join(set_clauses)} "
        f"WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(sql, mp_id, *clean.values())
    return _row_to_missing_precedent(row) if row else None
async def close_missing_precedent(
    mp_id: UUID,
    linked_case_law_id: UUID | None = None,
    notes: str | None = None,
    status: str = "closed",
) -> dict | None:
    """Set a missing-precedent row's status and stamp ``closed_at``.

    ``status`` defaults to 'closed' but may be any value in
    ``ALLOWED_MP_STATUS`` (e.g. 'uploaded' or 'irrelevant').
    ``linked_case_law_id`` and ``notes`` are written only when not None.
    Returns the updated row, or None when ``mp_id`` does not exist.

    Raises:
        ValueError: if ``status`` is not in ``ALLOWED_MP_STATUS``.
    """
    if status not in ALLOWED_MP_STATUS:
        raise ValueError(
            f"status must be one of {sorted(ALLOWED_MP_STATUS)}"
        )

    # $1 = row id, $2 = status; optional columns take the next placeholders.
    args: list = [mp_id, status]
    assignments = ["status = $2", "closed_at = now()", "updated_at = now()"]
    for column, value in (
        ("linked_case_law_id", linked_case_law_id),
        ("notes", notes),
    ):
        if value is None:
            continue
        args.append(value)
        assignments.append(f"{column} = ${len(args)}")

    statement = (
        f"UPDATE missing_precedents SET {', '.join(assignments)} "
        f"WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        updated = await conn.fetchrow(statement, *args)
    if not updated:
        return None
    return _row_to_missing_precedent(updated)
async def find_missing_precedent_by_citation(
    citation: str,
    case_id: UUID | None = None,
) -> dict | None:
    """Return one existing row whose citation matches exactly, else None.

    The citation is stripped before comparison. When ``case_id`` is given
    the match is also restricted to rows cited in that case. Used to
    deduplicate auto-creation by the researcher.
    """
    query = "SELECT * FROM missing_precedents WHERE citation = $1"
    args: list = [citation.strip()]
    if case_id is not None:
        query += " AND cited_in_case_id = $2"
        args.append(case_id)
    query += " LIMIT 1"

    pool = await get_pool()
    async with pool.acquire() as conn:
        match = await conn.fetchrow(query, *args)
    if not match:
        return None
    return _row_to_missing_precedent(match)