"""Database service - asyncpg connection pool and queries.""" from __future__ import annotations import asyncio import hashlib import json import logging import re from datetime import date from uuid import UUID, uuid4 import asyncpg from pgvector.asyncpg import register_vector from legal_mcp import config from legal_mcp.services import halacha_quality logger = logging.getLogger(__name__) _pool: asyncpg.Pool | None = None _schema_ready: bool = False _init_lock: asyncio.Lock = asyncio.Lock() async def get_pool() -> asyncpg.Pool: """Return the connection pool, creating it (and running schema init) lazily. The MCP server's `lifespan` no longer blocks on schema init — it's done here on first DB access. This keeps the `initialize`/`tools/list` MCP handshake immediate so Claude Code never sees a stale "No such tool". """ global _pool, _schema_ready if _pool is not None and _schema_ready: return _pool async with _init_lock: if _pool is None: # First, ensure pgvector extension exists (before registering type codec) conn = await asyncpg.connect(config.POSTGRES_URL) try: await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"') finally: await conn.close() _pool = await asyncpg.create_pool( config.POSTGRES_URL, min_size=2, max_size=10, init=_init_connection, ) if not _schema_ready: await _run_schema_migrations(_pool) _schema_ready = True return _pool async def _init_connection(conn: asyncpg.Connection) -> None: await register_vector(conn) async def close_pool() -> None: global _pool if _pool: await _pool.close() _pool = None # ── Schema ────────────────────────────────────────────────────────── SCHEMA_SQL = """ CREATE TABLE IF NOT EXISTS cases ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_number TEXT UNIQUE NOT NULL, title TEXT NOT NULL, appellants JSONB DEFAULT '[]', respondents JSONB DEFAULT '[]', subject TEXT DEFAULT '', property_address TEXT DEFAULT '', permit_number TEXT DEFAULT '', 
committee_type TEXT DEFAULT 'ועדה מקומית', status TEXT DEFAULT 'new', hearing_date DATE, decision_date DATE, tags JSONB DEFAULT '[]', notes TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS documents ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE CASCADE, doc_type TEXT NOT NULL, title TEXT NOT NULL, file_path TEXT NOT NULL, extracted_text TEXT DEFAULT '', extraction_status TEXT DEFAULT 'pending', page_count INTEGER, metadata JSONB DEFAULT '{}', created_at TIMESTAMPTZ DEFAULT now() ); -- INV-TOOL3 / GAP-52: SHA-256 of the uploaded file bytes, for idempotent upload -- (re-uploading the same file to a case returns the existing document). Empty -- default = legacy rows with unknown hash; never matched as a duplicate. ALTER TABLE documents ADD COLUMN IF NOT EXISTS content_hash text NOT NULL DEFAULT ''; CREATE TABLE IF NOT EXISTS document_chunks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID REFERENCES documents(id) ON DELETE CASCADE, case_id UUID REFERENCES cases(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, content TEXT NOT NULL, section_type TEXT DEFAULT 'other', embedding vector(1024), page_number INTEGER, created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS style_corpus ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID REFERENCES documents(id) ON DELETE SET NULL, decision_number TEXT, decision_date DATE, subject_categories JSONB DEFAULT '[]', full_text TEXT NOT NULL, summary TEXT DEFAULT '', outcome TEXT DEFAULT '', key_principles JSONB DEFAULT '[]', practice_area TEXT DEFAULT 'appeals_committee', appeal_subtype TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS style_patterns ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), pattern_type TEXT NOT NULL, pattern_text TEXT NOT NULL, frequency INTEGER DEFAULT 1, context TEXT DEFAULT '', examples JSONB DEFAULT 
'[]', appeal_subtype TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id); CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id); CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id); CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status); CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number); """ MIGRATIONS_SQL = """ ALTER TABLE cases ADD COLUMN IF NOT EXISTS expected_outcome TEXT DEFAULT ''; CREATE TABLE IF NOT EXISTS audit_log ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), action TEXT NOT NULL, case_id UUID REFERENCES cases(id) ON DELETE SET NULL, document_id UUID REFERENCES documents(id) ON DELETE SET NULL, details JSONB DEFAULT '{}', actor TEXT DEFAULT 'system', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_audit_case ON audit_log(case_id); CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_log(action); CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_log(created_at DESC); """ # ── Phase 3: Workflow expansion ──────────────────────────────────── SCHEMA_V3_SQL = """ -- הרחבת decisions עם שדות חדשים ALTER TABLE decisions ADD COLUMN IF NOT EXISTS direction_doc JSONB DEFAULT NULL; ALTER TABLE decisions ADD COLUMN IF NOT EXISTS outcome_reasoning TEXT DEFAULT ''; -- הרחבת cases עם appeal_type (אם לא קיים) ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_type TEXT DEFAULT ''; ALTER TABLE cases ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee'; ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; -- active_draft_path = path to the DOCX that is the current source of truth -- for this case's decision text. Set to the latest טיוטה-v*.docx after export, -- or the latest עריכה-v*.docx after user upload. 
Used by revise_draft to know -- what file to base Track Changes revisions on. ALTER TABLE cases ADD COLUMN IF NOT EXISTS active_draft_path TEXT; -- הרחבת style_corpus עם practice_area / appeal_subtype ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee'; ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; -- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; -- decision_lessons: per-decision learnings the chair / curator / style_analyzer -- attaches to a corpus row. The generic legal-decision-lessons.md file stays -- as the source of truth for cross-corpus patterns; this table stores the -- granular "what we learned from THIS decision" notes that drive the writer's -- future drafts and let the curator look up prior observations on the same row. CREATE TABLE IF NOT EXISTS decision_lessons ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), style_corpus_id UUID NOT NULL REFERENCES style_corpus(id) ON DELETE CASCADE, lesson_text TEXT NOT NULL, category TEXT DEFAULT 'general', -- style / structure / lexicon / tabular / general source TEXT DEFAULT 'manual', -- manual / curator / chair / style_analyzer applied_to_skill BOOLEAN DEFAULT false, -- has this been promoted into SKILL.md? created_by TEXT DEFAULT 'chaim', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_decision_lessons_corpus ON decision_lessons(style_corpus_id); CREATE INDEX IF NOT EXISTS idx_decision_lessons_applied ON decision_lessons(applied_to_skill); -- chat_conversations / chat_messages: persistent history for the -- "שיחה עם הסוכן" tab on /training. Each conversation can optionally be -- scoped to a single style_corpus row (when the chair starts a chat -- "about decision X"). 
claude_session_id is the value the local claude -- CLI returns in stream-json — we pass it back via `--resume` on the -- next message so the model continues the same conversation without -- re-loading the system prompt every time. CREATE TABLE IF NOT EXISTS chat_conversations ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), title TEXT NOT NULL DEFAULT 'שיחה חדשה', style_corpus_id UUID REFERENCES style_corpus(id) ON DELETE SET NULL, claude_session_id TEXT, system_prompt_version TEXT DEFAULT 'v1', created_at TIMESTAMPTZ DEFAULT now(), last_message_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS chat_messages ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), conversation_id UUID NOT NULL REFERENCES chat_conversations(id) ON DELETE CASCADE, role TEXT NOT NULL, -- 'user' | 'assistant' content TEXT NOT NULL, raw_events JSONB DEFAULT '[]', -- stream-json events for the assistant turn (optional, for debug) created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_chat_messages_conv ON chat_messages(conversation_id, created_at); CREATE INDEX IF NOT EXISTS idx_chat_conv_corpus ON chat_conversations(style_corpus_id); CREATE INDEX IF NOT EXISTS idx_chat_conv_last ON chat_conversations(last_message_at DESC); -- טבלת qa_results CREATE TABLE IF NOT EXISTS qa_results ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, case_id UUID REFERENCES cases(id) ON DELETE CASCADE, check_name TEXT NOT NULL, passed BOOLEAN NOT NULL, severity TEXT DEFAULT 'warning', errors JSONB DEFAULT '[]', details TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_qa_results_decision ON qa_results(decision_id); CREATE INDEX IF NOT EXISTS idx_qa_results_case ON qa_results(case_id); -- טבלת decision_definitions (אם לא קיימת) CREATE TABLE IF NOT EXISTS decision_definitions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, term TEXT 
NOT NULL, definition TEXT NOT NULL, block_id TEXT DEFAULT 'block-he', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_definitions_decision ON decision_definitions(decision_id); -- טבלת appeal_type_rules (אם לא קיימת) CREATE TABLE IF NOT EXISTS appeal_type_rules ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), appeal_type TEXT NOT NULL, rule_category TEXT NOT NULL, rule_key TEXT NOT NULL, rule_value JSONB NOT NULL, description TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(appeal_type, rule_category, rule_key) ); -- image_placeholders על decision_blocks ALTER TABLE decision_blocks ADD COLUMN IF NOT EXISTS image_placeholders JSONB DEFAULT '[]'; """ # ── Phase 2: Decision + Knowledge + RAG layers ──────────────────── SCHEMA_V2_SQL = """ -- ═══════════════════════════════════════════════════════════════════ -- Layer 2: Decision -- ═══════════════════════════════════════════════════════════════════ -- decisions: מטאדטה של החלטה (גרסה אחת = רשומה אחת) CREATE TABLE IF NOT EXISTS decisions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE CASCADE, version INTEGER DEFAULT 1, status TEXT DEFAULT 'draft', -- draft/review/final/published outcome TEXT DEFAULT '', -- rejected/accepted/partial outcome_summary TEXT DEFAULT '', -- תמצית תוצאה (שורה אחת) total_paragraphs INTEGER DEFAULT 0, total_words INTEGER DEFAULT 0, decision_date DATE, author TEXT DEFAULT 'דפנה תמיר', panel_members JSONB DEFAULT '[]', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now(), UNIQUE(case_id, version) ); -- decision_blocks: 12 בלוקים לפי block-schema.md CREATE TABLE IF NOT EXISTS decision_blocks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, block_id TEXT NOT NULL, -- block-alef, block-bet, ... 
block-yod-bet block_index INTEGER NOT NULL, -- 1-12 title TEXT DEFAULT '', -- כותרת הבלוק (ריק לבלוקים ללא כותרת) content TEXT DEFAULT '', -- תוכן מלא (markdown) word_count INTEGER DEFAULT 0, weight_percent NUMERIC(5,2) DEFAULT 0, -- משקל בפועל (%) generation_type TEXT DEFAULT '', -- template-fill/reproduction/paraphrase/... model_used TEXT DEFAULT '', -- sonnet/opus/script temperature NUMERIC(3,2) DEFAULT 0, status TEXT DEFAULT 'empty', -- empty/draft/review/final notes TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now(), UNIQUE(decision_id, block_id) ); -- decision_paragraphs: סעיפים בודדים עם מעקב ציטוטים CREATE TABLE IF NOT EXISTS decision_paragraphs ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), block_id UUID REFERENCES decision_blocks(id) ON DELETE CASCADE, paragraph_number INTEGER NOT NULL, -- מספור רציף בתוך ההחלטה content TEXT NOT NULL, word_count INTEGER DEFAULT 0, citations JSONB DEFAULT '[]', -- [{case_law_id, text, type}] cross_references JSONB DEFAULT '[]', -- הפניות לסעיפים אחרים ["סעיף 5 לעיל"] created_at TIMESTAMPTZ DEFAULT now() ); -- claims: טענות צדדים (בלוק ז) CREATE TABLE IF NOT EXISTS claims ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE CASCADE, party_role TEXT NOT NULL, -- appellant/respondent/permit_applicant/committee party_name TEXT DEFAULT '', claim_text TEXT NOT NULL, claim_index INTEGER DEFAULT 0, -- סדר הופעה source_document TEXT DEFAULT '', -- מאיזה מסמך חולצה הטענה addressed_in_paragraph INTEGER, -- באיזה סעיף בדיון נענתה created_at TIMESTAMPTZ DEFAULT now() ); -- ═══════════════════════════════════════════════════════════════════ -- Layer 3: Legal Knowledge -- ═══════════════════════════════════════════════════════════════════ -- case_law: פסיקה (תקדימים) CREATE TABLE IF NOT EXISTS case_law ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_number TEXT UNIQUE NOT NULL, -- עע"מ 3975/22 או ערר 1011-03-25 case_name TEXT NOT NULL, -- שם קצר: 
"ב. קרן-נכסים" court TEXT DEFAULT '', -- בג"ץ / עליון / מנהלי / ועדת ערר date DATE, subject_tags JSONB DEFAULT '[]', -- ["proprietary_claims", "parking"] summary TEXT DEFAULT '', -- תמצית 2-3 משפטים key_quote TEXT DEFAULT '', -- ציטוט מרכזי full_text TEXT DEFAULT '', -- טקסט מלא אם זמין source_url TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); -- case_law_citations: קשרים בין פסיקה להחלטות שלנו CREATE TABLE IF NOT EXISTS case_law_citations ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE SET NULL, citation_type TEXT DEFAULT 'support', -- support/distinguish/overrule/obiter context_text TEXT DEFAULT '', -- ההקשר שבו צוטט created_at TIMESTAMPTZ DEFAULT now() ); -- statutory_provisions: חקיקה נפוצה CREATE TABLE IF NOT EXISTS statutory_provisions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), statute_name TEXT NOT NULL, -- "חוק התכנון והבנייה" section_number TEXT NOT NULL, -- "152(א)(2)" section_title TEXT DEFAULT '', -- "זכות ערר" full_text TEXT DEFAULT '', -- נוסח הסעיף common_usage TEXT DEFAULT '', -- מתי משתמשים subject_tags JSONB DEFAULT '[]', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(statute_name, section_number) ); -- transition_phrases: ביטויי מעבר של דפנה CREATE TABLE IF NOT EXISTS transition_phrases ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), phrase TEXT UNIQUE NOT NULL, -- "ועל מנת לא לצאת בחסר" usage_context TEXT DEFAULT '', -- מתי להשתמש block_types JSONB DEFAULT '[]', -- באילו בלוקים: ["block-yod"] frequency INTEGER DEFAULT 1, -- כמה פעמים ראינו source_decision TEXT DEFAULT '', -- מאיזו החלטה created_at TIMESTAMPTZ DEFAULT now() ); -- lessons_learned: לקחים מהשוואת טיוטות לגרסאות סופיות CREATE TABLE IF NOT EXISTS lessons_learned ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), lesson_title TEXT NOT NULL, -- "Discussion = continuous essay, no 
sub-headers" lesson_text TEXT NOT NULL, -- תיאור מלא category TEXT DEFAULT '', -- structure/style/content/process applies_to JSONB DEFAULT '[]', -- ["block-yod", "all"] source_case TEXT DEFAULT '', -- "הכט 1180-1181" severity TEXT DEFAULT 'important', -- critical/important/nice-to-have created_at TIMESTAMPTZ DEFAULT now() ); -- ═══════════════════════════════════════════════════════════════════ -- Layer 4: Extended RAG -- ═══════════════════════════════════════════════════════════════════ -- paragraph_embeddings: embeddings של סעיפים בהחלטות CREATE TABLE IF NOT EXISTS paragraph_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE CASCADE, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); -- case_law_embeddings: embeddings של פסיקה CREATE TABLE IF NOT EXISTS case_law_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, chunk_text TEXT NOT NULL, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); -- ═══════════════════════════════════════════════════════════════════ -- Chair Feedback (הערות דפנה על טיוטות) -- ═══════════════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS chair_feedback ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE SET NULL, block_id TEXT DEFAULT '', -- block-yod, block-vav, etc. feedback_text TEXT NOT NULL, -- ההערה של דפנה category TEXT DEFAULT 'other', -- missing_content/wrong_tone/wrong_structure/factual_error/style/other lesson_extracted TEXT DEFAULT '', -- הלקח שהופק applied_to TEXT[] DEFAULT '{}', -- לאילו קבצים/כללים הלקח יושם resolved BOOLEAN DEFAULT FALSE, -- האם הלקח יושם created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS tag_company_mappings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), tag TEXT NOT NULL, -- appeal_subtype value (e.g. 
building_permit) tag_label TEXT NOT NULL DEFAULT '', -- Hebrew display label company_id TEXT NOT NULL, -- Paperclip company UUID company_name TEXT NOT NULL DEFAULT '', -- cached company name for display created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(tag, company_id) ); -- ═══════════════════════════════════════════════════════════════════ -- Indexes -- ═══════════════════════════════════════════════════════════════════ CREATE INDEX IF NOT EXISTS idx_decisions_case ON decisions(case_id); CREATE INDEX IF NOT EXISTS idx_decisions_status ON decisions(status); CREATE INDEX IF NOT EXISTS idx_decision_blocks_decision ON decision_blocks(decision_id); CREATE INDEX IF NOT EXISTS idx_decision_blocks_block_id ON decision_blocks(block_id); CREATE INDEX IF NOT EXISTS idx_decision_paragraphs_block ON decision_paragraphs(block_id); CREATE INDEX IF NOT EXISTS idx_claims_case ON claims(case_id); CREATE INDEX IF NOT EXISTS idx_claims_role ON claims(party_role); CREATE INDEX IF NOT EXISTS idx_case_law_subject ON case_law USING gin(subject_tags); CREATE INDEX IF NOT EXISTS idx_case_law_citations_decision ON case_law_citations(decision_id); CREATE INDEX IF NOT EXISTS idx_statutory_provisions_statute ON statutory_provisions(statute_name); CREATE INDEX IF NOT EXISTS idx_transition_phrases_block ON transition_phrases USING gin(block_types); CREATE INDEX IF NOT EXISTS idx_lessons_category ON lessons_learned(category); CREATE INDEX IF NOT EXISTS idx_paragraph_embeddings_vec ON paragraph_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); CREATE INDEX IF NOT EXISTS idx_case_law_embeddings_vec ON case_law_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); """ # ── Phase 4: Methodology alignment ────────────────────────────── SCHEMA_V4_SQL = """ -- ═══════════════════════════════════════════════════════════════════ -- V4: Methodology alignment (decision-methodology.md) -- ═══════════════════════════════════════════════════════════════════ -- claims: 
טיפול בטענות (bundle/skip) + סוג טענה ALTER TABLE claims ADD COLUMN IF NOT EXISTS claim_type TEXT DEFAULT 'claim'; -- claim / response / reply ALTER TABLE claims ADD COLUMN IF NOT EXISTS claim_handling TEXT DEFAULT 'address'; -- address (דיון מלא) / bundle (קיבוץ) / skip (דילוג) ALTER TABLE claims ADD COLUMN IF NOT EXISTS bundle_group TEXT DEFAULT ''; -- שם הקבוצה לקיבוץ (למשל "פגמים פרוצדורליים") ALTER TABLE claims ADD COLUMN IF NOT EXISTS handling_reason TEXT DEFAULT ''; -- נימוק לדילוג/קיבוץ (למשל "נבחנה ולא מצאנו ממש") -- cases: תקן ביקורת + קטגוריות נושא ALTER TABLE cases ADD COLUMN IF NOT EXISTS standard_of_review TEXT DEFAULT ''; -- "שיקול דעת תכנוני עצמאי" / "בחינת שומה מכרעת" / ... ALTER TABLE cases ADD COLUMN IF NOT EXISTS subject_categories JSONB DEFAULT '[]'; -- ["חניה", "קווי בניין", "גובה", "שימוש חורג", ...] -- case_law: רמת תקדים + מעמד ALTER TABLE case_law ADD COLUMN IF NOT EXISTS precedent_level TEXT DEFAULT ''; -- עליון / מנהלי / ועדת ערר ארצית / ועדת ערר מחוזית ALTER TABLE case_law ADD COLUMN IF NOT EXISTS is_binding BOOLEAN DEFAULT TRUE; -- הלכה מחייבת (true) / אמרת אגב (false) ALTER TABLE case_law ADD COLUMN IF NOT EXISTS creac_role TEXT DEFAULT ''; -- rule (הנחה עליונה) / explanation (הרחבה) / analogy (אנלוגיה) -- decisions: סדר סוגיות + תקן ביקורת ALTER TABLE decisions ADD COLUMN IF NOT EXISTS issue_order JSONB DEFAULT '[]'; -- סדר הסוגיות שנקבע ע"י המנצח: [{"title": "...", "type": "threshold/dispositive/secondary"}] ALTER TABLE decisions ADD COLUMN IF NOT EXISTS claim_handling JSONB DEFAULT '{}'; -- {"overrides": [{"claim_id": "...", "handling": "bundle", "group": "..."}]} -- indexes CREATE INDEX IF NOT EXISTS idx_claims_handling ON claims(claim_handling); CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type); CREATE INDEX IF NOT EXISTS idx_case_law_level ON case_law(precedent_level); """ # ── Phase 5: Interim draft (appraiser facts + post-hearing flag) ─── SCHEMA_V5_SQL = """ -- appraiser_facts: תכניות והיתרים שצוינו ע"י כל שמאי 
בנפרד. -- בשונה מ-claims (שהוא טענה משפטית), כאן מאוחסנת עובדה עניינית מתוך השומה. -- שימוש ראשי: זיהוי סתירות בין שמאים על איזו תכנית או היתר חל בנכס. CREATE TABLE IF NOT EXISTS appraiser_facts ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE, document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, appraiser_name TEXT NOT NULL, fact_type TEXT NOT NULL CHECK (fact_type IN ('plan', 'permit')), identifier TEXT NOT NULL, details JSONB NOT NULL DEFAULT '{}', page_number INTEGER, created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_appraiser_facts_case ON appraiser_facts(case_id, fact_type); CREATE INDEX IF NOT EXISTS idx_appraiser_facts_identifier ON appraiser_facts(case_id, identifier); -- V5.1: appraiser_side — which party this appraiser represents. -- Values: 'committee' (הוועדה), 'appellant' (העורר), 'deciding' (מכריע). -- Required by extract_appraiser_facts; the chair tags it via the UI before extraction. -- Set via documents.metadata.appraiser_side at upload/edit time, then propagated here -- so that conflict rendering in block-tet can label each entry with its side. ALTER TABLE appraiser_facts ADD COLUMN IF NOT EXISTS appraiser_side TEXT DEFAULT ''; CREATE INDEX IF NOT EXISTS idx_appraiser_facts_side ON appraiser_facts(case_id, appraiser_side); -- documents.metadata.is_post_hearing: flag for materials submitted after the hearing -- (השלמות טיעון, הצעות פשרה). Used by block-chet to include them in the proceedings narrative. -- documents.metadata.appraiser_side: which side the appraiser represents (see above). -- No schema change needed — uses existing JSONB metadata column. """ # ── V6: Case archiving ──────────────────────────────────────────── SCHEMA_V6_SQL = """ -- archived_at: timestamp when the case was moved to the archive screen. -- NULL = active (default). Set via POST /api/cases/{case_number}/archive. -- Cleared via POST /api/cases/{case_number}/restore. 
-- The /api/cases endpoint filters out archived cases by default; -- pass ?include_archived=true (or use /api/cases/archived) to see them. ALTER TABLE cases ADD COLUMN IF NOT EXISTS archived_at TIMESTAMPTZ; CREATE INDEX IF NOT EXISTS idx_cases_archived ON cases(archived_at) WHERE archived_at IS NOT NULL; """ # ── V7: External Precedent Library + halacha extraction ────────── # Chair-uploaded external court rulings and other appeals committee decisions # become an authoritative law corpus. Distinct from style_corpus (Daphna's # style) and case_precedents (chair-attached quotes scoped to a single case). SCHEMA_V7_SQL = """ -- case_law extensions: distinguish chair-uploaded full rulings from -- auto-extracted citation stubs, and track ingestion progress. ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_kind TEXT DEFAULT 'cited_only'; -- 'external_upload' (chair uploaded full ruling) | 'cited_only' (stub from -- references_extractor) | 'nevo_seed' (future: auto-fetched from Nevo). ALTER TABLE case_law ADD COLUMN IF NOT EXISTS document_id UUID REFERENCES documents(id) ON DELETE SET NULL; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS extraction_status TEXT DEFAULT 'pending'; -- 'pending' | 'processing' | 'completed' | 'failed' ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_status TEXT DEFAULT 'pending'; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS metadata_extraction_status TEXT DEFAULT 'pending'; -- 'pending' | 'processing' | 'completed' | 'failed'. Mirrors the -- text/halacha status columns so the UI can show a live badge while the -- local-MCP worker drains the metadata queue (previously only the -- metadata_extraction_requested_at timestamp existed — no 'processing'). ALTER TABLE case_law ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT ''; -- chair-editable abstract shown in search results. 
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS nevo_ratio TEXT DEFAULT ''; -- The Nevo editorial מיני-רציו block, captured at ingest *before* it is -- stripped from the body (#86.3). Kept separate from `headnote` (which is -- our own abstract) so it can serve as a free professional gold-set for -- benchmarking halacha-extraction recall/precision. Empty when the source -- is not a Nevo export or carries no mini-ratio. ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT ''; -- 'court_ruling' | 'appeals_committee' -- practice_area is closed to the three appeals committee domains. DO $$ BEGIN ALTER TABLE case_law ADD CONSTRAINT case_law_practice_area_check CHECK (practice_area IN ('', 'rishuy_uvniya', 'betterment_levy', 'compensation_197')); EXCEPTION WHEN duplicate_object THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind); CREATE INDEX IF NOT EXISTS idx_case_law_practice ON case_law(practice_area, appeal_subtype); -- precedent_chunks: full-text chunks of an uploaded ruling, with embeddings. -- Analog of document_chunks for case_law rows where source_kind='external_upload'. CREATE TABLE IF NOT EXISTS precedent_chunks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, content TEXT NOT NULL, section_type TEXT DEFAULT 'other', -- intro | facts | legal_analysis | ruling | conclusion | other page_number INTEGER, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_case_law ON precedent_chunks(case_law_id); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_section ON precedent_chunks(case_law_id, section_type); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_vec ON precedent_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); -- halachot: extracted binding rules. One halacha = one rule + verbatim quote. 
-- Embedded separately for rule-precision semantic match (chunks centroid is -- dominated by surrounding context). All halachot start as pending_review; -- only approved/published rows are visible to search_precedent_library. CREATE TABLE IF NOT EXISTS halachot ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, halacha_index INTEGER NOT NULL, rule_statement TEXT NOT NULL, rule_type TEXT DEFAULT 'interpretive', -- rule ROLE only (INV-DM7): holding | interpretive | procedural | -- application | obiter. authority (binding/persuasive) is DERIVED -- from case_law.precedent_level, never stored here. reasoning_summary TEXT DEFAULT '', supporting_quote TEXT NOT NULL, page_reference TEXT DEFAULT '', practice_areas TEXT[] DEFAULT '{}', subject_tags TEXT[] DEFAULT '{}', cites TEXT[] DEFAULT '{}', confidence NUMERIC(3,2) DEFAULT 0.0, quote_verified BOOLEAN DEFAULT FALSE, review_status TEXT DEFAULT 'pending_review', -- pending_review | approved | rejected | published | deferred (#84 snooze) reviewer TEXT DEFAULT '', reviewed_at TIMESTAMPTZ, quality_flags TEXT[] DEFAULT '{}', -- non_decision | truncated_quote | thin_restatement | quote_unverified -- (any flag blocks auto-approve → routes to pending_review) embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); ALTER TABLE halachot ADD COLUMN IF NOT EXISTS quality_flags TEXT[] DEFAULT '{}'; CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id); CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status); CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas); CREATE INDEX IF NOT EXISTS idx_halachot_tags ON halachot USING gin(subject_tags); CREATE INDEX IF NOT EXISTS idx_halachot_vec ON halachot USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); -- #83: halacha_index must be unique per precedent. 
The extractor assigns it as -- MAX(halacha_index)+1 under an in-process store-lock + a cross-process advisory -- lock, so collisions shouldn't occur — but per FireHydrant/OneUptime the -- constraint is the actual correctness guarantee (the lock is the optimization). -- A racing/double run now fails LOUDLY instead of silently appending duplicates -- (the 2026-05/06 over-extraction root cause). Requires clean data first (see -- scripts: the 6 colliding precedents were renumbered 2026-06-03). CREATE UNIQUE INDEX IF NOT EXISTS idx_halachot_unique_index ON halachot(case_law_id, halacha_index); """ # ── V8: Extraction request queue ───────────────────────────────── # Web UI buttons ("Sparkles" = request metadata extraction; "Refresh" = # request halacha extraction) run inside the FastAPI container, which has # no `claude` CLI. They can't run the LLM extractor directly. Instead they # stamp a request timestamp here, and the chair (or me) runs the MCP tool # `precedent_process_pending_extractions` from local Claude Code, where the # CLI is available, to drain the queue. See claude_session.py for the rule. SCHEMA_V8_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS metadata_extraction_requested_at TIMESTAMPTZ; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_requested_at TIMESTAMPTZ; CREATE INDEX IF NOT EXISTS idx_case_law_metadata_requested ON case_law(metadata_extraction_requested_at) WHERE metadata_extraction_requested_at IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_case_law_halacha_requested ON case_law(halacha_extraction_requested_at) WHERE halacha_extraction_requested_at IS NOT NULL; """ # ── V9: Multimodal page-image embeddings ───────────────────────── # voyage-multimodal-3 (1024-dim) embeds the whole page as an image: # captures table layout, scanned content, signatures, plans — content # that text-OCR loses. Ingestion is gated by config.MULTIMODAL_ENABLED; # search_*_hybrid() merge text-cosine + image-cosine when present. 
# image_thumbnail_path is a relative path under DATA_DIR/cases/{case}/ # thumbnails/ or DATA_DIR/precedent-library/thumbnails/ — a small JPEG # rendered at config.MULTIMODAL_THUMB_DPI for UI preview, distinct from # the higher-DPI render fed to the embedder (which is not persisted). SCHEMA_V9_SQL = """ CREATE TABLE IF NOT EXISTS document_image_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID REFERENCES documents(id) ON DELETE CASCADE, case_id UUID REFERENCES cases(id) ON DELETE CASCADE, page_number INTEGER NOT NULL, image_thumbnail_path TEXT, embedding vector(1024), model_name TEXT DEFAULT 'voyage-multimodal-3', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(document_id, page_number) ); CREATE INDEX IF NOT EXISTS idx_doc_img_emb_vec ON document_image_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); CREATE INDEX IF NOT EXISTS idx_doc_img_emb_doc ON document_image_embeddings(document_id); CREATE INDEX IF NOT EXISTS idx_doc_img_emb_case ON document_image_embeddings(case_id); CREATE TABLE IF NOT EXISTS precedent_image_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, page_number INTEGER NOT NULL, image_thumbnail_path TEXT, embedding vector(1024), model_name TEXT DEFAULT 'voyage-multimodal-3', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(case_law_id, page_number) ); CREATE INDEX IF NOT EXISTS idx_prec_img_emb_vec ON precedent_image_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); CREATE INDEX IF NOT EXISTS idx_prec_img_emb_case_law ON precedent_image_embeddings(case_law_id); """ SCHEMA_V10_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS chair_name TEXT DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS district TEXT DEFAULT ''; ALTER TABLE cases ADD COLUMN IF NOT EXISTS chair_name TEXT DEFAULT ''; CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind); CREATE INDEX IF NOT EXISTS 
idx_case_law_chair ON case_law(chair_name) WHERE chair_name <> ''; CREATE INDEX IF NOT EXISTS idx_case_law_district ON case_law(district) WHERE district <> ''; """ SCHEMA_V11_SQL = """ CREATE TABLE IF NOT EXISTS case_law_relations ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, related_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, relation_type TEXT NOT NULL DEFAULT 'same_case_chain', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(case_law_id, related_id), CHECK (case_law_id <> related_id) ); CREATE INDEX IF NOT EXISTS idx_clr_a ON case_law_relations(case_law_id); CREATE INDEX IF NOT EXISTS idx_clr_b ON case_law_relations(related_id); """ # ── V12: BM25/lexical search via tsvector ───────────────────────── # PostgreSQL doesn't ship a Hebrew stemmer; the 'simple' configuration # lowercases + tokenises on whitespace without stemming — exactly what # we want for Hebrew. It also preserves alphanumeric tokens like # "1461/20" (case numbers) which are the prime motivator for adding a # lexical layer on top of the semantic cosine index. # Both columns are GENERATED STORED so they stay in sync with the # source rows for free, and GIN-indexed for ts_rank_cd lookups. SCHEMA_V12_SQL = """ ALTER TABLE precedent_chunks ADD COLUMN IF NOT EXISTS content_tsv tsvector GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED; ALTER TABLE halachot ADD COLUMN IF NOT EXISTS rule_tsv tsvector GENERATED ALWAYS AS ( to_tsvector('simple', coalesce(rule_statement,'') || ' ' || coalesce(supporting_quote,'') || ' ' || coalesce(reasoning_summary,'') ) ) STORED; CREATE INDEX IF NOT EXISTS idx_precedent_chunks_tsv ON precedent_chunks USING GIN(content_tsv); CREATE INDEX IF NOT EXISTS idx_halachot_tsv ON halachot USING GIN(rule_tsv); """ # ── V13: Missing precedents log ─────────────────────────────────── # Track citations that the parties brought up but which are NOT yet in # the precedent_library. 
Created by the researcher (auto or chair) # whenever a citation can't be found in the corpus; closed by uploading # the actual decision via internal_decision_upload or # precedent_library_upload, at which point linked_case_law_id points to # the new case_law row and status flips to 'closed'. SCHEMA_V13_SQL = """ CREATE TABLE IF NOT EXISTS missing_precedents ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), citation TEXT NOT NULL, case_name TEXT, cited_in_case_id UUID REFERENCES cases(id) ON DELETE CASCADE, cited_in_document_id UUID REFERENCES documents(id) ON DELETE SET NULL, cited_by_party TEXT CHECK (cited_by_party IN ( 'appellant', 'respondent', 'committee', 'permit_applicant', 'unknown' )), cited_by_party_name TEXT, legal_topic TEXT, legal_issue TEXT, claim_quote TEXT, status TEXT DEFAULT 'open' CHECK (status IN ( 'open', 'uploaded', 'closed', 'irrelevant' )), linked_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL, closed_at TIMESTAMPTZ, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW(), notes TEXT ); CREATE INDEX IF NOT EXISTS idx_missing_precedents_case ON missing_precedents(cited_in_case_id); CREATE INDEX IF NOT EXISTS idx_missing_precedents_status ON missing_precedents(status); CREATE INDEX IF NOT EXISTS idx_missing_precedents_citation ON missing_precedents(citation); """ # ── V14: Legal arguments (aggregated propositions) ──────────────── # After ``claims_extractor`` extracts raw propositions (rows in ``claims``) # the LLM-driven aggregator groups them into ~6-12 distinct legal arguments # per party. ``legal_arguments`` holds the consolidated argument; the M:M # join table ``legal_argument_propositions`` links back to the source # propositions for traceability ("which raw claims feed this argument?"). 
SCHEMA_V14_SQL = """ CREATE TABLE IF NOT EXISTS legal_arguments ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE, party TEXT NOT NULL CHECK (party IN ( 'appellant', 'respondent', 'committee', 'permit_applicant', 'unknown' )), argument_index INTEGER NOT NULL, argument_title TEXT NOT NULL, argument_body TEXT NOT NULL, legal_topic TEXT, priority TEXT DEFAULT 'substantive' CHECK (priority IN ( 'threshold', 'substantive', 'procedural', 'relief' )), cited_precedents TEXT[], created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_legal_arguments_case ON legal_arguments(case_id); CREATE INDEX IF NOT EXISTS idx_legal_arguments_party ON legal_arguments(case_id, party); -- M:M back to ``claims`` (raw propositions). CREATE TABLE IF NOT EXISTS legal_argument_propositions ( argument_id UUID NOT NULL REFERENCES legal_arguments(id) ON DELETE CASCADE, claim_id UUID NOT NULL REFERENCES claims(id) ON DELETE CASCADE, PRIMARY KEY (argument_id, claim_id) ); """ # proceeding_type — מבחין בין הליך ערר עיקרי לבל"מ (בקשה להארכת מועד). # חל גם על case_law (קורפוס) וגם על cases (תיקים חיים). שני הסוגים # יכולים לחלוק אותו case_number, ולכן ה-uniqueness עוברת ל-(case_number, # proceeding_type). בקורפוס: רק internal_committee מקבלים ערך מאוכלס; # פסיקה חיצונית נשארת עם ''. 
SCHEMA_V15_SQL = """ -- ------- case_law (קורפוס) ------- ALTER TABLE case_law ADD COLUMN IF NOT EXISTS proceeding_type TEXT NOT NULL DEFAULT ''; ALTER TABLE case_law DROP CONSTRAINT IF EXISTS case_law_proceeding_type_check; ALTER TABLE case_law ADD CONSTRAINT case_law_proceeding_type_check CHECK (proceeding_type IN ('', 'ערר', 'בל"מ')); -- Backfill לפי appeal_subtype הקיים UPDATE case_law SET proceeding_type = 'בל"מ' WHERE source_kind = 'internal_committee' AND proceeding_type = '' AND appeal_subtype LIKE 'extension_request_%'; UPDATE case_law SET proceeding_type = 'ערר' WHERE source_kind = 'internal_committee' AND proceeding_type = ''; ALTER TABLE case_law DROP CONSTRAINT IF EXISTS case_law_internal_proceeding_check; ALTER TABLE case_law ADD CONSTRAINT case_law_internal_proceeding_check CHECK (source_kind != 'internal_committee' OR proceeding_type IN ('ערר', 'בל"מ')); -- החלפת UNIQUE(case_number) ב-partial unique לפי source_kind ALTER TABLE case_law DROP CONSTRAINT IF EXISTS case_law_case_number_key; DROP INDEX IF EXISTS case_law_case_number_key; CREATE UNIQUE INDEX IF NOT EXISTS uq_case_law_internal_number_proc ON case_law (case_number, proceeding_type) WHERE source_kind = 'internal_committee'; CREATE UNIQUE INDEX IF NOT EXISTS uq_case_law_external_number ON case_law (case_number) WHERE source_kind <> 'internal_committee'; -- ------- cases (תיקים חיים) ------- ALTER TABLE cases ADD COLUMN IF NOT EXISTS proceeding_type TEXT NOT NULL DEFAULT 'ערר'; ALTER TABLE cases DROP CONSTRAINT IF EXISTS cases_proceeding_type_check; ALTER TABLE cases ADD CONSTRAINT cases_proceeding_type_check CHECK (proceeding_type IN ('ערר', 'בל"מ')); UPDATE cases SET proceeding_type = 'בל"מ' WHERE proceeding_type = 'ערר' AND appeal_subtype LIKE 'extension_request_%'; ALTER TABLE cases DROP CONSTRAINT IF EXISTS cases_case_number_key; DROP INDEX IF EXISTS cases_case_number_key; CREATE UNIQUE INDEX IF NOT EXISTS uq_cases_number_proc ON cases (case_number, proceeding_type); """ # ── V16: 
Internal citations graph (TaskMaster #34) ──────────────── # Auto-extracted citation graph between Daphna's (and other internal_committee) # decisions. When an internal decision cites another committee decision in a # patterned way ("ונפנה ל…", "כפי שקבעתי…", "ראה החלטתי…"), the citation # extractor records the link here. ``cited_case_law_id`` is populated when the # cited case_number resolves to a row in ``case_law``; otherwise it stays NULL # and shows up in ``idx_pic_unlinked`` so the chair can decide whether to # upload the missing decision. SCHEMA_V16_SQL = """ CREATE TABLE IF NOT EXISTS precedent_internal_citations ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), source_case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, cited_case_number TEXT NOT NULL, cited_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL, match_context TEXT, match_pattern TEXT, confidence NUMERIC(3,2) DEFAULT 0.85, created_at TIMESTAMPTZ DEFAULT NOW(), UNIQUE (source_case_law_id, cited_case_number) ); CREATE INDEX IF NOT EXISTS idx_pic_source ON precedent_internal_citations(source_case_law_id); CREATE INDEX IF NOT EXISTS idx_pic_target ON precedent_internal_citations(cited_case_law_id); CREATE INDEX IF NOT EXISTS idx_pic_unlinked ON precedent_internal_citations(cited_case_number) WHERE cited_case_law_id IS NULL; """ # ── V17: Parent-doc retrieval (TaskMaster #48) ───────────────────── # Hierarchical chunking: tiny "child" chunks (~300 tokens) are indexed # and matched at search time for high recall on focused phrases, but # every child links upward to a larger "parent" chunk (~1500 tokens) # that supplies broader context to the LLM. The retrieval step swaps # the child hit for its parent before returning rows to callers — so # rule statements, multi-paragraph quotes, and "אשר על כן…" passages # come back whole instead of clipped mid-sentence. # # Schema layout: # parent_chunk_id — self-FK on precedent_chunks. 
NULL for legacy # rows (single-tier chunking) and for parent # rows themselves. Cascade=SET NULL so deleting # a parent doesn't orphan the children's payload. # chunk_role — 'child' | 'parent'. Defaults to 'child' so any # row created by the pre-V17 ingestion path is # treated as a child without a parent (i.e. the # parent-doc swap is a no-op and the legacy chunk # continues to surface as-is). # # Activation is gated by ``config.PARENT_DOC_RETRIEVAL_ENABLED``. Even # after the schema is in place, search keeps the legacy behaviour # until both the chunker emits hierarchical chunks *and* the flag is # flipped on — so this migration is safe to apply ahead of time. SCHEMA_V17_SQL = """ ALTER TABLE precedent_chunks ADD COLUMN IF NOT EXISTS parent_chunk_id UUID REFERENCES precedent_chunks(id) ON DELETE SET NULL; ALTER TABLE precedent_chunks ADD COLUMN IF NOT EXISTS chunk_role TEXT DEFAULT 'child'; DO $$ BEGIN ALTER TABLE precedent_chunks ADD CONSTRAINT precedent_chunks_role_check CHECK (chunk_role IN ('child', 'parent')); EXCEPTION WHEN duplicate_object THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_precedent_chunks_parent ON precedent_chunks(parent_chunk_id); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_role ON precedent_chunks(chunk_role); """ # ── V18: RAG telemetry — closed-loop retrieval feedback (TaskMaster #50) # # Captures every semantic search call (query, agent, top results, # latency) so we can compute nDCG@10 over time and surface drift before # it bites. Relevance signal comes from two places: # 1. ``cited_in_decision`` — auto-inferred. If a precedent cited in a # final draft's ``decision_paragraphs.citations`` also appears in # the ``top_case_law_ids`` of a search log for the same case, that # hit is treated as highly relevant (score=3). # 2. ``chair_marked`` — explicit feedback (future hook for the UI). 
# # ``top_case_law_ids`` is intentionally nullable: ``search_decisions`` # returns document chunks from active cases (not case_law rows), so its # rows log the query but leave the array empty. nDCG aggregation skips # those. SCHEMA_V18_SQL = """ CREATE TABLE IF NOT EXISTS search_logs ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), search_type TEXT NOT NULL, -- 'precedent_library' / 'internal_decisions' -- / 'decisions' / 'case_documents' / 'similar_cases' query TEXT NOT NULL, practice_area TEXT, case_id UUID REFERENCES cases(id) ON DELETE SET NULL, user_agent TEXT, -- 'writer' / 'researcher' / 'analyst' / 'manual' / 'unknown' result_count INTEGER, top_case_law_ids UUID[], -- nullable: empty for search_decisions/search_case_documents -- which return document chunks not case_law rows duration_ms INTEGER, created_at TIMESTAMPTZ DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_search_logs_type ON search_logs(search_type); CREATE INDEX IF NOT EXISTS idx_search_logs_case ON search_logs(case_id); CREATE INDEX IF NOT EXISTS idx_search_logs_date ON search_logs(created_at DESC); CREATE TABLE IF NOT EXISTS search_relevance_feedback ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), search_log_id UUID REFERENCES search_logs(id) ON DELETE CASCADE, case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, rank INTEGER NOT NULL, -- 1-based position in the original results (1 = top hit) relevance_score INTEGER NOT NULL CHECK (relevance_score IN (0, 1, 2, 3)), -- 0=irrelevant, 1=marginal, 2=relevant, 3=highly relevant feedback_source TEXT, -- 'cited_in_decision' / 'chair_marked' / 'auto_inferred' created_at TIMESTAMPTZ DEFAULT NOW(), UNIQUE(search_log_id, case_law_id, feedback_source) ); CREATE INDEX IF NOT EXISTS idx_relevance_log ON search_relevance_feedback(search_log_id); CREATE INDEX IF NOT EXISTS idx_relevance_case_law ON search_relevance_feedback(case_law_id); """ # ── V19: case_law.citation_formatted ─────────────────────────────── # Full formal citation per the 
Israeli unified citation rules ("כללי # הציטוט האחיד"). Stored as Markdown: parties wrapped in **…** so the # copy-to-clipboard helper can render bold for Word/Docs while keeping # the plain-text form readable. # # Example: # ערר (ועדות ערר - תכנון ובנייה ת"א-יפו) 81002-01-21 **אברהם אגסי # נ' הועדה המקומית לתכנון ובנייה תל אביב** (נבו 25.9.2025) SCHEMA_V19_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS citation_formatted TEXT DEFAULT ''; """ # ── V20: case-name / case-number lexical match ──────────────────── # RC-A fix: the V12 tsvectors cover only chunk *content* + halacha # text, so a bare case-name query ("אגסי") matched decisions that # *cite* the case rather than the case itself. case_name and # case_number live on the parent case_law row, so we add a dedicated # meta tsvector there and OR it into the lexical search — a name/number # hit then surfaces all of that case's chunks + halachot. 'simple' # config (no stemmer) preserves Hebrew names + alphanumeric case # numbers like "81002-01-21" exactly as V12 does for content. SCHEMA_V20_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS meta_tsv tsvector GENERATED ALWAYS AS ( to_tsvector('simple', coalesce(case_name,'') || ' ' || coalesce(case_number,'') ) ) STORED; CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv ON case_law USING GIN(meta_tsv); """ # ── V21: explicit `searchable` flag (GAP-13 / INV-DM1) ───────────── # Materialized completeness flag — a case_law row is exposed to search only # when it satisfies the completeness contract (02-data-model §2a). Recomputed # on ingest/metadata completion via recompute_searchable(); not inferred at # query time. Default false so a freshly-inserted row is excluded until proven # complete. Health-check surfaces count(*) FILTER (WHERE NOT searchable). 
SCHEMA_V21_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS searchable boolean NOT NULL DEFAULT false; CREATE INDEX IF NOT EXISTS idx_case_law_searchable ON case_law (searchable); """ # ── V22: cases.blocks_stale — DOCX↔blocks drift flag (GAP-17 / INV-EX1) ── # Set true when revise_draft/apply_user_edit make active_draft_path the live # source-of-truth without re-syncing decision_blocks; cleared when blocks are # re-exported or re-saved. Surfaced by health-check. Source-of-truth remains # decision_blocks — this only flags known drift (no fragile DOCX→blocks reparse). SCHEMA_V22_SQL = """ ALTER TABLE cases ADD COLUMN IF NOT EXISTS blocks_stale boolean NOT NULL DEFAULT false; """ # ── V23: case_law content/indexed hashes — re-index on content change (GAP-09) ── # content_hash = SHA-256 of current full_text (written at the create boundary). # indexed_hash = the content_hash the CURRENT chunks/embeddings were built from # (set by mark_indexed after a successful store). Stale ⇔ content_hash IS # DISTINCT FROM indexed_hash. embedding can't be a GENERATED column (needs an # API call), so freshness is enforced by detection + reindex_case_law + health-check. 
SCHEMA_V23_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS content_hash text NOT NULL DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS indexed_hash text; """ SCHEMA_V24_SQL = """ -- X11: citation corroboration (treatment + halacha-level link) ALTER TABLE precedent_internal_citations ADD COLUMN IF NOT EXISTS treatment TEXT DEFAULT ''; CREATE TABLE IF NOT EXISTS halacha_citation_corroboration ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE, citing_case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, citing_decision_id UUID REFERENCES decisions(id) ON DELETE SET NULL, source_citation_id UUID NOT NULL, treatment TEXT NOT NULL, match_score NUMERIC(4,3) DEFAULT 0, match_context TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE (halacha_id, source_citation_id) ); CREATE INDEX IF NOT EXISTS idx_hcc_halacha ON halacha_citation_corroboration(halacha_id); """ SCHEMA_V25_SQL = """ -- Crash-safe halacha extraction: per-chunk checkpoint enables incremental store -- + resume. A chunk with halacha_extracted_at set has been processed; a resumed -- run skips it (so a crash never loses completed chunks or re-pays for them). ALTER TABLE precedent_chunks ADD COLUMN IF NOT EXISTS halacha_extracted_at TIMESTAMPTZ; """ SCHEMA_V26_SQL = """ -- draft_final_pairs (T5 / INV-LRN4): the reconciliation ledger. -- Every decision is "closed" only after it is compared against the chair's signed -- final. Captures an immutable snapshot of the AI draft at mark-final time (before -- it can be overwritten), paired with the final. The LLM distillation (curator) -- fills final_text + diff_stats + analysis later and advances status. 
CREATE TABLE IF NOT EXISTS draft_final_pairs ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE, draft_text TEXT NOT NULL DEFAULT '', final_path TEXT DEFAULT '', final_text TEXT DEFAULT '', diff_stats JSONB DEFAULT NULL, analysis JSONB DEFAULT NULL, -- final_received → analyzed → lessons_folded status TEXT NOT NULL DEFAULT 'final_received', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_case ON draft_final_pairs(case_id); CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_status ON draft_final_pairs(status); """ SCHEMA_V27_SQL = """ -- style_exemplars (T1-T3): block-level paragraphs from Dafna's OWN decisions -- (style_corpus + internal_committee finals), embedded for retrieval as -- style exemplars at write-time. Purpose-built so we DON'T fabricate synthetic -- cases just to reuse decision_paragraphs. INV-LRN5: style material only — the -- writer is told to adapt structure/voice, copy only boilerplate, never substance. CREATE TABLE IF NOT EXISTS style_exemplars ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_number TEXT DEFAULT '', source TEXT DEFAULT '', -- style_corpus | internal_committee practice_area TEXT DEFAULT '', outcome TEXT DEFAULT '', -- rejection | partial_acceptance | full_acceptance | '' section TEXT DEFAULT 'other', -- background | claims | discussion | summary | other paragraph_text TEXT NOT NULL, word_count INTEGER DEFAULT 0, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_style_exemplars_section ON style_exemplars(section); CREATE INDEX IF NOT EXISTS idx_style_exemplars_decision ON style_exemplars(decision_number, source); """ SCHEMA_V28_SQL = """ -- equivalent_halachot (#84.2 follow-up): halacha-level PARALLEL-AUTHORITY links. 
-- Distinct from halacha_citation_corroboration (X11): that records an actual -- citation of a halacha by a later decision; this records that two halachot of -- DIFFERENT precedents state the same legal principle INDEPENDENTLY (no citation -- between them). Symmetric and non-directional — stored with halacha_a < halacha_b -- so each pair is unique and self-links are impossible. Never merges/deletes the -- halachot; it only relates them so the chair sees a principle recurs across -- committees (a real-but-non-citation signal the citator must not fabricate). CREATE TABLE IF NOT EXISTS equivalent_halachot ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), halacha_a UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE, halacha_b UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE, cosine NUMERIC(4,3) DEFAULT 0, note TEXT DEFAULT '', created_by TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), CHECK (halacha_a < halacha_b), UNIQUE (halacha_a, halacha_b) ); CREATE INDEX IF NOT EXISTS idx_equiv_halacha_a ON equivalent_halachot(halacha_a); CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b); """ SCHEMA_V29_SQL = """ -- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified -- sample of halachot the chair/Dafna labels (is_holding / correct_type / -- quote_complete) so we can measure the extraction validators' precision/recall -- and recalibrate the auto-approve threshold. The tags are the ground truth — -- they MUST be human (no AI pre-fill) to avoid circular bias. 
CREATE TABLE IF NOT EXISTS halacha_goldset ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE, batch TEXT NOT NULL DEFAULT 'default', is_holding BOOLEAN, -- NULL until tagged correct_type TEXT DEFAULT '', -- binding | interpretive | obiter | application | '' quote_complete BOOLEAN, tagged_by TEXT DEFAULT '', tagged_at TIMESTAMPTZ, created_at TIMESTAMPTZ DEFAULT now(), UNIQUE (halacha_id, batch) ); CREATE INDEX IF NOT EXISTS idx_goldset_batch ON halacha_goldset(batch); -- AI second-opinion (a QA aid, NOT ground truth): an INDEPENDENT local-LLM -- judgment shown beside the human tag so the chair can spot disagreements and -- reconsider. Independent of the rule-based validators that #81.8 measures, so -- no circularity. Generated locally (claude_session); never auto-applied. ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_is_holding BOOLEAN; ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_correct_type TEXT DEFAULT ''; ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_rationale TEXT DEFAULT ''; ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ; """ SCHEMA_V30_SQL = """ -- digests (X12): Ofer Toister daily "כל יום" one-pagers. A SECONDARY, -- discovery-layer ("radar") source — NOT authoritative law. Kept in its OWN -- table (never case_law) so it cannot pollute the precedent corpus, never -- enters the halacha pipeline (INV-DIG2), and is never cited directly in a -- decision (INV-DIG1). Its only job is to point the researcher at the -- UNDERLYING ruling, which is ingested separately into case_law and cited from -- there. linked_case_law_id is the bridge (INV-DIG3): filled once the -- underlying ruling is in the library; NULL = an open knowledge gap. 
CREATE TABLE IF NOT EXISTS digests ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), yomon_number TEXT NOT NULL DEFAULT '', -- "5163" digest_date DATE, -- date of the yomon ISSUE publication TEXT NOT NULL DEFAULT 'כל יום', source_firm TEXT NOT NULL DEFAULT 'עפר טויסטר, עורכי דין', concept_tag TEXT NOT NULL DEFAULT '', -- "שיקול הדעת המצומצם" headline_holding TEXT NOT NULL DEFAULT '', -- bold subtitle = the holding analysis_text TEXT NOT NULL DEFAULT '', -- the 1-2 page body (raw text) summary TEXT NOT NULL DEFAULT '', -- 2-3 sentence LLM summary underlying_citation TEXT NOT NULL DEFAULT '', -- 'עת"מ 46111-12-22 יכין-אפק...' underlying_court TEXT NOT NULL DEFAULT '', underlying_date DATE, -- date the RULING was given (≠ digest_date) underlying_judge TEXT NOT NULL DEFAULT '', practice_area TEXT NOT NULL DEFAULT '', -- rishuy_uvniya/betterment_levy/compensation_197 appeal_subtype TEXT NOT NULL DEFAULT '', subject_tags TEXT[] NOT NULL DEFAULT '{}', linked_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL, embedding vector(1024), -- single vector of concept+headline+summary+analysis source_document_path TEXT NOT NULL DEFAULT '', -- staged PDF path (rel to DATA_DIR) content_hash TEXT NOT NULL DEFAULT '', -- sha256 of extracted text — idempotent upload extraction_status TEXT NOT NULL DEFAULT 'pending', -- pending/processing/completed/failed content_tsv tsvector GENERATED ALWAYS AS ( to_tsvector('simple', coalesce(concept_tag,'') || ' ' || coalesce(headline_holding,'') || ' ' || coalesce(summary,'') || ' ' || coalesce(analysis_text,'')) ) STORED, created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); -- Idempotent re-upload (INV-G3): same yomon number = same digest. yomon_number -- can be '' transiently (before extraction), so the unique index is partial. CREATE UNIQUE INDEX IF NOT EXISTS uq_digests_yomon_number ON digests(yomon_number) WHERE yomon_number <> ''; -- Secondary dedup key when yomon_number couldn't be parsed. 
CREATE UNIQUE INDEX IF NOT EXISTS uq_digests_content_hash ON digests(content_hash) WHERE content_hash <> ''; -- HNSW (not ivfflat): the digests radar is a small, slowly-growing corpus -- (~1/day). ivfflat trains `lists` centroids and probes a subset at query time, -- so on a small table a single probe can hit an empty list and return 0 rows -- (recall cliff). HNSW has no list-training/probe step — correct recall from -- the first row — so it is the right index for a corpus that starts ~empty. DROP INDEX IF EXISTS idx_digests_embedding; -- drop any pre-existing ivfflat CREATE INDEX IF NOT EXISTS idx_digests_embedding_hnsw ON digests USING hnsw (embedding vector_cosine_ops); CREATE INDEX IF NOT EXISTS idx_digests_linked ON digests(linked_case_law_id); CREATE INDEX IF NOT EXISTS idx_digests_practice_area ON digests(practice_area); CREATE INDEX IF NOT EXISTS idx_digests_concept_tag ON digests(concept_tag); CREATE INDEX IF NOT EXISTS idx_digests_subject_tags ON digests USING gin(subject_tags); -- Lexical half of a future hybrid (Phase-1 search is semantic-only; index is ready). CREATE INDEX IF NOT EXISTS idx_digests_content_tsv ON digests USING gin(content_tsv); """ # ── X13 — Court Verdict Fetch queue ────────────────────────────────────── # A lightweight, observable, idempotent job queue for the auto-fetch # subsystem (docs/spec/X13-court-fetch.md). One row per court verdict we try # to pull from a public source. Mirrors the extraction-queue pattern: status # is always explicit (INV-CF2 — no silent drop), the canonical case number is # the idempotency key (INV-CF5), and ``attempts`` drives the human-fallback # gate (INV-CF3 — flip to 'manual' after N autonomous failures). # V31 — digests (X12) took V30 when it merged first. 
SCHEMA_V31_SQL = """ CREATE TABLE IF NOT EXISTS court_fetch_jobs ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), case_number_norm TEXT NOT NULL UNIQUE, -- idempotency key (INV-CF5) citation_raw TEXT NOT NULL DEFAULT '', tier TEXT NOT NULL DEFAULT '', -- supreme | admin | skip court TEXT NOT NULL DEFAULT '', status TEXT NOT NULL DEFAULT 'pending', -- pending|running|done|failed|manual attempts INT NOT NULL DEFAULT 0, error TEXT NOT NULL DEFAULT '', case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL, digest_id UUID, -- source digest (X12), nullable for ad-hoc source_url TEXT NOT NULL DEFAULT '', -- provenance (INV-CF7) created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_court_fetch_jobs_status ON court_fetch_jobs(status); CREATE INDEX IF NOT EXISTS idx_court_fetch_jobs_digest ON court_fetch_jobs(digest_id) WHERE digest_id IS NOT NULL; """ async def _run_schema_migrations(pool: asyncpg.Pool) -> None: async with pool.acquire() as conn: await conn.execute(SCHEMA_SQL) await conn.execute(MIGRATIONS_SQL) await conn.execute(SCHEMA_V2_SQL) await conn.execute(SCHEMA_V3_SQL) await conn.execute(SCHEMA_V4_SQL) await conn.execute(SCHEMA_V5_SQL) await conn.execute(SCHEMA_V6_SQL) await conn.execute(SCHEMA_V7_SQL) await conn.execute(SCHEMA_V8_SQL) await conn.execute(SCHEMA_V9_SQL) await conn.execute(SCHEMA_V10_SQL) await conn.execute(SCHEMA_V11_SQL) await conn.execute(SCHEMA_V12_SQL) await conn.execute(SCHEMA_V13_SQL) await conn.execute(SCHEMA_V14_SQL) await conn.execute(SCHEMA_V15_SQL) await conn.execute(SCHEMA_V16_SQL) await conn.execute(SCHEMA_V17_SQL) await conn.execute(SCHEMA_V18_SQL) await conn.execute(SCHEMA_V19_SQL) await conn.execute(SCHEMA_V20_SQL) await conn.execute(SCHEMA_V21_SQL) await conn.execute(SCHEMA_V22_SQL) await conn.execute(SCHEMA_V23_SQL) await conn.execute(SCHEMA_V24_SQL) await conn.execute(SCHEMA_V25_SQL) await conn.execute(SCHEMA_V26_SQL) await conn.execute(SCHEMA_V27_SQL) await 
conn.execute(SCHEMA_V28_SQL) await conn.execute(SCHEMA_V29_SQL) await conn.execute(SCHEMA_V30_SQL) await conn.execute(SCHEMA_V31_SQL) logger.info("Database schema initialized (v1-v31)") async def init_schema() -> None: """Backward-compatible wrapper. Schema init now runs lazily inside get_pool().""" await get_pool() # ── Case CRUD ─────────────────────────────────────────────────────── async def create_case( case_number: str, title: str, appellants: list[str] | None = None, respondents: list[str] | None = None, subject: str = "", property_address: str = "", permit_number: str = "", committee_type: str = "ועדה מקומית", hearing_date: date | None = None, notes: str = "", expected_outcome: str = "", # Default "" — DB CHECK constraint accepts empty, the upstream tool # (cases.case_create) is responsible for deriving the domain value # from the case_number prefix before calling here. practice_area: str = "", appeal_subtype: str = "", proceeding_type: str = "ערר", ) -> dict: pool = await get_pool() case_id = uuid4() async with pool.acquire() as conn: await conn.execute( """INSERT INTO cases (id, case_number, title, appellants, respondents, subject, property_address, permit_number, committee_type, hearing_date, notes, expected_outcome, practice_area, appeal_subtype, proceeding_type) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)""", case_id, _canonical_case_number(case_number), title, json.dumps(appellants or []), json.dumps(respondents or []), subject, property_address, permit_number, committee_type, hearing_date, notes, expected_outcome, practice_area, appeal_subtype, proceeding_type, ) return await get_case(case_id) async def get_case(case_id: UUID) -> dict | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id) if row is None: return None return _row_to_case(row) async def set_active_draft_path(case_id: UUID, path: str | None) -> None: """Update the case's 
active_draft_path (the DOCX that is source of truth).""" pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "UPDATE cases SET active_draft_path = $1, updated_at = now() WHERE id = $2", path, case_id, ) async def get_active_draft_path(case_id: UUID) -> str | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT active_draft_path FROM cases WHERE id = $1", case_id, ) return row["active_draft_path"] if row else None async def mark_blocks_stale(case_id: UUID, stale: bool) -> None: """Flag/clear DOCX↔blocks drift for a case (GAP-17).""" pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "UPDATE cases SET blocks_stale = $1, updated_at = now() WHERE id = $2", stale, case_id, ) async def resolve_citation_case_law_ids(ids) -> dict: """Structural citation→corpus resolution (GAP-20 / INV-AUD3). Given case_law_id values referenced by a decision's citations/provenance, split into resolvable (exist in case_law) vs unresolvable. """ resolved, unresolved = [], [] pool = await get_pool() async with pool.acquire() as conn: for cid in ids: try: exists = await conn.fetchval( "SELECT EXISTS(SELECT 1 FROM case_law WHERE id = $1)", cid) except Exception: exists = False (resolved if exists else unresolved).append(cid) return {"resolved": resolved, "unresolved": unresolved} def _normalize_case_number(s: str) -> str: """Canonicalise a case number for tolerant lookup. Agents receive the number in many shapes — from a Paperclip issue title ("ערר 8137/24"), with a slash instead of a dash, padded, or with surrounding whitespace. Stored values are bare ("8137-24"). Without this, get_case_by_number's exact match silently fails and the agent concludes the case has no documents (see #58). Strategy: drop any leading proceeding-type prefix (everything before the first digit), trim, and unify '/' → '-'. 
def _normalize_case_number(s: str) -> str:
    """Canonicalise a case number for tolerant lookup.

    Agents receive the number in many shapes — from a Paperclip issue title
    (a proceeding-type word followed by "8137/24"), with a slash instead of a
    dash, padded, or with surrounding whitespace. Stored values are bare
    ("8137-24"). Without this, get_case_by_number's exact match silently
    fails and the agent concludes the case has no documents (see #58).

    Lookup normalisation must always agree with the form produced at write
    time, so this delegates to ``_canonical_case_number`` instead of keeping
    a second copy of the same rules.
    """
    return _canonical_case_number(s)


def _canonical_case_number(s: str) -> str:
    """Canonical write-time form per X1 §1: trim · prefix-strip · '/'→'-'.

    Deterministic and format-only — does NOT add or remove a month segment.
    Used at the write boundary for identifier-keyed corpora (internal
    committee decisions, active cases). NOT for external precedents, whose
    canonical identifier is the full citation.

    Empty/None input yields "". Input without any digit is only trimmed and
    has '/' replaced.
    """
    s = (s or "").strip()
    first_digit = re.search(r"\d", s)
    if first_digit:
        # Drop any leading proceeding-type prefix: everything before the
        # first digit.
        s = s[first_digit.start():]
    # No second strip needed: the string was trimmed above and now either
    # starts at a digit or is unchanged.
    return s.replace("/", "-")


def _content_hash(text: str) -> str:
    """SHA-256 hex of the text — deterministic content fingerprint (FU-3/GAP-09).

    Empty/None → "" (a row with no text has no content fingerprint).
    """
    if not text:
        return ""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
async def get_case_by_number(case_number: str) -> dict | None:
    """Look a case up by its number, tolerating common formatting variants.

    Returns the matching case as a dict, or ``None`` when nothing matches.
    """
    pool = await get_pool()
    norm = _normalize_case_number(case_number)
    async with pool.acquire() as conn:
        # Exact match first (fast path + preferred); fall back to a
        # separator/prefix-normalised comparison so common formatting
        # variants still resolve to the right case. See #58.
        row = await conn.fetchrow(
            """SELECT * FROM cases
               WHERE case_number = $1
                  OR replace(btrim(case_number), '/', '-') = $2
               ORDER BY (case_number = $1) DESC, created_at
               LIMIT 1""",
            case_number,
            norm,
        )
    if row is None:
        return None
    return _row_to_case(row)


async def list_cases(
    status: str | None = None,
    limit: int = 50,
    include_archived: bool = False,
    archived_only: bool = False,
) -> list[dict]:
    """List cases, most recently updated first.

    Args:
        status: When truthy, only cases with this status.
        limit: Maximum number of rows.
        include_archived: Include archived cases alongside active ones.
        archived_only: Return only archived cases (takes precedence over
            ``include_archived``).
    """
    pool = await get_pool()
    where = []
    args: list = []
    if status:
        args.append(status)
        where.append(f"status = ${len(args)}")
    if archived_only:
        where.append("archived_at IS NOT NULL")
    elif not include_archived:
        where.append("archived_at IS NULL")
    where_clause = f"WHERE {' AND '.join(where)}" if where else ""
    args.append(limit)
    sql = f"SELECT * FROM cases {where_clause} ORDER BY updated_at DESC LIMIT ${len(args)}"
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *args)
    return [_row_to_case(r) for r in rows]


async def update_case(case_id: UUID, **fields) -> dict | None:
    """Update arbitrary columns of a case and return the updated row.

    Args:
        case_id: Primary key of the case.
        **fields: Column name → new value. ``appellants``, ``respondents``
            and ``tags`` are JSON-encoded before being sent.

    Returns:
        The updated case dict, or ``None`` if no such case exists. With no
        fields, the current row is returned unchanged.

    Raises:
        ValueError: If a field name is not a plain SQL identifier. Column
            names cannot be bound as query parameters, so they are validated
            before being interpolated into the statement.
    """
    if not fields:
        return await get_case(case_id)
    for key in fields:
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", key):
            raise ValueError(f"invalid column name: {key!r}")

    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for the id; field placeholders start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key in ("appellants", "respondents", "tags"):
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    set_clauses.append("updated_at = now()")
    sql = f"UPDATE cases SET {', '.join(set_clauses)} WHERE id = $1 RETURNING *"
    async with pool.acquire() as conn:
        row = await conn.fetchrow(sql, case_id, *values)
    return _row_to_case(row) if row else None


def _row_to_case(row: asyncpg.Record) -> dict:
    """Convert a ``cases`` row to a plain dict.

    JSON columns that arrive as strings are decoded and the id is
    stringified.
    """
    d = dict(row)
    for field in ("appellants", "respondents", "tags"):
        if isinstance(d.get(field), str):
            d[field] = json.loads(d[field])
    d["id"] = str(d["id"])
    return d
async def archive_case(case_id: UUID) -> dict | None:
    """Stamp ``archived_at`` on a case.

    Returns the updated case dict, or ``None`` when the id matches no row.
    """
    statement = (
        "UPDATE cases SET archived_at = now(), updated_at = now() "
        "WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        updated = await conn.fetchrow(statement, case_id)
    if not updated:
        return None
    return _row_to_case(updated)


async def restore_case(case_id: UUID) -> dict | None:
    """Reset ``archived_at`` to NULL for a case.

    Returns the updated case dict, or ``None`` when the id matches no row.
    """
    statement = (
        "UPDATE cases SET archived_at = NULL, updated_at = now() "
        "WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        updated = await conn.fetchrow(statement, case_id)
    if not updated:
        return None
    return _row_to_case(updated)
async def delete_case(case_id: UUID) -> bool:
    """Delete a case row. Returns True if a row was actually removed.

    All dependent rows are removed automatically by FK constraints:
      • CASCADE: documents, document_chunks, claims, appraiser_facts,
        decisions, qa_results, case_precedents
      • SET NULL: audit_log.case_id, chair_feedback.case_id

    NOTE: this only touches the legal-ai database. The Paperclip project
    (issues, comments, runs) and Gitea repo for the case live in other
    systems and are NOT cleaned up here — call sites that need a full reset
    must handle those separately.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute("DELETE FROM cases WHERE id = $1", case_id)
    # asyncpg's execute() returns the command status, e.g. "DELETE 1" — the
    # last token is the affected-row count.
    return int(result.split()[-1]) > 0


# ── Document CRUD ───────────────────────────────────────────────────


async def create_document(
    case_id: UUID,
    doc_type: str,
    title: str,
    file_path: str,
    page_count: int | None = None,
    content_hash: str = "",
) -> dict:
    """Insert a document row and return it as a dict.

    ``content_hash`` is the file fingerprint used for idempotent upload; the
    empty default marks an unknown hash.
    """
    pool = await get_pool()
    doc_id = uuid4()
    async with pool.acquire() as conn:
        # RETURNING * replaces the former follow-up SELECT: one round trip.
        row = await conn.fetchrow(
            """INSERT INTO documents (id, case_id, doc_type, title, file_path,
                                      page_count, content_hash)
               VALUES ($1, $2, $3, $4, $5, $6, $7)
               RETURNING *""",
            doc_id,
            case_id,
            doc_type,
            title,
            file_path,
            page_count,
            content_hash,
        )
    return _row_to_doc(row)


async def get_document_by_hash(case_id: UUID, content_hash: str) -> dict | None:
    """Return an existing document for this case with the same file hash, or None.

    INV-TOOL3 / GAP-52: deterministic key for idempotent upload. Empty hashes
    (legacy rows) are never matched.
    """
    if not content_hash:
        return None
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM documents WHERE case_id = $1 AND content_hash = $2 LIMIT 1",
            case_id,
            content_hash,
        )
    return _row_to_doc(row) if row else None


async def update_document(doc_id: UUID, **fields) -> None:
    """Update arbitrary columns of a document.

    Args:
        doc_id: Primary key of the document.
        **fields: Column name → new value. ``metadata`` is JSON-encoded.
            With no fields this is a no-op.

    Raises:
        ValueError: If a field name is not a plain SQL identifier. Column
            names cannot be bound as query parameters, so they are validated
            before being interpolated into the statement.
    """
    if not fields:
        return
    for key in fields:
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", key):
            raise ValueError(f"invalid column name: {key!r}")

    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for the id; field placeholders start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key == "metadata":
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    sql = f"UPDATE documents SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, doc_id, *values)


async def get_document(doc_id: UUID) -> dict | None:
    """Fetch a document by primary key; ``None`` when it does not exist."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id)
    return _row_to_doc(row) if row else None
async def list_documents(case_id: UUID) -> list[dict]:
    """Return all documents of a case, oldest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at",
            case_id,
        )
    return [_row_to_doc(r) for r in rows]


async def get_document_text(doc_id: UUID) -> str:
    """Return a document's extracted text.

    Yields "" both when the document does not exist and when the stored
    text is NULL, so the declared ``str`` return type always holds.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT extracted_text FROM documents WHERE id = $1", doc_id
        )
    return (row["extracted_text"] or "") if row else ""


def _row_to_doc(row: asyncpg.Record) -> dict:
    """Convert a ``documents`` row to a plain dict.

    Ids are stringified and a string ``metadata`` value is JSON-decoded.
    A NULL ``case_id`` stays ``None`` instead of becoming the literal string
    "None" (other code in this module queries documents with
    ``case_id IS NULL``, so such rows exist).
    """
    d = dict(row)
    d["id"] = str(d["id"])
    d["case_id"] = None if d["case_id"] is None else str(d["case_id"])
    if isinstance(d.get("metadata"), str):
        d["metadata"] = json.loads(d["metadata"])
    return d


# ── Claims ─────────────────────────────────────────────────────────


async def store_claims(case_id: UUID, claims: list[dict], source_document: str = "") -> int:
    """Store extracted claims. Replaces existing claims from same source.

    Each claim dict: party_role, claim_text, claim_index, party_name (optional)

    The delete of the previous claims and the insert of the new ones run in
    one transaction, so a failure part-way cannot leave the source with its
    old claims gone and only some of the new ones stored.

    Returns:
        The number of claims passed in.

    Raises:
        KeyError: If a claim lacks ``party_role`` or ``claim_text`` (raised
            before anything is written).
    """
    # Build every parameter tuple up front so malformed input is rejected
    # before the existing claims are touched.
    rows = [
        (
            case_id,
            claim["party_role"],
            claim.get("party_name", ""),
            claim["claim_text"],
            claim.get("claim_index", 0),
            source_document,
            claim.get("claim_type", "claim"),
        )
        for claim in claims
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            if source_document:
                await conn.execute(
                    "DELETE FROM claims WHERE case_id = $1 AND source_document = $2",
                    case_id,
                    source_document,
                )
            if rows:
                await conn.executemany(
                    """INSERT INTO claims
                           (case_id, party_role, party_name, claim_text,
                            claim_index, source_document, claim_type)
                       VALUES ($1, $2, $3, $4, $5, $6, $7)""",
                    rows,
                )
    return len(claims)


async def get_claims(case_id: UUID, party_role: str | None = None) -> list[dict]:
    """Get claims for a case, optionally filtered by party role."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        if party_role:
            rows = await conn.fetch(
                "SELECT * FROM claims WHERE case_id = $1 AND party_role = $2 ORDER BY claim_index",
                case_id,
                party_role,
            )
        else:
            rows = await conn.fetch(
                "SELECT * FROM claims WHERE case_id = $1 ORDER BY party_role, claim_index",
                case_id,
            )
    return [dict(r) for r in rows]
# ── Decisions ──────────────────────────────────────────────────────


def _row_to_decision(row: asyncpg.Record) -> dict:
    """Convert a ``decisions`` row to a plain dict (ids stringified, JSON decoded)."""
    d = dict(row)
    d["id"] = str(d["id"])
    d["case_id"] = str(d["case_id"])
    for field in ("direction_doc", "panel_members"):
        if isinstance(d.get(field), str):
            d[field] = json.loads(d[field])
    return d


async def create_decision(
    case_id: UUID,
    outcome: str = "",
    outcome_summary: str = "",
    outcome_reasoning: str = "",
    direction_doc: dict | None = None,
) -> dict:
    """Create a decision record for a case.

    The new row's version is one higher than the case's latest decision, or
    1 when the case has none. A falsy ``direction_doc`` is stored as NULL.

    Returns:
        The inserted decision as a dict.
    """
    pool = await get_pool()
    decision_id = uuid4()
    async with pool.acquire() as conn:
        # Check if a decision already exists for this case.
        # NOTE(review): the version is read and then written in separate
        # statements; two concurrent callers could compute the same version.
        existing = await conn.fetchrow(
            "SELECT id, version FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1",
            case_id,
        )
        version = (existing["version"] + 1) if existing else 1
        row = await conn.fetchrow(
            """INSERT INTO decisions (id, case_id, version, outcome, outcome_summary,
                                      outcome_reasoning, direction_doc)
               VALUES ($1, $2, $3, $4, $5, $6, $7)
               RETURNING *""",
            decision_id,
            case_id,
            version,
            outcome,
            outcome_summary,
            outcome_reasoning,
            json.dumps(direction_doc) if direction_doc else None,
        )
    return _row_to_decision(row)


async def get_decision(decision_id: UUID) -> dict | None:
    """Fetch a decision by primary key; ``None`` when it does not exist."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("SELECT * FROM decisions WHERE id = $1", decision_id)
    return _row_to_decision(row) if row else None


async def get_decision_by_case(case_id: UUID) -> dict | None:
    """Get the latest decision for a case."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1",
            case_id,
        )
    return _row_to_decision(row) if row else None
async def get_critical_qa_failures(case_id: UUID) -> list[dict]:
    """Return critical-severity failures from the case's latest QA run.

    ``qa_results`` is cleared+rewritten per ``validate_decision`` run, so the
    current rows for a ``case_id`` ARE the latest run. Returns rows where
    ``severity='critical' AND passed=false``. Callers distinguish "no QA run
    yet" (no rows at all) via ``qa_run_exists`` below.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """SELECT check_name, severity, passed, errors
               FROM qa_results
               WHERE case_id = $1 AND severity = 'critical' AND passed = false
               ORDER BY check_name""",
            case_id,
        )
    return [dict(r) for r in rows]


async def qa_run_exists(case_id: UUID) -> bool:
    """True if a QA run has ever been recorded for this case (any rows)."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        n = await conn.fetchval(
            "SELECT count(*) FROM qa_results WHERE case_id = $1",
            case_id,
        )
    return bool(n)


async def update_decision(decision_id: UUID, **fields) -> None:
    """Update arbitrary columns of a decision.

    Args:
        decision_id: Primary key of the decision.
        **fields: Column name → new value. ``direction_doc`` and
            ``panel_members`` are JSON-encoded when given as dict/list.
            With no fields this is a no-op.

    Raises:
        ValueError: If a field name is not a plain SQL identifier. Column
            names cannot be bound as query parameters, so they are validated
            before being interpolated into the statement.
    """
    if not fields:
        return
    for key in fields:
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", key):
            raise ValueError(f"invalid column name: {key!r}")

    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for the id; field placeholders start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key in ("direction_doc", "panel_members") and isinstance(val, (dict, list)):
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    set_clauses.append("updated_at = now()")
    sql = f"UPDATE decisions SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, decision_id, *values)


# ── Document deletion ──────────────────────────────────────────────
async def delete_document(doc_id: UUID) -> bool:
    """Delete a document and all its chunks. Returns True if deleted."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM document_chunks WHERE document_id = $1", doc_id
            )
            result = await conn.execute(
                "DELETE FROM documents WHERE id = $1", doc_id
            )
    # Command status looks like "DELETE 1"; the last token is the row count.
    return int(result.split()[-1]) > 0


# ── Chunks & Vectors ───────────────────────────────────────────────


async def delete_document_chunks(document_id: UUID) -> int:
    """Delete all chunks for a document (used before reprocessing)."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute(
            "DELETE FROM document_chunks WHERE document_id = $1", document_id
        )
    return int(result.split()[-1])  # e.g. "DELETE 5" -> 5


async def store_chunks(
    document_id: UUID,
    case_id: UUID | None,
    chunks: list[dict],
) -> int:
    """Store document chunks with embeddings, replacing any existing ones.

    Each chunk dict has: content, section_type, embedding (list[float]),
    page_number, chunk_index

    The delete of the old chunks and the insert of the new ones run in one
    transaction, so a failure part-way cannot leave the document with only
    part of its chunks. Rows are sent with ``executemany`` instead of one
    statement per chunk.

    Returns:
        The number of chunks passed in.

    Raises:
        KeyError: If a chunk lacks ``chunk_index``, ``content`` or
            ``embedding`` (raised before anything is written).
    """
    # Build every parameter tuple up front so malformed input is rejected
    # before the existing chunks are deleted.
    rows = [
        (
            document_id,
            case_id,
            chunk["chunk_index"],
            chunk["content"],
            chunk.get("section_type", "other"),
            chunk["embedding"],
            chunk.get("page_number"),
        )
        for chunk in chunks
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            # Delete existing chunks for this document
            await conn.execute(
                "DELETE FROM document_chunks WHERE document_id = $1", document_id
            )
            if rows:
                await conn.executemany(
                    """INSERT INTO document_chunks
                           (document_id, case_id, chunk_index, content,
                            section_type, embedding, page_number)
                       VALUES ($1, $2, $3, $4, $5, $6, $7)""",
                    rows,
                )
    return len(chunks)
async def search_similar(
    query_embedding: list[float],
    limit: int = 10,
    case_id: UUID | None = None,
    section_type: str | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Rank document chunks by cosine similarity to ``query_embedding``.

    Every truthy optional argument adds an equality filter. Each result
    carries the chunk content, its document title and case number, and
    ``score`` (1 minus the cosine distance).
    """
    # NOTE(review): the inner join on cases means chunks stored with a NULL
    # case_id never appear here — confirm that is intended.
    optional_filters = (
        ("dc.case_id", case_id),
        ("dc.section_type", section_type),
        ("c.practice_area", practice_area),
        ("c.appeal_subtype", appeal_subtype),
    )
    # $1 = embedding and $2 = limit; filters take $3 onwards.
    params: list = [query_embedding, limit]
    conditions = []
    for column, wanted in optional_filters:
        if wanted:
            params.append(wanted)
            conditions.append(f"{column} = ${len(params)}")

    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    sql = f"""
        SELECT dc.content, dc.section_type, dc.page_number,
               dc.document_id, dc.case_id,
               d.title AS document_title,
               c.case_number,
               1 - (dc.embedding <=> $1) AS score
        FROM document_chunks dc
        JOIN documents d ON d.id = dc.document_id
        JOIN cases c ON c.id = dc.case_id
        {where}
        ORDER BY dc.embedding <=> $1
        LIMIT $2
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        hits = await conn.fetch(sql, *params)
    return [dict(hit) for hit in hits]


# ── Style corpus ────────────────────────────────────────────────────


async def add_to_style_corpus(
    document_id: UUID | None,
    decision_number: str,
    decision_date: date | None,
    subject_categories: list[str],
    full_text: str,
    summary: str = "",
    outcome: str = "",
    key_principles: list[str] | None = None,
    practice_area: str = "appeals_committee",
    appeal_subtype: str = "",
) -> UUID:
    """Insert one decision into ``style_corpus`` and return the new row id.

    ``subject_categories`` and ``key_principles`` are stored as JSON arrays
    (a ``None`` ``key_principles`` becomes ``[]``).
    """
    new_id = uuid4()
    insert_sql = """INSERT INTO style_corpus
                        (id, document_id, decision_number, decision_date,
                         subject_categories, full_text, summary, outcome,
                         key_principles, practice_area, appeal_subtype)
                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)"""
    arguments = (
        new_id,
        document_id,
        decision_number,
        decision_date,
        json.dumps(subject_categories),
        full_text,
        summary,
        outcome,
        json.dumps(key_principles or []),
        practice_area,
        appeal_subtype,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(insert_sql, *arguments)
    return new_id
async def delete_from_style_corpus(corpus_id: UUID) -> dict:
    """Remove a decision from style_corpus + related documents (cascades chunks).

    Also tries to delete the corpus-tagged document associated by title
    match, since the current training pipeline inserts style_corpus with
    document_id=NULL.

    Returns:
        ``{"deleted": False, "reason": "not found"}`` when the id is
        unknown; otherwise ``{"deleted": True, "decision_number": ...,
        "docs_deleted": 0 or 1}``.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            row = await conn.fetchrow(
                "DELETE FROM style_corpus WHERE id = $1 "
                "RETURNING decision_number, document_id",
                corpus_id,
            )
            if not row:
                return {"deleted": False, "reason": "not found"}

            docs_deleted = 0
            if row["document_id"]:
                await conn.execute(
                    "DELETE FROM documents WHERE id = $1", row["document_id"]
                )
                docs_deleted = 1
            elif row["decision_number"]:
                # Best-effort: match a corpus document by the decision_number
                # in its title. Only for single, unambiguous matches.
                # strpos() does a literal substring test; the previous
                # LIKE '%…%' treated any '%', '_' or '\' inside the decision
                # number as a wildcard/escape, which could single out (and
                # delete) an unrelated document.
                docs = await conn.fetch(
                    "SELECT id FROM documents "
                    "WHERE case_id IS NULL AND strpos(title, $1) > 0",
                    row["decision_number"],
                )
                if len(docs) == 1:
                    await conn.execute(
                        "DELETE FROM documents WHERE id = $1", docs[0]["id"]
                    )
                    docs_deleted = 1

    return {
        "deleted": True,
        "decision_number": row["decision_number"],
        "docs_deleted": docs_deleted,
    }


async def get_style_corpus_row(corpus_id: UUID) -> dict | None:
    """Return a single style_corpus row by id, or None if missing."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            SELECT id, document_id, decision_number, decision_date,
                   subject_categories, full_text, summary, outcome,
                   key_principles, practice_area, appeal_subtype, created_at
            FROM style_corpus
            WHERE id = $1
            """,
            corpus_id,
        )
    return dict(row) if row else None
async def update_style_corpus_metadata(
    corpus_id: UUID,
    *,
    summary: str | None = None,
    outcome: str | None = None,
    key_principles: list[str] | None = None,
    appeal_subtype: str | None = None,
    practice_area: str | None = None,
    overwrite: bool = False,
) -> dict:
    """Patch the enriched-metadata columns of a style_corpus row.

    Arguments left as ``None`` are ignored. A supplied value is written only
    when the stored column is empty, unless ``overwrite=True`` — the caller's
    explicit signal that existing values should be replaced (used by the
    manual re-extract flow).

    Returns:
        ``{"updated": True, "fields": [...]}`` naming the columns written,
        ``{"updated": False, "reason": "nothing to update", "fields": []}``
        when no column qualified, or
        ``{"updated": False, "reason": "not found"}`` for an unknown id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
            "SELECT summary, outcome, key_principles, appeal_subtype, practice_area "
            "FROM style_corpus WHERE id = $1",
            corpus_id,
        )
        if not existing:
            return {"updated": False, "reason": "not found"}

        def _is_blank(column: str) -> bool:
            # NULL and whitespace-only values both count as empty.
            return not (existing[column] or "").strip()

        pending: dict = {}

        for column, proposed in (("summary", summary), ("outcome", outcome)):
            if proposed is not None and (overwrite or _is_blank(column)):
                pending[column] = proposed

        if key_principles is not None:
            stored = existing["key_principles"]
            if isinstance(stored, str):
                # The JSON column may arrive as text; unparseable text is
                # treated as an empty list.
                try:
                    stored = json.loads(stored)
                except json.JSONDecodeError:
                    stored = []
            if overwrite or not (stored or []):
                pending["key_principles"] = json.dumps(key_principles)

        for column, proposed in (
            ("appeal_subtype", appeal_subtype),
            ("practice_area", practice_area),
        ):
            if proposed is not None and (overwrite or _is_blank(column)):
                pending[column] = proposed

        if not pending:
            return {"updated": False, "reason": "nothing to update", "fields": []}

        cols = list(pending)
        # $1 is the row id; column placeholders start at $2.
        assignments = ", ".join(
            f"{name} = ${position}" for position, name in enumerate(cols, start=2)
        )
        await conn.execute(
            f"UPDATE style_corpus SET {assignments} WHERE id = $1",
            corpus_id,
            *(pending[name] for name in cols),
        )
        return {"updated": True, "fields": cols}


# ── decision_lessons (per-corpus row notes) ────────────────────────
async def list_decision_lessons(corpus_id: UUID) -> list[dict]:
    """Return the lessons attached to one style_corpus row, newest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT id, style_corpus_id, lesson_text, category, source, "
            " applied_to_skill, created_by, created_at, updated_at "
            "FROM decision_lessons WHERE style_corpus_id = $1 "
            "ORDER BY created_at DESC",
            corpus_id,
        )
    return [dict(r) for r in rows]


async def add_decision_lesson(
    corpus_id: UUID,
    *,
    lesson_text: str,
    category: str = "general",
    source: str = "manual",
    created_by: str = "chaim",
) -> dict:
    """Attach a new lesson to a style_corpus row and return the stored row."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "INSERT INTO decision_lessons "
            "(style_corpus_id, lesson_text, category, source, created_by) "
            "VALUES ($1, $2, $3, $4, $5) "
            "RETURNING id, style_corpus_id, lesson_text, category, source, "
            " applied_to_skill, created_by, created_at, updated_at",
            corpus_id,
            lesson_text,
            category,
            source,
            created_by,
        )
    return dict(row) if row else {}


async def update_decision_lesson(
    lesson_id: UUID,
    *,
    lesson_text: str | None = None,
    category: str | None = None,
    applied_to_skill: bool | None = None,
) -> dict:
    """Patch a lesson; arguments left as ``None`` are not touched.

    ``updated_at`` is refreshed whenever at least one column is written.

    Returns:
        ``{"updated": True, ...row columns}`` on success,
        ``{"updated": False, "reason": "nothing to update"}`` when every
        argument is ``None`` (no database access happens in that case), or
        ``{"updated": False, "reason": "not found"}`` for an unknown id.
    """
    sets: dict = {}
    if lesson_text is not None:
        sets["lesson_text"] = lesson_text
    if category is not None:
        sets["category"] = category
    if applied_to_skill is not None:
        sets["applied_to_skill"] = applied_to_skill
    if not sets:
        return {"updated": False, "reason": "nothing to update"}

    cols = list(sets)
    # $1 is the lesson id; column placeholders start at $2. updated_at is
    # set by the database clock, so it is appended as literal SQL rather
    # than carried through ``sets`` as a placeholder value.
    set_clause = ", ".join(f"{c} = ${i}" for i, c in enumerate(cols, start=2))
    set_clause += ", updated_at = now()"
    values = [sets[c] for c in cols]

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            f"UPDATE decision_lessons SET {set_clause} WHERE id = $1 "
            f"RETURNING id, style_corpus_id, lesson_text, category, source, "
            f" applied_to_skill, updated_at",
            lesson_id,
            *values,
        )
    if not row:
        return {"updated": False, "reason": "not found"}
    return {"updated": True, **dict(row)}
async def delete_decision_lesson(lesson_id: UUID) -> dict:
    """Delete one lesson; ``{"deleted": bool}`` says whether a row was removed."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM decision_lessons WHERE id = $1",
            lesson_id,
        )
    # asyncpg returns a command status such as "DELETE 1"; take what follows
    # the first space as the row count.
    if " " in status:
        count = status.partition(" ")[2].strip()
    else:
        count = "0"
    return {"deleted": count != "0"}


async def count_decision_lessons_per_corpus() -> dict[str, int]:
    """Map style_corpus.id (str) → lesson count, for badge display in the list."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        tallies = await conn.fetch(
            "SELECT style_corpus_id, count(*) AS n "
            "FROM decision_lessons GROUP BY style_corpus_id"
        )
    counts: dict[str, int] = {}
    for tally in tallies:
        counts[str(tally["style_corpus_id"])] = tally["n"]
    return counts


# ── chat (style agent conversations) ───────────────────────────────


async def create_chat_conversation(
    *,
    title: str = "שיחה חדשה",
    style_corpus_id: UUID | None = None,
    system_prompt_version: str = "v1",
) -> dict:
    """Create a chat conversation and return the stored row as a dict."""
    insert_sql = (
        "INSERT INTO chat_conversations "
        "(title, style_corpus_id, system_prompt_version) "
        "VALUES ($1, $2, $3) "
        "RETURNING id, title, style_corpus_id, claude_session_id, "
        " system_prompt_version, created_at, last_message_at"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        created = await conn.fetchrow(
            insert_sql, title, style_corpus_id, system_prompt_version
        )
    if not created:
        return {}
    return dict(created)


async def list_chat_conversations(limit: int = 50) -> list[dict]:
    """List conversations, most recent message first (NULL timestamps last).

    Each entry includes the linked corpus decision number, if any, and the
    number of messages in the conversation.
    """
    listing_sql = """
        SELECT c.id, c.title, c.style_corpus_id, c.claude_session_id,
               c.created_at, c.last_message_at,
               sc.decision_number,
               (SELECT count(*) FROM chat_messages m
                 WHERE m.conversation_id = c.id) AS message_count
        FROM chat_conversations c
        LEFT JOIN style_corpus sc ON sc.id = c.style_corpus_id
        ORDER BY c.last_message_at DESC NULLS LAST
        LIMIT $1
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        conversations = await conn.fetch(listing_sql, limit)
    return [dict(conversation) for conversation in conversations]


async def get_chat_conversation(conv_id: UUID) -> dict | None:
    """Fetch one conversation by id; ``None`` when it does not exist."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow(
            "SELECT id, title, style_corpus_id, claude_session_id, "
            " system_prompt_version, created_at, last_message_at "
            "FROM chat_conversations WHERE id = $1",
            conv_id,
        )
    if not found:
        return None
    return dict(found)
async def delete_chat_conversation(conv_id: UUID) -> dict:
    """Delete a conversation; ``{"deleted": bool}`` says whether a row was removed."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute(
            "DELETE FROM chat_conversations WHERE id = $1",
            conv_id,
        )
    # Command status looks like "DELETE 1"; anything but "0" means a row went.
    deleted = result.split(" ", 1)[1].strip() if " " in result else "0"
    return {"deleted": deleted != "0"}


async def update_chat_conversation_session_id(
    conv_id: UUID,
    claude_session_id: str,
) -> None:
    """Store the Claude session id on a conversation and bump last_message_at."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE chat_conversations SET claude_session_id = $1, "
            " last_message_at = now() "
            "WHERE id = $2",
            claude_session_id,
            conv_id,
        )


async def add_chat_message(
    conv_id: UUID,
    *,
    role: str,
    content: str,
    raw_events: list | None = None,
) -> dict:
    """Append a message to a conversation and bump its last_message_at.

    ``raw_events`` is stored as a JSON array (``None`` becomes ``[]``).

    Returns:
        The inserted message row (id, conversation_id, role, content,
        created_at) as a dict.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Both writes share one transaction so a stored message is never
        # left without the matching last_message_at bump.
        async with conn.transaction():
            row = await conn.fetchrow(
                "INSERT INTO chat_messages "
                "(conversation_id, role, content, raw_events) "
                "VALUES ($1, $2, $3, $4) "
                "RETURNING id, conversation_id, role, content, created_at",
                conv_id,
                role,
                content,
                json.dumps(raw_events or []),
            )
            await conn.execute(
                "UPDATE chat_conversations SET last_message_at = now() WHERE id = $1",
                conv_id,
            )
    return dict(row) if row else {}


async def list_chat_messages(conv_id: UUID) -> list[dict]:
    """Return a conversation's messages in chronological order."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT id, role, content, created_at "
            "FROM chat_messages WHERE conversation_id = $1 "
            "ORDER BY created_at ASC",
            conv_id,
        )
    return [dict(r) for r in rows]


async def get_style_patterns(pattern_type: str | None = None) -> list[dict]:
    """Return style patterns, optionally restricted to one pattern_type.

    Rows are ordered by descending frequency (grouped by pattern_type first
    when no filter is given).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        if pattern_type:
            rows = await conn.fetch(
                "SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC",
                pattern_type,
            )
        else:
            rows = await conn.fetch(
                "SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC"
            )
    return [dict(r) for r in rows]
async def get_methodology_overrides(category: str) -> dict:
    """Chair's /methodology edits for one category.

    Categories are golden_ratios / discussion_rules / content_checklists.
    Returns ``{rule_key: parsed_value}`` for the ``_global`` appeal type.
    These OVERRIDE the hardcoded lessons.py defaults — the writer must
    consume them (T15 / INV-LRN4). Mirrors the merge in
    GET /api/methodology/{category}.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            "SELECT rule_key, rule_value FROM appeal_type_rules "
            "WHERE appeal_type = '_global' AND rule_category = $1",
            category,
        )

    overrides: dict = {}
    for record in records:
        value = record["rule_value"]
        if isinstance(value, str):
            # Text values are JSON-decoded when possible; otherwise the raw
            # string is kept as is.
            try:
                value = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                pass
        overrides[record["rule_key"]] = value
    return overrides


async def get_recent_decision_lessons(limit: int = 15, practice_area: str = "") -> list[dict]:
    """Per-decision learnings attached in /training, for the writer (T15).

    Returns up to ``limit`` decision_lessons joined to their style_corpus
    row, most recent first. An empty ``practice_area`` disables the
    practice-area filter. Ordering is by creation time only; the query
    applies no category preference.
    """
    query = """SELECT dl.lesson_text, dl.category, dl.source,
                      sc.decision_number, sc.practice_area
               FROM decision_lessons dl
               JOIN style_corpus sc ON sc.id = dl.style_corpus_id
               WHERE ($2 = '' OR sc.practice_area = $2)
               ORDER BY dl.created_at DESC
               LIMIT $1"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        lessons = await conn.fetch(query, limit, practice_area)
    return [dict(lesson) for lesson in lessons]
async def create_draft_final_pair(case_id: UUID, draft_text: str, final_path: str = "") -> str:
    """Capture the draft↔final pairing at mark-final (T5 / INV-LRN4).

    Stores the draft snapshot with status 'final_received' and returns the
    new row id as a string. final_text/diff_stats/analysis are filled in
    later through ``update_draft_final_pair``.
    """
    insert_sql = """INSERT INTO draft_final_pairs (case_id, draft_text, final_path, status)
                    VALUES ($1, $2, $3, 'final_received')
                    RETURNING id"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        created = await conn.fetchrow(insert_sql, case_id, draft_text, final_path)
    return str(created["id"])


async def update_draft_final_pair(
    pair_id: UUID,
    final_text: str | None = None,
    diff_stats: dict | None = None,
    analysis: dict | None = None,
    status: str | None = None,
) -> None:
    """Advance a pairing row (curator distillation).

    Only arguments that are not ``None`` are written; ``diff_stats`` and
    ``analysis`` are serialised to JSON. With nothing to write the function
    returns without touching the database.
    """
    assignments: list[str] = []
    params: list = []

    def _assign(template: str, value) -> None:
        # Placeholder numbers follow the order in which values are queued.
        params.append(value)
        assignments.append(template.format(len(params)))

    if final_text is not None:
        _assign("final_text = ${}", final_text)
    if diff_stats is not None:
        _assign("diff_stats = ${}::jsonb", json.dumps(diff_stats, ensure_ascii=False))
    if analysis is not None:
        _assign("analysis = ${}::jsonb", json.dumps(analysis, ensure_ascii=False))
    if status is not None:
        _assign("status = ${}", status)

    if not assignments:
        return

    assignments.append("updated_at = now()")
    # The row id takes the placeholder after the last assignment.
    params.append(pair_id)
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            f"UPDATE draft_final_pairs SET {', '.join(assignments)} WHERE id = ${len(params)}",
            *params,
        )
async def list_draft_final_pairs(status: str | None = None, limit: int = 200) -> list[dict]:
    """Reconciliation ledger: pairing rows with their case and status.

    Newest first, at most ``limit`` rows; a truthy ``status`` restricts the
    listing to that status.
    """
    params: list = []
    status_filter = ""
    if status:
        params.append(status)
        status_filter = "WHERE p.status = $1"
    params.append(limit)
    sql = f"""SELECT p.id, p.case_id, c.case_number, c.title, p.status,
                     p.diff_stats, p.created_at, p.updated_at
              FROM draft_final_pairs p
              LEFT JOIN cases c ON c.id = p.case_id
              {status_filter}
              ORDER BY p.created_at DESC
              LIMIT ${len(params)}"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        pairs = await conn.fetch(sql, *params)
    return [dict(pair) for pair in pairs]


async def get_draft_final_pair(pair_id: UUID) -> dict | None:
    """Full pairing row incl. analysis (curator proposal) — for the T14 approval gate.

    Returns ``None`` when the id matches no row.
    """
    sql = """SELECT p.id, p.case_id, c.case_number, c.title, p.status,
                    p.draft_text, p.final_text, p.diff_stats, p.analysis,
                    p.created_at, p.updated_at
             FROM draft_final_pairs p
             LEFT JOIN cases c ON c.id = p.case_id
             WHERE p.id = $1"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        pair = await conn.fetchrow(sql, pair_id)
    if not pair:
        return None
    return dict(pair)


async def insert_style_exemplar(
    decision_number: str,
    source: str,
    practice_area: str,
    outcome: str,
    section: str,
    paragraph_text: str,
    word_count: int,
    embedding: list[float],
) -> None:
    """Insert one block-level style exemplar (T1 backfill)."""
    values = (
        decision_number,
        source,
        practice_area,
        outcome,
        section,
        paragraph_text,
        word_count,
        embedding,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            """INSERT INTO style_exemplars
                   (decision_number, source, practice_area, outcome,
                    section, paragraph_text, word_count, embedding)
               VALUES ($1, $2, $3, $4, $5, $6, $7, $8)""",
            *values,
        )


async def delete_style_exemplars(decision_number: str, source: str) -> int:
    """Idempotent backfill: clear a decision's exemplars before re-inserting.

    Returns the number of rows removed, or 0 if the command status cannot
    be parsed.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM style_exemplars WHERE decision_number = $1 AND source = $2",
            decision_number,
            source,
        )
    tokens = status.split()
    try:
        return int(tokens[-1])
    except (ValueError, IndexError):
        return 0
async def search_style_exemplars(
    query_embedding: list[float],
    section: str | None = None,
    outcome: str | None = None,
    practice_area: str | None = None,
    limit: int = 6,
) -> list[dict]:
    """Retrieve block-level paragraphs as STYLE exemplars (T2).

    Results are ordered by cosine distance to ``query_embedding`` and carry
    ``score`` (1 minus that distance). ``section`` is an exact filter;
    ``outcome`` and ``practice_area`` also accept rows whose value is the
    empty string, so untagged exemplars remain eligible.
    """
    filters = (
        ("section = ${}", section),
        ("(outcome = ${} OR outcome = '')", outcome),
        ("(practice_area = ${} OR practice_area = '')", practice_area),
    )
    # $1 = embedding and $2 = limit; filters take $3 onwards.
    params: list = [query_embedding, limit]
    conditions = []
    for template, wanted in filters:
        if wanted:
            params.append(wanted)
            conditions.append(template.format(len(params)))

    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    sql = f"""
        SELECT decision_number, source, section, outcome, practice_area,
               paragraph_text, word_count,
               1 - (embedding <=> $1) AS score
        FROM style_exemplars
        {where}
        ORDER BY embedding <=> $1
        LIMIT $2
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        exemplars = await conn.fetch(sql, *params)
    return [dict(exemplar) for exemplar in exemplars]


async def count_style_exemplars() -> dict:
    """Coverage check for the backfill.

    Returns the total exemplar count, the number of distinct decisions and
    a per-section breakdown (largest section first).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        total = await conn.fetchval("SELECT count(*) FROM style_exemplars")
        section_rows = await conn.fetch(
            "SELECT section, count(*) AS n FROM style_exemplars GROUP BY section ORDER BY n DESC"
        )
        decisions = await conn.fetchval(
            "SELECT count(DISTINCT decision_number) FROM style_exemplars"
        )
    breakdown = [dict(section_row) for section_row in section_rows]
    return {"total": total, "decisions": decisions, "by_section": breakdown}
async def clear_style_patterns(appeal_subtype: str = "") -> None:
    """Delete style patterns, optionally restricted to one appeal_subtype.

    An empty ``appeal_subtype`` deletes ALL patterns.
    """
    if appeal_subtype:
        statement = "DELETE FROM style_patterns WHERE appeal_subtype = $1"
        args: tuple = (appeal_subtype,)
    else:
        statement = "DELETE FROM style_patterns"
        args = ()
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(statement, *args)


# ── Semantic Search (V2 — decision blocks & case law) ─────────────


async def search_similar_paragraphs(
    query_embedding: list[float],
    limit: int = 10,
    block_type: str | None = None,
) -> list[dict]:
    """Find decision paragraphs closest to ``query_embedding``.

    ``block_type``, when given, restricts hits to paragraphs whose block has
    that ``block_id``. Each hit carries ``score = 1 - distance`` plus the
    block, decision and case columns selected below.
    """
    pool = await get_pool()
    args: list = [query_embedding, limit]
    where = ""
    if block_type:
        args.append(block_type)
        where = f"WHERE db.block_id = ${len(args)}"
    sql = f"""
        SELECT dp.content, dp.word_count, dp.paragraph_number,
               db.block_id AS block_type, db.title AS block_title,
               c.case_number, c.title AS case_title,
               d.outcome, d.author,
               1 - (pe.embedding <=> $1) AS score
        FROM paragraph_embeddings pe
        JOIN decision_paragraphs dp ON dp.id = pe.paragraph_id
        JOIN decision_blocks db ON db.id = dp.block_id
        JOIN decisions d ON d.id = db.decision_id
        JOIN cases c ON c.id = d.case_id
        {where}
        ORDER BY pe.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *args)
    return [dict(record) for record in records]


async def search_similar_case_law(
    query_embedding: list[float],
    limit: int = 5,
) -> list[dict]:
    """Find case-law chunks closest to ``query_embedding``.

    ``subject_tags`` is decoded from JSON when the driver returns it as a
    string.
    """
    pool = await get_pool()
    sql = """
        SELECT cl.case_number, cl.case_name, cl.court,
               cl.summary, cl.key_quote, cl.subject_tags,
               cle.chunk_text,
               1 - (cle.embedding <=> $1) AS score
        FROM case_law_embeddings cle
        JOIN case_law cl ON cl.id = cle.case_law_id
        ORDER BY cle.embedding <=> $1
        LIMIT $2
    """
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, query_embedding, limit)

    hits: list[dict] = []
    for record in records:
        hit = dict(record)
        raw_tags = hit.get("subject_tags")
        if isinstance(raw_tags, str):
            hit["subject_tags"] = json.loads(raw_tags)
        hits.append(hit)
    return hits
async def search_precedents(
    query_embedding: list[float],
    limit: int = 10,
) -> list[dict]:
    """Combined search over decision paragraphs and case law.

    Both sources are queried with the same ``limit``. The hits are merged,
    sorted by descending score, and the top ``limit`` are returned.
    Paragraph content is truncated to 500 characters.
    """
    paragraph_hits = await search_similar_paragraphs(query_embedding, limit=limit)
    case_law_hits = await search_similar_case_law(query_embedding, limit=limit)

    merged: list[dict] = [
        {
            "type": "decision_paragraph",
            "score": float(hit["score"]),
            "case_number": hit["case_number"],
            "case_title": hit["case_title"],
            "block_type": hit["block_type"],
            "content": hit["content"][:500],
            "author": hit["author"],
        }
        for hit in paragraph_hits
    ]
    merged.extend(
        {
            "type": "case_law",
            "score": float(hit["score"]),
            "case_number": hit["case_number"],
            "case_name": hit["case_name"],
            "court": hit["court"],
            "content": hit["summary"],
        }
        for hit in case_law_hits
    )
    merged.sort(key=lambda entry: entry["score"], reverse=True)
    return merged[:limit]


# ── Case precedents (CRUD) ────────────────────────────────────────


async def create_case_precedent(
    case_id: UUID,
    quote: str,
    citation: str,
    section_id: str | None = None,
    chair_note: str = "",
    pdf_document_id: UUID | None = None,
    practice_area: str | None = None,
) -> dict:
    """Attach a new precedent to a case and return the inserted row."""
    insert_sql = """
        INSERT INTO case_precedents
            (case_id, section_id, quote, citation, chair_note,
             pdf_document_id, practice_area)
        VALUES ($1, $2, $3, $4, $5, $6, $7)
        RETURNING *
    """
    pool = await get_pool()
    inserted = await pool.fetchrow(
        insert_sql,
        case_id,
        section_id,
        quote,
        citation,
        chair_note,
        pdf_document_id,
        practice_area,
    )
    return dict(inserted)
async def list_case_precedents(case_id: UUID) -> list[dict]:
    """Return a case's precedents, ordered by section (NULLs last) then creation time."""
    pool = await get_pool()
    records = await pool.fetch(
        """
        SELECT id, case_id, section_id, quote, citation, chair_note,
               pdf_document_id, practice_area, created_at, updated_at
        FROM case_precedents
        WHERE case_id = $1
        ORDER BY section_id NULLS LAST, created_at
        """,
        case_id,
    )
    return [dict(record) for record in records]


async def delete_case_precedent(precedent_id: UUID) -> bool:
    """Delete one precedent attachment; True only if exactly one row was removed."""
    pool = await get_pool()
    status = await pool.execute(
        "DELETE FROM case_precedents WHERE id = $1", precedent_id
    )
    return status == "DELETE 1"


async def search_precedent_library(
    query: str,
    practice_area: str = "",
    limit: int = 10,
) -> list[dict]:
    """Substring-search all precedents by citation or quote text.

    The match is a case-insensitive ``ILIKE '%query%'``. A non-empty
    ``practice_area`` adds an equality filter. Newest rows come first.
    """
    pool = await get_pool()
    like_pattern = f"%{query}%"
    if practice_area:
        sql = """
            SELECT id, case_id, section_id, quote, citation, chair_note,
                   practice_area, created_at
            FROM case_precedents
            WHERE (citation ILIKE $1 OR quote ILIKE $1)
              AND practice_area = $2
            ORDER BY created_at DESC
            LIMIT $3
        """
        args: tuple = (like_pattern, practice_area, limit)
    else:
        sql = """
            SELECT id, case_id, section_id, quote, citation, chair_note,
                   practice_area, created_at
            FROM case_precedents
            WHERE citation ILIKE $1 OR quote ILIKE $1
            ORDER BY created_at DESC
            LIMIT $2
        """
        args = (like_pattern, limit)
    records = await pool.fetch(sql, *args)
    return [dict(record) for record in records]


# ── Chair feedback ────────────────────────────────────────────────


async def record_chair_feedback(
    case_id: UUID | None,
    block_id: str,
    feedback_text: str,
    category: str = "other",
    lesson_extracted: str = "",
) -> UUID:
    """Store the chair's feedback on a draft block and return its new id.

    The id is generated client-side with ``uuid4`` so it can be returned
    without a ``RETURNING`` clause.
    """
    pool = await get_pool()
    new_id = uuid4()
    async with pool.acquire() as conn:
        await conn.execute(
            """INSERT INTO chair_feedback
                   (id, case_id, block_id, feedback_text, category, lesson_extracted)
               VALUES ($1, $2, $3, $4, $5, $6)""",
            new_id,
            case_id,
            block_id,
            feedback_text,
            category,
            lesson_extracted,
        )
    return new_id
async def list_chair_feedback(
    case_id: UUID | None = None,
    category: str | None = None,
    unresolved_only: bool = False,
    limit: int = 100,
) -> list[dict]:
    """List chair feedback, newest first, with optional filters.

    The result is always capped: ``limit`` is coerced to an int of at least 1
    (INV-TOOL5 / GAP-53).
    """
    pool = await get_pool()
    args: list = []
    filters: list[str] = []
    if case_id:
        args.append(case_id)
        filters.append(f"case_id = ${len(args)}")
    if category:
        args.append(category)
        filters.append(f"category = ${len(args)}")
    if unresolved_only:
        filters.append("resolved = FALSE")
    where = f"WHERE {' AND '.join(filters)}" if filters else ""
    # The limit is always the last positional parameter.
    args.append(max(1, int(limit)))
    async with pool.acquire() as conn:
        records = await conn.fetch(
            f"SELECT * FROM chair_feedback {where} ORDER BY created_at DESC LIMIT ${len(args)}",
            *args,
        )
    return [dict(record) for record in records]


async def get_chair_feedback(feedback_id: UUID) -> dict | None:
    """Return one chair_feedback row, or ``None`` if the id is unknown.

    The row is LEFT JOINed to ``cases`` and additionally carries
    ``case_number`` and ``case_appeal_type``.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            """SELECT cf.*, c.case_number, c.appeal_type AS case_appeal_type
               FROM chair_feedback cf
               LEFT JOIN cases c ON c.id = cf.case_id
               WHERE cf.id = $1""",
            feedback_id,
        )
    if not record:
        return None
    return dict(record)


async def resolve_chair_feedback(
    feedback_id: UUID,
    applied_to: list[str],
) -> None:
    """Flag a feedback row as resolved and store where it was applied."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            """UPDATE chair_feedback
               SET resolved = TRUE, applied_to = $2
               WHERE id = $1""",
            feedback_id,
            applied_to,
        )


# ── Appraiser facts (V5 — for interim drafts) ─────────────────────


async def replace_appraiser_facts(
    case_id: UUID,
    document_id: UUID,
    facts: list[dict],
) -> int:
    """Replace every appraiser_facts row of one document, in one transaction.

    Each fact dict must provide ``appraiser_name``, ``fact_type``
    ('plan'|'permit') and ``identifier``. ``appraiser_side``, ``details``
    (dict) and ``page_number`` are optional. Returns ``len(facts)``.
    """
    insert_sql = """
        INSERT INTO appraiser_facts
            (case_id, document_id, appraiser_name, appraiser_side,
             fact_type, identifier, details, page_number)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8)"""
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Delete and re-insert share a transaction, so a bad fact dict rolls
        # the delete back instead of leaving the document without facts.
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM appraiser_facts WHERE document_id = $1",
                document_id,
            )
            for fact in facts:
                values = (
                    case_id,
                    document_id,
                    fact["appraiser_name"],
                    fact.get("appraiser_side", ""),
                    fact["fact_type"],
                    fact["identifier"],
                    json.dumps(fact.get("details", {}), ensure_ascii=False),
                    fact.get("page_number"),
                )
                await conn.execute(insert_sql, *values)
    return len(facts)
async def list_appraiser_facts(
    case_id: UUID,
    fact_type: str | None = None,
) -> list[dict]:
    """List a case's appraiser facts, optionally restricted to one fact_type.

    ``id``, ``case_id`` and ``document_id`` are returned as strings, and
    ``details`` is decoded from JSON when it arrives as a string.
    """
    if fact_type:
        sql = """SELECT * FROM appraiser_facts
                 WHERE case_id = $1 AND fact_type = $2
                 ORDER BY identifier, appraiser_name"""
        args: tuple = (case_id, fact_type)
    else:
        sql = """SELECT * FROM appraiser_facts
                 WHERE case_id = $1
                 ORDER BY fact_type, identifier, appraiser_name"""
        args = (case_id,)

    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *args)

    facts: list[dict] = []
    for record in records:
        fact = dict(record)
        for key in ("id", "case_id", "document_id"):
            fact[key] = str(fact[key])
        raw_details = fact.get("details")
        if isinstance(raw_details, str):
            fact["details"] = json.loads(raw_details)
        facts.append(fact)
    return facts


async def detect_appraiser_conflicts(case_id: UUID) -> list[dict]:
    """Find identifiers that 2+ different appraisers cited in this case.

    The query groups ``appraiser_facts`` by (identifier, fact_type) and keeps
    groups with more than one distinct ``appraiser_name``. It does not compare
    the ``details`` themselves; that is left to the caller.

    Each returned group has ``identifier``, ``fact_type``, ``n_appraisers``
    and ``entries``. Entries are ordered committee → appellant → deciding →
    other, then by appraiser name, and each carries the appraiser's side.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            """SELECT identifier, fact_type,
                      json_agg(jsonb_build_object(
                          'appraiser_name', appraiser_name,
                          'appraiser_side', appraiser_side,
                          'details', details,
                          'page_number', page_number,
                          'document_id', document_id
                      ) ORDER BY
                          CASE appraiser_side
                              WHEN 'committee' THEN 1
                              WHEN 'appellant' THEN 2
                              WHEN 'deciding' THEN 3
                              ELSE 4
                          END,
                          appraiser_name
                      ) AS entries,
                      COUNT(DISTINCT appraiser_name) AS n_appraisers
               FROM appraiser_facts
               WHERE case_id = $1
               GROUP BY identifier, fact_type
               HAVING COUNT(DISTINCT appraiser_name) > 1""",
            case_id,
        )

    groups: list[dict] = []
    for record in records:
        group_entries = record["entries"]
        if isinstance(group_entries, str):
            group_entries = json.loads(group_entries)
        # Nested ``details`` may still be JSON text after the outer decode.
        for entry in group_entries:
            nested = entry.get("details")
            if isinstance(nested, str):
                entry["details"] = json.loads(nested)
        groups.append(
            {
                "identifier": record["identifier"],
                "fact_type": record["fact_type"],
                "n_appraisers": record["n_appraisers"],
                "entries": group_entries,
            }
        )
    return groups
# ── V7: External precedent library + halachot ─────────────────────


def _row_to_case_law(row: asyncpg.Record) -> dict:
    """Convert a case_law row to a plain dict.

    ``subject_tags`` is decoded when it arrives as a JSON string (falling back
    to an empty list if it does not parse), and a non-null ``date`` is
    rendered with ``isoformat()``.
    """
    data = dict(row)
    raw_tags = data.get("subject_tags")
    if isinstance(raw_tags, str):
        try:
            data["subject_tags"] = json.loads(raw_tags)
        except (TypeError, ValueError):
            data["subject_tags"] = []
    decided_on = data.get("date")
    if decided_on is not None:
        data["date"] = decided_on.isoformat()
    return data


async def get_case_law(case_law_id: UUID) -> dict | None:
    """Fetch one case_law row by id, normalized; ``None`` if absent."""
    pool = await get_pool()
    record = await pool.fetchrow(
        "SELECT * FROM case_law WHERE id = $1",
        case_law_id,
    )
    if not record:
        return None
    return _row_to_case_law(record)


async def get_external_case_law_by_citation(citation: str) -> dict | None:
    """Return the first external_upload row whose case_number equals ``citation``.

    Only a subset of columns is selected. Returns ``None`` when nothing matches.
    """
    pool = await get_pool()
    record = await pool.fetchrow(
        """
        SELECT id, case_number, case_name, court, date,
               halacha_extraction_status, source_kind, created_at
        FROM case_law
        WHERE case_number = $1 AND source_kind = 'external_upload'
        LIMIT 1
        """,
        citation,
    )
    if not record:
        return None
    return _row_to_case_law(record)
async def mark_indexed(case_law_id: UUID) -> None:
    """Mark a case_law row's embeddings as built from its current content (FU-3).

    Sets ``indexed_hash := content_hash``. Call AFTER a successful
    chunk + embed + store.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE case_law SET indexed_hash = content_hash WHERE id = $1",
            case_law_id,
        )


async def list_stale_case_law(limit: int = 500) -> list[dict]:
    """List case_law rows whose embeddings are stale (GAP-09 / INV-G6).

    A row is stale when it has non-empty ``full_text`` and its
    ``content_hash`` differs from ``indexed_hash`` (NULL-safe comparison).
    Oldest rows come first.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """SELECT id, case_number, source_kind
               FROM case_law
               WHERE coalesce(full_text, '') <> ''
                 AND content_hash IS DISTINCT FROM indexed_hash
               ORDER BY created_at
               LIMIT $1""",
            limit,
        )
    return [dict(r) for r in rows]


async def recompute_content_hashes() -> dict:
    """Backfill ``content_hash`` for every case_law row (FU-3). Hash-only; no re-embed.

    For rows that already have precedent chunks, ``indexed_hash`` is also set
    to the new hash (they are already embedded). For rows without chunks
    ``indexed_hash`` is left unchanged, so on a fresh backfill (where it is
    still NULL) rows with text surface as stale.

    The chunk-existence check is folded into the initial SELECT and the
    updates are sent as one ``executemany`` batch, instead of two extra
    round trips per row.

    Returns ``{"updated": <number of rows processed>}``.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """SELECT cl.id, cl.full_text,
                      EXISTS(SELECT 1 FROM precedent_chunks pc
                             WHERE pc.case_law_id = cl.id) AS has_chunks
               FROM case_law cl"""
        )
        batch = [
            (r["id"], _content_hash(r["full_text"] or ""), bool(r["has_chunks"]))
            for r in rows
        ]
        if batch:
            await conn.executemany(
                "UPDATE case_law SET content_hash = $2, "
                "indexed_hash = CASE WHEN $3 THEN $2 ELSE indexed_hash END WHERE id = $1",
                batch,
            )
    return {"updated": len(batch)}


async def add_case_law_relation(
    a_id: UUID, b_id: UUID, relation_type: str = "same_case_chain"
) -> None:
    """Link two case_law records in both directions.

    Idempotent: an existing (case_law_id, related_id) pair is left untouched
    (ON CONFLICT DO NOTHING).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.executemany(
            """
            INSERT INTO case_law_relations(case_law_id, related_id, relation_type)
            VALUES($1, $2, $3)
            ON CONFLICT (case_law_id, related_id) DO NOTHING
            """,
            [(a_id, b_id, relation_type), (b_id, a_id, relation_type)],
        )
async def remove_case_law_relation(a_id: UUID, b_id: UUID) -> None:
    """Delete the link between two case_law records, in both directions."""
    pool = await get_pool()
    await pool.execute(
        """
        DELETE FROM case_law_relations
        WHERE (case_law_id = $1 AND related_id = $2)
           OR (case_law_id = $2 AND related_id = $1)
        """,
        a_id,
        b_id,
    )


async def get_case_law_relations(case_law_id: UUID) -> list[dict]:
    """Return every case_law record linked to ``case_law_id``.

    Rows are ordered by date ascending (NULLs last). Each one is normalized
    like any case_law row and carries the link's ``relation_type``.
    """
    pool = await get_pool()
    records = await pool.fetch(
        """
        SELECT cl.*, r.relation_type
        FROM case_law_relations r
        JOIN case_law cl ON cl.id = r.related_id
        WHERE r.case_law_id = $1
        ORDER BY cl.date ASC NULLS LAST
        """,
        case_law_id,
    )
    linked: list[dict] = []
    for record in records:
        columns = dict(record)
        # Set relation_type aside so the normalizer sees pure case_law columns.
        link_kind = columns.pop("relation_type")
        item = _row_to_case_law(columns)
        item["relation_type"] = link_kind
        linked.append(item)
    return linked


async def get_case_law_by_citation(case_number: str) -> dict | None:
    """Fetch a case_law row by exact ``case_number``; ``None`` if absent."""
    pool = await get_pool()
    record = await pool.fetchrow(
        "SELECT * FROM case_law WHERE case_number = $1",
        case_number,
    )
    if not record:
        return None
    return _row_to_case_law(record)


async def create_external_case_law(
    case_number: str,
    case_name: str,
    full_text: str,
    court: str = "",
    decision_date: date | None = None,
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    summary: str = "",
    headnote: str = "",
    key_quote: str = "",
    source_url: str = "",
    source_type: str = "",
    precedent_level: str = "",
    is_binding: bool = True,
    document_id: UUID | None = None,
) -> dict:
    """Upsert a chair-uploaded external precedent into case_law.

    If a non-internal row with this ``case_number`` already exists (for
    example an auto-discovered source_kind='cited_only' row), it is promoted
    to source_kind='external_upload' and updated in place. On update, empty
    ``court`` / ``summary`` / ``key_quote`` / ``source_url`` and NULL ``date``
    / ``document_id`` keep the stored value; the other fields are overwritten.
    Both extraction statuses are reset ('processing' / 'pending').
    """
    # Atomic upsert on the V15 partial unique index
    # uq_case_law_external_number (case_number) WHERE source_kind <> 'internal_committee'.
    # The predicate is repeated in ON CONFLICT (required for partial indexes).
    # Because the index excludes internal rows, an internal_committee row with
    # the same number is never touched.
    upsert_sql = """
        INSERT INTO case_law (
            case_number, case_name, court, date, subject_tags, summary,
            key_quote, full_text, source_url, source_kind, document_id,
            extraction_status, halacha_extraction_status,
            practice_area, appeal_subtype, headnote,
            source_type, precedent_level, is_binding, content_hash
        )
        VALUES (
            $1, $2, $3, $4, $5, $6,
            $7, $8, $9, 'external_upload', $10,
            'processing', 'pending',
            $11, $12, $13,
            $14, $15, $16, $17
        )
        ON CONFLICT (case_number) WHERE source_kind <> 'internal_committee'
        DO UPDATE SET
            case_name = EXCLUDED.case_name,
            court = COALESCE(NULLIF(EXCLUDED.court, ''), case_law.court),
            date = COALESCE(EXCLUDED.date, case_law.date),
            practice_area = EXCLUDED.practice_area,
            appeal_subtype = EXCLUDED.appeal_subtype,
            subject_tags = EXCLUDED.subject_tags,
            summary = COALESCE(NULLIF(EXCLUDED.summary, ''), case_law.summary),
            headnote = EXCLUDED.headnote,
            key_quote = COALESCE(NULLIF(EXCLUDED.key_quote, ''), case_law.key_quote),
            full_text = EXCLUDED.full_text,
            source_url = COALESCE(NULLIF(EXCLUDED.source_url, ''), case_law.source_url),
            source_type = EXCLUDED.source_type,
            precedent_level = EXCLUDED.precedent_level,
            is_binding = EXCLUDED.is_binding,
            document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
            source_kind = 'external_upload',
            extraction_status = 'processing',
            halacha_extraction_status = 'pending',
            content_hash = EXCLUDED.content_hash
        RETURNING *
    """
    pool = await get_pool()
    encoded_tags = json.dumps(subject_tags or [], ensure_ascii=False)
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            upsert_sql,
            case_number,
            case_name,
            court,
            decision_date,
            encoded_tags,
            summary,
            key_quote,
            full_text,
            source_url,
            document_id,
            practice_area,
            appeal_subtype,
            headnote,
            source_type,
            precedent_level,
            is_binding,
            _content_hash(full_text),
        )
    return _row_to_case_law(record)
async def create_internal_committee_decision(
    case_number: str,
    case_name: str,
    full_text: str,
    court: str = "",
    decision_date: date | None = None,
    chair_name: str = "",
    district: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    summary: str = "",
    is_binding: bool = True,
    document_id: UUID | None = None,
    proceeding_type: str = "ערר",
) -> dict:
    """Upsert an appeals-committee decision as source_kind='internal_committee'.

    The idempotency key is (case_number, proceeding_type): the same number may
    exist once per proceeding type (e.g. an appeal and an extension-of-time
    request filed against it). ``case_number`` is canonicalized first. On
    update, empty ``court`` / ``chair_name`` / ``district`` / ``summary`` and
    NULL ``date`` / ``document_id`` keep the stored value. Both extraction
    statuses are reset ('processing' / 'pending').
    """
    # Atomic upsert on the V15 partial unique index
    # uq_case_law_internal_number_proc (case_number, proceeding_type)
    # WHERE source_kind = 'internal_committee'. The predicate is repeated in
    # ON CONFLICT because the index is partial. A single statement avoids the
    # race of a SELECT followed by INSERT/UPDATE.
    upsert_sql = """
        INSERT INTO case_law (
            case_number, case_name, court, date, chair_name, district,
            subject_tags, summary, full_text,
            source_kind, source_type, document_id,
            extraction_status, halacha_extraction_status,
            practice_area, appeal_subtype, is_binding, proceeding_type,
            content_hash
        )
        VALUES (
            $1, $2, $3, $4, $5, $6,
            $7, $8, $9,
            'internal_committee', 'appeals_committee', $10,
            'processing', 'pending',
            $11, $12, $13, $14,
            $15
        )
        ON CONFLICT (case_number, proceeding_type)
            WHERE source_kind = 'internal_committee'
        DO UPDATE SET
            case_name = EXCLUDED.case_name,
            court = COALESCE(NULLIF(EXCLUDED.court, ''), case_law.court),
            date = COALESCE(EXCLUDED.date, case_law.date),
            chair_name = COALESCE(NULLIF(EXCLUDED.chair_name, ''), case_law.chair_name),
            district = COALESCE(NULLIF(EXCLUDED.district, ''), case_law.district),
            practice_area = EXCLUDED.practice_area,
            appeal_subtype = EXCLUDED.appeal_subtype,
            subject_tags = EXCLUDED.subject_tags,
            summary = COALESCE(NULLIF(EXCLUDED.summary, ''), case_law.summary),
            full_text = EXCLUDED.full_text,
            source_type = 'appeals_committee',
            source_kind = 'internal_committee',
            is_binding = EXCLUDED.is_binding,
            document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
            extraction_status = 'processing',
            halacha_extraction_status = 'pending',
            content_hash = EXCLUDED.content_hash
        RETURNING *
    """
    pool = await get_pool()
    canonical_number = _canonical_case_number(case_number)
    encoded_tags = json.dumps(subject_tags or [], ensure_ascii=False)
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            upsert_sql,
            canonical_number,
            case_name,
            court,
            decision_date,
            chair_name,
            district,
            encoded_tags,
            summary,
            full_text,
            document_id,
            practice_area,
            appeal_subtype,
            is_binding,
            proceeding_type,
            _content_hash(full_text),
        )
    return _row_to_case_law(record)
def _compute_searchable(row: dict, has_embedded_chunk: bool) -> bool:
    """Completeness contract (INV-DM1 / 02-data-model §2a).

    A row is searchable only if all of the following hold:

    * at least one chunk with a non-null embedding exists (decided by the
      caller and passed in as ``has_embedded_chunk``);
    * ``extraction_status`` is exactly ``"completed"``;
    * ``case_number``, ``case_name`` and ``source_kind`` are non-blank;
    * ``practice_area`` is non-blank, unless the row is an
      ``external_upload`` (external precedents may be cross-domain and have
      no single practice area);
    * at least one of ``headnote`` / ``summary`` / ``subject_tags`` is
      non-empty.

    Pure function: it only reads ``row``.
    """

    def text(key: str) -> str:
        # A missing key and None both count as blank.
        return (row.get(key) or "").strip()

    if not has_embedded_chunk:
        return False
    if (row.get("extraction_status") or "") != "completed":
        return False

    required = ["case_number", "case_name", "source_kind"]
    if (row.get("source_kind") or "") != "external_upload":
        required.append("practice_area")
    if not all(text(key) for key in required):
        return False

    tags = row.get("subject_tags") or []
    return bool(text("headnote")) or bool(text("summary")) or len(tags) > 0
async def recompute_searchable(case_law_id: "UUID | str | None" = None) -> int:
    """Recompute and persist the ``searchable`` flag. Idempotent / reversible.

    With ``case_law_id=None`` every case_law row is recomputed (used by the
    V21 backfill and the dry-run); otherwise only the given row. Returns the
    number of processed rows that ended up with searchable = true.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        if case_law_id is None:
            records = await conn.fetch("SELECT * FROM case_law")
        else:
            target = case_law_id if isinstance(case_law_id, UUID) else UUID(str(case_law_id))
            records = await conn.fetch(
                "SELECT * FROM case_law WHERE id = $1", target)

        searchable_total = 0
        for record in records:
            data = dict(record)
            # subject_tags may arrive as JSON text; the contract check needs a list.
            raw_tags = data.get("subject_tags")
            if isinstance(raw_tags, str):
                try:
                    raw_tags = json.loads(raw_tags)
                except (ValueError, TypeError):
                    raw_tags = []
            data["subject_tags"] = raw_tags or []

            embedded = await conn.fetchval(
                "SELECT EXISTS(SELECT 1 FROM precedent_chunks "
                "WHERE case_law_id = $1 AND embedding IS NOT NULL)",
                data["id"])
            flag = _compute_searchable(data, bool(embedded))
            await conn.execute(
                "UPDATE case_law SET searchable = $2 WHERE id = $1",
                data["id"], flag)
            if flag:
                searchable_total += 1
    return searchable_total
async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
    """Patch metadata fields on a case_law row and return the updated row.

    Only whitelisted columns are written; any other keyword is silently
    ignored. Allowed: case_number, case_name, court, date, practice_area,
    appeal_subtype, subject_tags, summary, headnote, nevo_ratio, key_quote,
    source_url, source_type, precedent_level, is_binding, district,
    chair_name, proceeding_type, citation_formatted.

    When no allowed field is supplied the current row is returned unchanged.
    Returns ``None`` if the id does not exist.
    """
    allowed = {
        "case_number", "case_name", "court", "date", "practice_area",
        "appeal_subtype", "subject_tags", "summary", "headnote", "nevo_ratio",
        "key_quote", "source_url", "source_type", "precedent_level",
        "is_binding", "district", "chair_name", "proceeding_type",
        "citation_formatted",
    }
    updates = {name: value for name, value in fields.items() if name in allowed}
    if not updates:
        return await get_case_law(case_law_id)

    pool = await get_pool()
    params: list = [case_law_id]
    assignments: list[str] = []
    for column, value in updates.items():
        if column == "subject_tags":
            value = json.dumps(value or [], ensure_ascii=False)
        params.append(value)
        # Column names come only from the whitelist above; values are bound.
        assignments.append(f"{column} = ${len(params)}")

    sql = f"UPDATE case_law SET {', '.join(assignments)} WHERE id = $1 RETURNING *"
    record = await pool.fetchrow(sql, *params)
    if not record:
        return None
    return _row_to_case_law(record)
async def set_case_law_extraction_status(case_law_id: UUID, status: str) -> None:
    """Set the text-extraction status of a case_law row.

    On a terminal state ('completed' / 'failed') this also NULLs
    ``metadata_extraction_requested_at``. Otherwise the local-MCP queue
    (`process_pending_extractions`, which scans by
    ``WHERE *_requested_at IS NOT NULL``) would keep re-picking the row and
    it would stay blocked in the UI's `isPrecedentActive` check.
    """
    if status in ("completed", "failed"):
        sql = (
            "UPDATE case_law SET extraction_status = $2, "
            "metadata_extraction_requested_at = NULL WHERE id = $1"
        )
    else:
        sql = "UPDATE case_law SET extraction_status = $2 WHERE id = $1"
    pool = await get_pool()
    await pool.execute(sql, case_law_id, status)


async def set_case_law_halacha_status(case_law_id: UUID, status: str) -> None:
    """Set the halacha-extraction status of a case_law row.

    Mirrors ``set_case_law_extraction_status``: on a terminal state
    ('completed' / 'failed') ``halacha_extraction_requested_at`` is cleared
    too, so the queue and UI do not see a stale request flag.
    """
    if status in ("completed", "failed"):
        sql = (
            "UPDATE case_law SET halacha_extraction_status = $2, "
            "halacha_extraction_requested_at = NULL WHERE id = $1"
        )
    else:
        sql = "UPDATE case_law SET halacha_extraction_status = $2 WHERE id = $1"
    pool = await get_pool()
    await pool.execute(sql, case_law_id, status)
async def set_case_law_metadata_status(case_law_id: UUID, status: str) -> None:
    """Set the metadata-extraction status of a case_law row.

    Mirrors ``set_case_law_halacha_status``: on a terminal state
    ('completed' / 'failed') ``metadata_extraction_requested_at`` is cleared
    too, so the local-MCP queue (`process_pending_extractions`, which scans
    ``WHERE *_requested_at IS NOT NULL``) stops re-picking the row and the
    UI's ``isPrecedentActive`` check settles.
    """
    pool = await get_pool()
    if status in ("completed", "failed"):
        await pool.execute(
            "UPDATE case_law SET metadata_extraction_status = $2, "
            "metadata_extraction_requested_at = NULL WHERE id = $1",
            case_law_id, status,
        )
    else:
        await pool.execute(
            "UPDATE case_law SET metadata_extraction_status = $2 WHERE id = $1",
            case_law_id, status,
        )


def _external_case_law_where(
    practice_area: str,
    court: str,
    precedent_level: str,
    source_type: str,
    search: str,
    source_kind: str,
) -> tuple[str, list]:
    """Build the WHERE clause and bound parameters for ``list_external_case_law``.

    Every caller-supplied value is passed as a positional parameter; none is
    interpolated into the SQL text. Returns ``(where_sql, params)``, where
    ``where_sql`` has no leading ``WHERE`` and placeholders are numbered from
    ``$1`` in ``params`` order.
    """
    conditions: list[str] = []
    params: list = []

    def bind(value) -> str:
        # Register a value and return its positional placeholder.
        params.append(value)
        return f"${len(params)}"

    if source_kind == "all_committees":
        # Fixed SQL, no user input: internal decisions plus external uploads
        # that come from an appeals committee.
        conditions.append(
            "(source_kind = 'internal_committee' OR "
            "(source_kind = 'external_upload' AND source_type = 'appeals_committee'))"
        )
    else:
        conditions.append(f"source_kind = {bind(source_kind)}")
    if practice_area:
        conditions.append(f"practice_area = {bind(practice_area)}")
    if court:
        conditions.append(f"court ILIKE {bind(f'%{court}%')}")
    if precedent_level:
        conditions.append(f"precedent_level = {bind(precedent_level)}")
    if source_type:
        conditions.append(f"source_type = {bind(source_type)}")
    if search:
        ph = bind(f"%{search}%")
        conditions.append(
            f"(case_number ILIKE {ph} OR case_name ILIKE {ph} "
            f"OR summary ILIKE {ph} OR headnote ILIKE {ph})"
        )
    return " AND ".join(conditions), params


async def list_external_case_law(
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    source_type: str = "",
    search: str = "",
    limit: int = 100,
    offset: int = 0,
    source_kind: str = "external_upload",
) -> list[dict]:
    """List precedents with simple filters, newest first.

    ``source_kind="all_committees"`` expands to: source_kind='internal_committee'
    OR (source_kind='external_upload' AND source_type='appeals_committee').
    Any other ``source_kind`` is matched by equality and is bound as a query
    parameter, so it cannot alter the SQL.

    Each row also carries ``halachot_count`` and ``approved_count``
    (halachot with review_status 'approved' or 'published'). The two
    ``*_requested_at`` timestamps are rendered as ISO strings.
    """
    pool = await get_pool()
    where_sql, params = _external_case_law_where(
        practice_area, court, precedent_level, source_type, search, source_kind
    )
    limit_pos = len(params) + 1
    params.extend([limit, offset])
    sql = f"""
        SELECT id, case_number, case_name, court, date, practice_area,
               appeal_subtype, source_type, precedent_level, is_binding,
               summary, headnote, subject_tags, source_kind,
               chair_name, district, citation_formatted,
               extraction_status, halacha_extraction_status,
               metadata_extraction_status,
               metadata_extraction_requested_at,
               halacha_extraction_requested_at,
               created_at,
               (SELECT COUNT(*) FROM halachot h
                WHERE h.case_law_id = case_law.id) AS halachot_count,
               (SELECT COUNT(*) FROM halachot h
                WHERE h.case_law_id = case_law.id
                  AND h.review_status IN ('approved', 'published')) AS approved_count
        FROM case_law
        WHERE {where_sql}
        ORDER BY created_at DESC
        LIMIT ${limit_pos} OFFSET ${limit_pos + 1}
    """
    rows = await pool.fetch(sql, *params)
    out = []
    for r in rows:
        d = _row_to_case_law(r)
        # Render timestamps as ISO strings so the JSON layer stays simple.
        for k in ("metadata_extraction_requested_at", "halacha_extraction_requested_at"):
            if d.get(k) is not None:
                d[k] = d[k].isoformat()
        out.append(d)
    return out


async def delete_case_law(case_law_id: UUID) -> bool:
    """Delete a precedent; returns True only if exactly one row was removed.

    Dependent chunks and halachot are expected to be removed by cascade.
    """
    pool = await get_pool()
    result = await pool.execute(
        "DELETE FROM case_law WHERE id = $1",
        case_law_id,
    )
    return result == "DELETE 1"
# ── Digests (X12 — radar layer; separate table, INV-DIG1/2/3) ────────

# Column list shared by every digest SELECT / RETURNING clause.
_DIGEST_COLS = ", ".join(
    (
        "id",
        "yomon_number",
        "digest_date",
        "publication",
        "source_firm",
        "concept_tag",
        "headline_holding",
        "analysis_text",
        "summary",
        "underlying_citation",
        "underlying_court",
        "underlying_date",
        "underlying_judge",
        "practice_area",
        "appeal_subtype",
        "subject_tags",
        "linked_case_law_id",
        "source_document_path",
        "content_hash",
        "extraction_status",
        "created_at",
        "updated_at",
    )
)

# Columns that update_digest may write; any other keyword is ignored.
_DIGEST_UPDATE_ALLOWED = {
    "yomon_number",
    "digest_date",
    "publication",
    "source_firm",
    "concept_tag",
    "headline_holding",
    "analysis_text",
    "summary",
    "underlying_citation",
    "underlying_court",
    "underlying_date",
    "underlying_judge",
    "practice_area",
    "appeal_subtype",
    "subject_tags",
    "source_document_path",
    "content_hash",
    "extraction_status",
}


def _row_to_digest(row: asyncpg.Record | dict | None) -> dict | None:
    """Normalize a digests row into a plain dict.

    Date/timestamp columns are rendered with ``isoformat()``, a missing or
    NULL ``subject_tags`` becomes ``[]``, and ``id`` / ``linked_case_law_id``
    are stringified when present. ``None`` passes through unchanged.
    """
    if row is None:
        return None
    out = dict(row)
    for key in ("digest_date", "underlying_date", "created_at", "updated_at"):
        value = out.get(key)
        if value is not None and hasattr(value, "isoformat"):
            out[key] = value.isoformat()
    if out.get("subject_tags") is None:
        out["subject_tags"] = []
    for key in ("id", "linked_case_law_id"):
        if out.get(key) is not None:
            out[key] = str(out[key])
    return out


async def create_digest(
    *,
    analysis_text: str,
    yomon_number: str = "",
    digest_date: date | None = None,
    publication: str = "כל יום",
    source_firm: str = "עפר טויסטר, עורכי דין",
    concept_tag: str = "",
    headline_holding: str = "",
    summary: str = "",
    underlying_citation: str = "",
    underlying_court: str = "",
    underlying_date: date | None = None,
    underlying_judge: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    source_document_path: str = "",
    extraction_status: str = "processing",
) -> dict:
    """Upsert a digest (X12).

    Idempotent on a non-empty ``yomon_number`` (INV-G3): uploading the same
    yomon again updates the existing row in place. ``content_hash`` (derived
    from ``analysis_text``) is stored as the secondary dedup key for digests
    whose number could not be parsed; this function does not itself look
    rows up by that hash. On update, NULL dates and an empty
    ``source_document_path`` keep the stored value.
    """
    # Upsert on the partial unique index uq_digests_yomon_number
    # (yomon_number WHERE yomon_number <> ''). The predicate is repeated in
    # ON CONFLICT, as required for partial indexes.
    upsert_sql = f"""
        INSERT INTO digests (
            yomon_number, digest_date, publication, source_firm,
            concept_tag, headline_holding, analysis_text, summary,
            underlying_citation, underlying_court, underlying_date,
            underlying_judge, practice_area, appeal_subtype, subject_tags,
            source_document_path, content_hash, extraction_status
        )
        VALUES (
            $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
            $12, $13, $14, $15, $16, $17, $18
        )
        ON CONFLICT (yomon_number) WHERE yomon_number <> ''
        DO UPDATE SET
            digest_date = COALESCE(EXCLUDED.digest_date, digests.digest_date),
            publication = EXCLUDED.publication,
            source_firm = EXCLUDED.source_firm,
            concept_tag = EXCLUDED.concept_tag,
            headline_holding = EXCLUDED.headline_holding,
            analysis_text = EXCLUDED.analysis_text,
            summary = EXCLUDED.summary,
            underlying_citation = EXCLUDED.underlying_citation,
            underlying_court = EXCLUDED.underlying_court,
            underlying_date = COALESCE(EXCLUDED.underlying_date, digests.underlying_date),
            underlying_judge = EXCLUDED.underlying_judge,
            practice_area = EXCLUDED.practice_area,
            appeal_subtype = EXCLUDED.appeal_subtype,
            subject_tags = EXCLUDED.subject_tags,
            source_document_path = COALESCE(NULLIF(EXCLUDED.source_document_path, ''),
                                            digests.source_document_path),
            content_hash = EXCLUDED.content_hash,
            extraction_status = EXCLUDED.extraction_status,
            updated_at = now()
        RETURNING {_DIGEST_COLS}
    """
    pool = await get_pool()
    digest_hash = _content_hash(analysis_text)
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            upsert_sql,
            yomon_number,
            digest_date,
            publication,
            source_firm,
            concept_tag,
            headline_holding,
            analysis_text,
            summary,
            underlying_citation,
            underlying_court,
            underlying_date,
            underlying_judge,
            practice_area,
            appeal_subtype,
            list(subject_tags or []),
            source_document_path,
            digest_hash,
            extraction_status,
        )
    return _row_to_digest(record)
summary, underlying_citation, underlying_court, underlying_date, underlying_judge, practice_area, appeal_subtype, list(subject_tags or []), source_document_path, content_hash, extraction_status, ) return _row_to_digest(row) async def get_digest(digest_id: UUID | str) -> dict | None: pool = await get_pool() cid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id)) row = await pool.fetchrow( f"SELECT {_DIGEST_COLS} FROM digests WHERE id = $1", cid, ) return _row_to_digest(row) async def get_digest_by_content_hash(content_hash: str) -> dict | None: if not content_hash: return None pool = await get_pool() row = await pool.fetchrow( f"SELECT {_DIGEST_COLS} FROM digests WHERE content_hash = $1", content_hash, ) return _row_to_digest(row) async def update_digest(digest_id: UUID | str, **fields) -> dict | None: """Patch metadata fields on a digest row. Whitelist via _DIGEST_UPDATE_ALLOWED.""" cid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id)) updates = {k: v for k, v in fields.items() if k in _DIGEST_UPDATE_ALLOWED} if not updates: return await get_digest(cid) pool = await get_pool() set_parts = [] params: list = [cid] for i, (k, v) in enumerate(updates.items(), start=2): if k == "subject_tags": v = list(v or []) set_parts.append(f"{k} = ${i}") params.append(v) set_parts.append("updated_at = now()") sql = f"UPDATE digests SET {', '.join(set_parts)} WHERE id = $1 RETURNING {_DIGEST_COLS}" row = await pool.fetchrow(sql, *params) return _row_to_digest(row) async def store_digest_embedding(digest_id: UUID | str, vector: list[float]) -> None: pool = await get_pool() cid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id)) await pool.execute( "UPDATE digests SET embedding = $2, updated_at = now() WHERE id = $1", cid, vector, ) async def link_digest_to_case_law( digest_id: UUID | str, case_law_id: UUID | str | None, ) -> dict | None: """Set (or clear, with None) the bridge to the underlying ruling (INV-DIG3).""" pool = await 
get_pool() cid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id)) clid = None if case_law_id is not None: clid = case_law_id if isinstance(case_law_id, UUID) else UUID(str(case_law_id)) row = await pool.fetchrow( f"UPDATE digests SET linked_case_law_id = $2, updated_at = now() " f"WHERE id = $1 RETURNING {_DIGEST_COLS}", cid, clid, ) return _row_to_digest(row) async def delete_digest(digest_id: UUID | str) -> bool: pool = await get_pool() cid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id)) result = await pool.execute("DELETE FROM digests WHERE id = $1", cid) return result == "DELETE 1" async def list_digests( practice_area: str = "", concept_tag: str = "", linked: bool | None = None, search: str = "", limit: int = 100, offset: int = 0, ) -> list[dict]: """List digests with simple filters. linked=True/False filters on whether the underlying ruling is in the library yet (INV-DIG3 gap surfacing).""" pool = await get_pool() conditions: list[str] = [] params: list = [] idx = 1 if practice_area: conditions.append(f"practice_area = ${idx}") params.append(practice_area) idx += 1 if concept_tag: conditions.append(f"concept_tag ILIKE ${idx}") params.append(f"%{concept_tag}%") idx += 1 if linked is True: conditions.append("linked_case_law_id IS NOT NULL") elif linked is False: conditions.append("linked_case_law_id IS NULL") if search: conditions.append( f"(yomon_number ILIKE ${idx} OR concept_tag ILIKE ${idx} " f"OR headline_holding ILIKE ${idx} OR underlying_citation ILIKE ${idx} " f"OR summary ILIKE ${idx})" ) params.append(f"%{search}%") idx += 1 where_sql = (" WHERE " + " AND ".join(conditions)) if conditions else "" params.extend([limit, offset]) sql = ( f"SELECT {_DIGEST_COLS} FROM digests{where_sql} " f"ORDER BY digest_date DESC NULLS LAST, created_at DESC " f"LIMIT ${idx} OFFSET ${idx + 1}" ) rows = await pool.fetch(sql, *params) return [_row_to_digest(r) for r in rows] async def list_pending_digests(limit: int = 20) -> 
list[dict]: """Digests awaiting local LLM enrichment (web-upload queue, X12). The drainer (digest_library.process_pending_digests) picks these up.""" pool = await get_pool() rows = await pool.fetch( f"SELECT {_DIGEST_COLS} FROM digests WHERE extraction_status = 'pending' " f"ORDER BY created_at LIMIT $1", limit, ) return [_row_to_digest(r) for r in rows] async def search_digests_semantic( query_embedding: list[float], practice_area: str = "", subject_tag: str = "", concept_tag: str = "", limit: int = 10, ) -> list[dict]: """Pure-semantic search over the digests radar (X12). Single vector per row (no chunks/halachot), so no RRF here — see X12 §6. Joins the linked ruling's citation when present so the researcher sees the pointer target directly.""" pool = await get_pool() conditions = ["d.embedding IS NOT NULL"] params: list = [query_embedding, limit] idx = 3 if practice_area: conditions.append(f"d.practice_area = ${idx}") params.append(practice_area) idx += 1 if subject_tag: conditions.append(f"${idx} = ANY(d.subject_tags)") params.append(subject_tag) idx += 1 if concept_tag: conditions.append(f"d.concept_tag ILIKE ${idx}") params.append(f"%{concept_tag}%") idx += 1 sql = f""" SELECT {', '.join('d.' 
+ c for c in _DIGEST_COLS.split(', '))}, cl.case_number AS linked_case_number, cl.case_name AS linked_case_name, cl.searchable AS linked_searchable, 1 - (d.embedding <=> $1) AS score FROM digests d LEFT JOIN case_law cl ON cl.id = d.linked_case_law_id WHERE {' AND '.join(conditions)} ORDER BY d.embedding <=> $1 LIMIT $2 """ rows = await pool.fetch(sql, *params) out = [] for r in rows: d = _row_to_digest(r) d["linked_case_number"] = r["linked_case_number"] d["linked_case_name"] = r["linked_case_name"] d["linked_searchable"] = r["linked_searchable"] d["score"] = float(r["score"]) d["type"] = "digest" out.append(d) return out async def find_case_law_by_citation_fuzzy(citation: str) -> dict | None: """Best-effort match of a digest's underlying_citation to a case_law row, for autolink (INV-DIG3). Tries: (1) exact case_number; (2) canonical docket substring (e.g. '46111-12-22') contained in a case_law.case_number. Returns the first match or None — never raises, never mutates.""" citation = (citation or "").strip() if not citation: return None pool = await get_pool() row = await pool.fetchrow( "SELECT * FROM case_law WHERE case_number = $1 LIMIT 1", citation, ) if row: return _row_to_case_law(row) # Extract a docket-like token: digits with '-' or '/' separators, e.g. # 46111-12-22 or 3975/22. Match it as a substring of case_number. m = re.search(r"\d+[-/]\d+(?:[-/]\d+)?", citation) if not m: return None docket = m.group(0) row = await pool.fetchrow( "SELECT * FROM case_law " "WHERE case_number ILIKE $1 ORDER BY created_at LIMIT 1", f"%{docket}%", ) return _row_to_case_law(row) if row else None async def store_precedent_chunks( case_law_id: UUID, chunks: list[dict], ) -> int: """Replace precedent chunks for a case_law row (single-tier). Each chunk dict has: chunk_index, content, section_type, page_number, embedding (list[float] or None). 
All rows written here are stored with ``chunk_role='child'`` and ``parent_chunk_id IS NULL`` — backward-compatible with the V17 schema (parent-doc lookup is a no-op for these rows). For two-tier ingestion, see :func:`store_precedent_chunks_hierarchical`. """ pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "DELETE FROM precedent_chunks WHERE case_law_id = $1", case_law_id, ) for c in chunks: await conn.execute( """INSERT INTO precedent_chunks (case_law_id, chunk_index, content, section_type, page_number, embedding) VALUES ($1, $2, $3, $4, $5, $6)""", case_law_id, c["chunk_index"], c["content"], c.get("section_type", "other"), c.get("page_number"), c.get("embedding"), ) return len(chunks) async def store_precedent_chunks_hierarchical( case_law_id: UUID, chunks: list[dict], ) -> dict: """Replace precedent chunks for a case_law row (two-tier). Each input dict must carry: * ``role``: 'child' | 'parent' * ``local_id``: in-batch identifier (int) used to wire children to their parent's DB UUID * ``parent_local_id``: int (only for children) — references the ``local_id`` of the parent in this same batch. For parents, this is None. * ``chunk_index``, ``content``, ``section_type``, ``page_number`` * ``embedding``: required for children, None for parents Two-pass write inside a single transaction: 1. INSERT all parents (no FK back to children), capture ``local_id → DB UUID`` map. 2. INSERT all children with ``parent_chunk_id`` resolved. Returns ``{"parents": N, "children": M, "total": N+M}``. 
""" parents = [c for c in chunks if c.get("role") == "parent"] children = [c for c in chunks if c.get("role") == "child"] if not parents and not children: return {"parents": 0, "children": 0, "total": 0} pool = await get_pool() async with pool.acquire() as conn: async with conn.transaction(): await conn.execute( "DELETE FROM precedent_chunks WHERE case_law_id = $1", case_law_id, ) # Pass 1: parents — embedding intentionally NULL (parents # aren't matched on; they only carry retrieval context). local_to_uuid: dict[int, UUID] = {} for p in parents: row = await conn.fetchrow( """INSERT INTO precedent_chunks (case_law_id, chunk_index, content, section_type, page_number, embedding, chunk_role, parent_chunk_id) VALUES ($1, $2, $3, $4, $5, NULL, 'parent', NULL) RETURNING id""", case_law_id, p["chunk_index"], p["content"], p.get("section_type", "other"), p.get("page_number"), ) local_to_uuid[int(p["local_id"])] = row["id"] # Pass 2: children with resolved parent_chunk_id. for c in children: parent_uuid = local_to_uuid.get( int(c["parent_local_id"]) ) if c.get("parent_local_id") is not None else None await conn.execute( """INSERT INTO precedent_chunks (case_law_id, chunk_index, content, section_type, page_number, embedding, chunk_role, parent_chunk_id) VALUES ($1, $2, $3, $4, $5, $6, 'child', $7)""", case_law_id, c["chunk_index"], c["content"], c.get("section_type", "other"), c.get("page_number"), c.get("embedding"), parent_uuid, ) return { "parents": len(parents), "children": len(children), "total": len(parents) + len(children), } async def list_precedent_chunks( case_law_id: UUID, section_types: tuple[str, ...] 
| None = None, ) -> list[dict]: pool = await get_pool() if section_types: rows = await pool.fetch( """SELECT id, chunk_index, content, section_type, page_number, halacha_extracted_at FROM precedent_chunks WHERE case_law_id = $1 AND section_type = ANY($2::text[]) ORDER BY chunk_index""", case_law_id, list(section_types), ) else: rows = await pool.fetch( """SELECT id, chunk_index, content, section_type, page_number, halacha_extracted_at FROM precedent_chunks WHERE case_law_id = $1 ORDER BY chunk_index""", case_law_id, ) return [dict(r) for r in rows] async def delete_halachot(case_law_id: UUID) -> int: pool = await get_pool() result = await pool.execute( "DELETE FROM halachot WHERE case_law_id = $1", case_law_id, ) # result is e.g. "DELETE 5" — extract the number. try: return int(result.split()[-1]) except (ValueError, IndexError): return 0 async def store_halachot(case_law_id: UUID, halachot: list[dict]) -> int: """Bulk-insert extracted halachot. Each halacha enters with review_status determined by extractor confidence vs ``config.HALACHA_AUTO_APPROVE_THRESHOLD``: - confidence >= threshold → 'approved' (visible to search immediately) - else → 'pending_review' (chair must approve manually) The auto-approval reviewer is recorded as 'auto' for traceability. 
""" if not halachot: return 0 threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD pool = await get_pool() async with pool.acquire() as conn: for i, h in enumerate(halachot): confidence = float(h.get("confidence", 0.0)) auto_approve = confidence >= threshold review_status = "approved" if auto_approve else "pending_review" reviewer = ( f"auto-approved (confidence ≥ {threshold:.2f})" if auto_approve else None ) reviewed_at_clause = "now()" if auto_approve else "NULL" await conn.execute( f"""INSERT INTO halachot (case_law_id, halacha_index, rule_statement, rule_type, reasoning_summary, supporting_quote, page_reference, practice_areas, subject_tags, cites, confidence, quote_verified, embedding, review_status, reviewer, reviewed_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, {reviewed_at_clause})""", case_law_id, i, h["rule_statement"], h.get("rule_type", "interpretive"), h.get("reasoning_summary", ""), h["supporting_quote"], h.get("page_reference", ""), h.get("practice_areas", []), h.get("subject_tags", []), h.get("cites", []), confidence, h.get("quote_verified", False), h.get("embedding"), review_status, reviewer, ) return len(halachot) async def reset_halacha_extraction(case_law_id: UUID) -> None: """Force a clean re-extraction: wipe halachot + clear per-chunk checkpoints so every chunk is re-processed (used by explicit re-extract, not resume).""" pool = await get_pool() async with pool.acquire() as conn: async with conn.transaction(): await conn.execute("DELETE FROM halachot WHERE case_law_id = $1", case_law_id) await conn.execute( "UPDATE precedent_chunks SET halacha_extracted_at = NULL " "WHERE case_law_id = $1", case_law_id, ) async def mark_all_chunks_extracted(case_law_id: UUID) -> int: """Checkpoint every un-marked chunk of a precedent as extracted. Used to backfill pre-V25 precedents (halachot already exist but no chunk was checkpointed) so a resume run skips them instead of re-extracting (which would duplicate). 
Returns rows updated. """ pool = await get_pool() result = await pool.execute( "UPDATE precedent_chunks SET halacha_extracted_at = now() " "WHERE case_law_id = $1 AND halacha_extracted_at IS NULL", case_law_id, ) try: return int(result.split()[-1]) except (ValueError, IndexError): return 0 async def store_halachot_for_chunk( case_law_id: UUID, chunk_id: UUID, halachot: list[dict], ) -> int: """Persist ONE chunk's halachot and mark the chunk done — atomically. Crash-safe + resumable: each chunk's results land in the DB the moment it finishes, and the chunk is flagged (``halacha_extracted_at``) so a resumed run skips it. ``halacha_index`` continues from the current max so appends across chunks never collide. The chunk is marked even when ``halachot`` is empty (so resume skips genuinely-empty chunks too). Caller serializes calls (a single in-process store-lock) so the MAX read stays race-free. Two gates encode the strict rubric (docs/halacha-strict-rubric.md) so the corpus stays clean at the source instead of accumulating noise: * Auto-approve gate — a halacha auto-approves only if confidence ≥ threshold AND it carries no ``quality_flags`` (non_decision / truncated_quote / thin_restatement / quote_unverified). Flagged items route to ``pending_review`` regardless of confidence. * Dedup-on-insert — within the SAME precedent, a halacha is skipped if its normalized ``supporting_quote`` already exists, or its rule-embedding has cosine ≥ ``HALACHA_DEDUP_COSINE`` against an already-stored halacha. Returns the number of halachot actually INSERTED (after dedup skips). 
""" threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE # cosine sim → distance band_distance = 1.0 - config.HALACHA_DEDUP_BAND_COSINE # tail-band ceiling (#82.3) pool = await get_pool() inserted = 0 skipped = 0 async with pool.acquire() as conn: async with conn.transaction(): base = await conn.fetchval( "SELECT COALESCE(MAX(halacha_index), -1) + 1 FROM halachot " "WHERE case_law_id = $1", case_law_id, ) # Existing normalized quotes for exact-dedup (incl. within-batch). existing_quotes = { halacha_quality.normalize_text(r["supporting_quote"]) for r in await conn.fetch( "SELECT supporting_quote FROM halachot WHERE case_law_id = $1", case_law_id, ) } for h in halachot: norm_quote = halacha_quality.normalize_text(h["supporting_quote"]) # 1) exact normalized-quote duplicate within this precedent if norm_quote and norm_quote in existing_quotes: skipped += 1 continue # 2) semantic near-duplicate (rule embedding cosine) — fetch the # nearest same-precedent neighbor once so we can both auto-skip # (cosine ≥ DEDUP) and flag the lexical tail (#82.3). emb = h.get("embedding") flags = list(h.get("quality_flags") or []) if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0: neighbor = await conn.fetchrow( "SELECT rule_statement, (embedding <=> $2) AS dist " "FROM halachot WHERE case_law_id = $1 " "AND embedding IS NOT NULL " "ORDER BY embedding <=> $2 LIMIT 1", case_law_id, emb, ) if neighbor is not None: dist = float(neighbor["dist"]) if dist <= dedup_distance: skipped += 1 continue # tail band: below auto-skip but lexically near → flag. 
if (dist <= band_distance and halacha_quality.FLAG_NEAR_DUPLICATE not in flags and halacha_quality.lexical_near_duplicate( h["rule_statement"], neighbor["rule_statement"])): flags.append(halacha_quality.FLAG_NEAR_DUPLICATE) confidence = float(h.get("confidence", 0.0)) auto_approve = confidence >= threshold and not flags review_status = "approved" if auto_approve else "pending_review" reviewer = ( f"auto-approved (confidence ≥ {threshold:.2f})" if auto_approve else None ) reviewed_at_clause = "now()" if auto_approve else "NULL" await conn.execute( f"""INSERT INTO halachot (case_law_id, halacha_index, rule_statement, rule_type, reasoning_summary, supporting_quote, page_reference, practice_areas, subject_tags, cites, confidence, quote_verified, quality_flags, embedding, review_status, reviewer, reviewed_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, {reviewed_at_clause})""", case_law_id, base + inserted, h["rule_statement"], h.get("rule_type", "interpretive"), h.get("reasoning_summary", ""), h["supporting_quote"], h.get("page_reference", ""), h.get("practice_areas", []), h.get("subject_tags", []), h.get("cites", []), confidence, h.get("quote_verified", False), flags, h.get("embedding"), review_status, reviewer, ) existing_quotes.add(norm_quote) inserted += 1 await conn.execute( "UPDATE precedent_chunks SET halacha_extracted_at = now() " "WHERE id = $1", chunk_id, ) if skipped: logger.info( "store_halachot_for_chunk: case_law=%s chunk=%s — %d inserted, " "%d skipped as duplicates", case_law_id, chunk_id, inserted, skipped, ) return inserted async def list_halachot( case_law_id: UUID | None = None, review_status: str | None = None, practice_area: str | None = None, limit: int = 200, offset: int = 0, exclude_low_quality: bool = False, order_by_priority: bool = False, cluster: bool = False, include_equivalents: bool = False, ) -> list[dict]: """List halachot with optional triage controls (#84). 
exclude_low_quality — drop items carrying ANY quality_flag (application / truncated_quote / quote_unverified / non_decision / thin_restatement / nli_unsupported / near_duplicate). These belong in a 'needs extraction fix' bucket, not the chair's approve queue (#84.1). order_by_priority — replace FIFO with an active-learning order (#84.3): negatively-treated first, then most-uncertain (lowest confidence), then oldest — so the chair sees the highest-value decisions first. cluster — annotate each row with ``cluster_id`` + ``cluster_size`` (#84.2): same-precedent halachot within HALACHA_CLUSTER_COSINE form one group so the UI can collapse near-identical principles into a single review card. """ pool = await get_pool() conditions = [] params: list = [] idx = 1 if case_law_id is not None: conditions.append(f"h.case_law_id = ${idx}") params.append(case_law_id) idx += 1 if review_status: conditions.append(f"h.review_status = ${idx}") params.append(review_status) idx += 1 if practice_area: conditions.append(f"${idx} = ANY(h.practice_areas)") params.append(practice_area) idx += 1 if exclude_low_quality: # a clean item has an empty/NULL quality_flags array conditions.append("COALESCE(array_length(h.quality_flags, 1), 0) = 0") where_sql = f"WHERE {' AND '.join(conditions)}" if conditions else "" order_sql = ( "ORDER BY corroboration_negative DESC, h.confidence ASC NULLS LAST, " "h.created_at ASC" if order_by_priority else "ORDER BY h.case_law_id, h.halacha_index" ) params.extend([limit, offset]) sql = f""" SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement, h.rule_type, h.reasoning_summary, h.supporting_quote, h.page_reference, h.practice_areas, h.subject_tags, h.cites, h.confidence, h.quote_verified, h.quality_flags, h.review_status, h.reviewer, h.reviewed_at, h.created_at, h.updated_at, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, COALESCE(cor.corroboration_count, 0)::int AS corroboration_count, 
COALESCE(cor.corroboration_negative, false) AS corroboration_negative FROM halachot h LEFT JOIN case_law cl ON cl.id = h.case_law_id LEFT JOIN ( SELECT halacha_id, count(DISTINCT COALESCE(citing_case_law_id::text, citing_decision_id::text, source_citation_id::text)) FILTER (WHERE treatment IN ('followed','explained')) AS corroboration_count, bool_or(treatment IN ('distinguished','criticized','questioned','overruled')) AS corroboration_negative FROM halacha_citation_corroboration GROUP BY halacha_id ) cor ON cor.halacha_id = h.id {where_sql} {order_sql} LIMIT ${idx} OFFSET ${idx + 1} """ rows = await pool.fetch(sql, *params) out = [] for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() # authority is DERIVED from the source, never stored (INV-DM7) d["authority"] = halacha_quality.derive_authority(d.get("precedent_level")) out.append(d) if cluster and out: await _annotate_clusters(pool, out) if include_equivalents and out: await _annotate_equivalents(pool, out) return out async def _annotate_clusters(pool, out: list[dict]) -> None: """Add cluster_id + cluster_size to each row (#84.2), display-only. Same-precedent halachot within HALACHA_CLUSTER_COSINE are unioned into one group. Singletons get their own id as cluster_id and size 1. 
Pairwise is confined to the returned set (cheap; the queue is ~hundreds of rows).""" ids = [d["id"] for d in out] max_dist = 1.0 - config.HALACHA_CLUSTER_COSINE pairs = await pool.fetch( "SELECT a.id AS a, b.id AS b FROM halachot a JOIN halachot b " "ON a.case_law_id = b.case_law_id AND a.id < b.id " "AND a.embedding IS NOT NULL AND b.embedding IS NOT NULL " "AND (a.embedding <=> b.embedding) <= $2 " "WHERE a.id = ANY($1::uuid[]) AND b.id = ANY($1::uuid[])", ids, max_dist, ) parent = {str(i): str(i) for i in ids} def find(x: str) -> str: while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x for p in pairs: ra, rb = find(str(p["a"])), find(str(p["b"])) if ra != rb: parent[ra] = rb from collections import Counter sizes = Counter(find(str(i)) for i in ids) for d in out: root = find(str(d["id"])) d["cluster_id"] = root d["cluster_size"] = sizes[root] async def update_halacha( halacha_id: UUID, review_status: str | None = None, reviewer: str = "", rule_statement: str | None = None, reasoning_summary: str | None = None, subject_tags: list[str] | None = None, practice_areas: list[str] | None = None, ) -> dict | None: """Update a halacha — used by the chair to approve/reject/edit.""" pool = await get_pool() set_parts: list[str] = [] params: list = [halacha_id] idx = 2 if review_status is not None: set_parts.append(f"review_status = ${idx}") params.append(review_status) idx += 1 if review_status in ("approved", "rejected", "published", "deferred"): set_parts.append(f"reviewed_at = now()") set_parts.append(f"reviewer = ${idx}") params.append(reviewer) idx += 1 if rule_statement is not None: set_parts.append(f"rule_statement = ${idx}") params.append(rule_statement) idx += 1 if reasoning_summary is not None: set_parts.append(f"reasoning_summary = ${idx}") params.append(reasoning_summary) idx += 1 if subject_tags is not None: set_parts.append(f"subject_tags = ${idx}") params.append(subject_tags) idx += 1 if practice_areas is not None: 
set_parts.append(f"practice_areas = ${idx}") params.append(practice_areas) idx += 1 if not set_parts: return None set_parts.append("updated_at = now()") # Exclude `embedding` — it's a numpy.ndarray of np.float32 that breaks # FastAPI's jsonable_encoder downstream (PATCH /api/halachot/{id}). # Callers that need it (none today) can re-fetch with get_halacha. sql = f""" UPDATE halachot SET {', '.join(set_parts)} WHERE id = $1 RETURNING id, case_law_id, halacha_index, rule_statement, rule_type, reasoning_summary, supporting_quote, page_reference, practice_areas, subject_tags, cites, confidence, quote_verified, quality_flags, review_status, reviewer, reviewed_at, created_at, updated_at """ row = await pool.fetchrow(sql, *params) return dict(row) if row else None # Statuses the chair can set via review (batch or single). 'deferred' = snooze: # stays out of the active library AND out of the default pending queue, without # the finality of 'rejected'. #84 review-queue triage. HALACHA_REVIEW_STATUSES = { "pending_review", "approved", "rejected", "published", "deferred", } async def update_halachot_batch( halacha_ids: list[str], review_status: str, reviewer: str = "", ) -> int: """Bulk-set review_status for many halachot in one atomic statement. Powers the #84 "approve/reject/defer the whole group" action — one request, one transaction, one refetch (vs N PATCH round-trips). Only the status + reviewer + reviewed_at are touched (no content edits in batch). Returns the number of rows updated. 
""" if not halacha_ids or review_status not in HALACHA_REVIEW_STATUSES: return 0 ids = [UUID(str(i)) for i in halacha_ids] stamp = review_status in ("approved", "rejected", "published", "deferred") pool = await get_pool() result = await pool.execute( f"""UPDATE halachot SET review_status = $2, updated_at = now() {", reviewed_at = now(), reviewer = $3" if stamp else ""} WHERE id = ANY($1::uuid[])""", ids, review_status, *( [reviewer] if stamp else [] ), ) try: return int(result.split()[-1]) except (ValueError, IndexError): return 0 async def approve_halacha_by_corroboration( halacha_id: UUID, n_sources: int, min_cites: int, ) -> bool: """Approve a halacha on citation corroboration — ONLY if it is currently awaiting the chair (``pending_review``). Never touches ``published`` / ``rejected`` / already-``approved`` (INV-COR5: the chair gate is preserved for everything else). The reviewer records the corroboration basis as provenance (INV-COR6). Returns True iff a row actually transitioned.""" pool = await get_pool() reviewer = f"corroborated ({n_sources} judicial citations ≥ {min_cites})" row = await pool.fetchrow( "UPDATE halachot SET review_status='approved', reviewer=$2, " "reviewed_at=now(), updated_at=now() " "WHERE id=$1 AND review_status='pending_review' RETURNING id", halacha_id, reviewer, ) return row is not None async def demote_halacha_overruled(halacha_id: UUID) -> bool: """Demote an APPROVED halacha back to the chair gate because a later citing court overruled it (INV-COR2). Acts only on ``approved`` → ``pending_review``; leaves ``published`` / ``rejected`` / already-``pending_review`` untouched. The reviewer note records why it re-entered the queue. 
Returns True iff a row transitioned.""" pool = await get_pool() row = await pool.fetchrow( "UPDATE halachot SET review_status='pending_review', " "reviewer='flagged: overruled by later citation (X11)', " "reviewed_at=NULL, updated_at=now() " "WHERE id=$1 AND review_status='approved' RETURNING id", halacha_id, ) return row is not None async def list_corroboration_grouped(case_law_id: UUID) -> dict[str, list[dict]]: """Per-halacha corroboration links for a cited precedent, in the ``{source_id, treatment}`` shape ``aggregate()`` consumes. The distinct citing source is keyed by case_law/decision id (falling back to the citation row id so two anonymous rows are not collapsed).""" pool = await get_pool() rows = await pool.fetch( "SELECT hcc.halacha_id::text AS halacha_id, " " COALESCE(hcc.citing_case_law_id::text, hcc.citing_decision_id::text, " " hcc.source_citation_id::text) AS source_id, " " hcc.treatment " "FROM halacha_citation_corroboration hcc " "JOIN halachot h ON h.id = hcc.halacha_id " "WHERE h.case_law_id = $1", case_law_id, ) out: dict[str, list[dict]] = {} for r in rows: out.setdefault(r["halacha_id"], []).append( {"source_id": r["source_id"], "treatment": r["treatment"]} ) return out async def precedents_with_halachot_and_incoming_citations() -> list[str]: """case_law ids that have at least one halacha AND at least one incoming citation (either graph) — the corroboration backfill target set.""" pool = await get_pool() rows = await pool.fetch( "SELECT c.id::text FROM case_law c " "WHERE EXISTS (SELECT 1 FROM halachot h WHERE h.case_law_id=c.id) " " AND (EXISTS (SELECT 1 FROM precedent_internal_citations p " " WHERE p.cited_case_law_id=c.id) " " OR EXISTS (SELECT 1 FROM case_law_citations cc " " WHERE cc.case_law_id=c.id))", ) return [r["id"] for r in rows] async def nearest_halacha_for_vector(case_law_id: UUID, vec: list[float]) -> tuple[str, float] | None: """Best-matching halacha of `case_law_id` for a context embedding (cosine).""" pool = await get_pool() 
row = await pool.fetchrow( "SELECT id::text AS id, 1 - (embedding <=> $2) AS sim " "FROM halachot WHERE case_law_id = $1 AND embedding IS NOT NULL " "ORDER BY embedding <=> $2 LIMIT 1", case_law_id, vec, ) return (row["id"], float(row["sim"])) if row else None async def incoming_citations_for_precedent(case_law_id: UUID) -> list[dict]: """All incoming citations (both graphs) with their context + source id.""" pool = await get_pool() rows = await pool.fetch( "SELECT id::text AS source_id, source_case_law_id::text AS citing_case_law_id, " " NULL::text AS citing_decision_id, match_context AS context " "FROM precedent_internal_citations WHERE cited_case_law_id = $1 " "UNION ALL " "SELECT id::text, NULL, decision_id::text, context_text " "FROM case_law_citations WHERE case_law_id = $1", case_law_id, ) return [dict(r) for r in rows] async def store_corroboration( halacha_id: str, source_id: str, citing_case_law_id, citing_decision_id, treatment: str, score: float, context: str, ) -> None: from uuid import UUID as _UUID pool = await get_pool() # asyncpg requires UUID objects for uuid-typed columns; convert non-None strings. 
h_id = _UUID(halacha_id) if isinstance(halacha_id, str) else halacha_id s_id = _UUID(source_id) if isinstance(source_id, str) else source_id cl_id = _UUID(citing_case_law_id) if (citing_case_law_id and isinstance(citing_case_law_id, str)) else citing_case_law_id d_id = _UUID(citing_decision_id) if (citing_decision_id and isinstance(citing_decision_id, str)) else citing_decision_id await pool.execute( "INSERT INTO halacha_citation_corroboration " "(halacha_id, citing_case_law_id, citing_decision_id, source_citation_id, treatment, match_score, match_context) " "VALUES ($1,$2,$3,$4,$5,$6,$7) " "ON CONFLICT (halacha_id, source_citation_id) DO UPDATE SET " "treatment=EXCLUDED.treatment, match_score=EXCLUDED.match_score", h_id, cl_id, d_id, s_id, treatment, score, context, ) # ── Parallel-authority (equivalent halachot) — #84.2 follow-up ─────────────── # # A NON-citation, symmetric link between halachot of different precedents that # state the same principle. Kept entirely separate from the citation corroboration # above so the citator's counts never include non-citation recurrences. def _equiv_order(a: UUID, b: UUID) -> tuple[UUID, UUID]: """Canonical ordering (halacha_a < halacha_b) so the pair is symmetric+unique.""" return (a, b) if str(a) < str(b) else (b, a) async def link_equivalent_halachot( a: UUID, b: UUID, *, cosine: float = 0.0, note: str = "", created_by: str = "", ) -> bool: """Record that two halachot (different precedents) state the same principle. Idempotent (symmetric UNIQUE). 
Returns False and does nothing if a == b or the two belong to the SAME precedent (parallel authority is cross-precedent by definition; within-precedent sameness is the dedup/cluster concern).""" if a == b: return False pool = await get_pool() same = await pool.fetchval( "SELECT (SELECT case_law_id FROM halachot WHERE id=$1) " " = (SELECT case_law_id FROM halachot WHERE id=$2)", a, b, ) if same: return False lo, hi = _equiv_order(a, b) await pool.execute( "INSERT INTO equivalent_halachot (halacha_a, halacha_b, cosine, note, created_by) " "VALUES ($1,$2,$3,$4,$5) ON CONFLICT (halacha_a, halacha_b) DO UPDATE SET " "cosine=GREATEST(equivalent_halachot.cosine, EXCLUDED.cosine), " "note=COALESCE(NULLIF(EXCLUDED.note,''), equivalent_halachot.note)", lo, hi, round(float(cosine), 3), note, created_by, ) return True async def unlink_equivalent_halachot(a: UUID, b: UUID) -> bool: pool = await get_pool() lo, hi = _equiv_order(a, b) res = await pool.execute( "DELETE FROM equivalent_halachot WHERE halacha_a=$1 AND halacha_b=$2", lo, hi, ) return res.endswith(" 1") async def list_equivalent_for_halacha(halacha_id: UUID) -> list[dict]: """The other halachot linked as parallel authority to this one (both sides).""" pool = await get_pool() rows = await pool.fetch( "SELECT e.cosine, h.id::text AS halacha_id, h.rule_statement, " " cl.case_number, cl.case_name " "FROM equivalent_halachot e " "JOIN halachot h ON h.id = CASE WHEN e.halacha_a=$1 THEN e.halacha_b ELSE e.halacha_a END " "JOIN case_law cl ON cl.id = h.case_law_id " "WHERE e.halacha_a=$1 OR e.halacha_b=$1 " "ORDER BY e.cosine DESC", halacha_id, ) return [ { "halacha_id": r["halacha_id"], "rule_statement": r["rule_statement"], "case_number": r["case_number"], "case_name": r["case_name"], "cosine": float(r["cosine"]) if r["cosine"] is not None else None, } for r in rows ] async def _annotate_equivalents(pool, out: list[dict]) -> None: """Attach an `equivalents` list to each row (#84.2) — parallel-authority links. 
Adds both directions, so when both halachot of a pair are on the same page each one lists the other.""" ids = [d["id"] for d in out] rows = await pool.fetch( "SELECT e.halacha_a, e.halacha_b, e.cosine, " " ha.rule_statement AS a_rule, cla.case_number AS a_case, " " hb.rule_statement AS b_rule, clb.case_number AS b_case " "FROM equivalent_halachot e " "JOIN halachot ha ON ha.id = e.halacha_a " "JOIN case_law cla ON cla.id = ha.case_law_id " "JOIN halachot hb ON hb.id = e.halacha_b " "JOIN case_law clb ON clb.id = hb.case_law_id " "WHERE e.halacha_a = ANY($1::uuid[]) OR e.halacha_b = ANY($1::uuid[])", ids, ) idset = {str(i) for i in ids} by_src: dict[str, list[dict]] = {} for r in rows: a, b = str(r["halacha_a"]), str(r["halacha_b"]) cos = float(r["cosine"]) if r["cosine"] is not None else None if a in idset: by_src.setdefault(a, []).append({ "halacha_id": b, "case_number": r["b_case"], "rule_statement": r["b_rule"], "cosine": cos}) if b in idset: by_src.setdefault(b, []).append({ "halacha_id": a, "case_number": r["a_case"], "rule_statement": r["a_rule"], "cosine": cos}) for d in out: d["equivalents"] = by_src.get(str(d["id"]), []) # ── Gold-set evaluation (#81.7 / #81.8) ────────────────────────────────────── async def goldset_create_sample( n: int = 150, batch: str = "default", reset: bool = False, ) -> dict: """Stratified sample of halachot (round-robin over case×rule_type) into a tagging batch. 
Idempotent (ON CONFLICT); ``reset`` clears the batch first.""" pool = await get_pool() if reset: await pool.execute("DELETE FROM halacha_goldset WHERE batch = $1", batch) rows = await pool.fetch( "SELECT id, case_law_id, rule_type FROM halachot WHERE rule_statement <> ''" ) from collections import defaultdict buckets: dict = defaultdict(list) for r in rows: buckets[(r["case_law_id"], r["rule_type"])].append(r["id"]) keys = list(buckets.values()) sample: list = [] i = 0 while len(sample) < n and any(keys): b = keys[i % len(keys)] if b: sample.append(b.pop()) i += 1 if i > n * 50: break inserted = 0 for hid in sample: res = await pool.execute( "INSERT INTO halacha_goldset (halacha_id, batch) VALUES ($1, $2) " "ON CONFLICT (halacha_id, batch) DO NOTHING", hid, batch, ) if res.endswith(" 1"): inserted += 1 total = await pool.fetchval( "SELECT count(*) FROM halacha_goldset WHERE batch = $1", batch) return {"batch": batch, "inserted": inserted, "total": total} async def goldset_list(batch: str = "default") -> list[dict]: """Gold-set items joined with the halacha content + the machine's labels.""" pool = await get_pool() rows = await pool.fetch( "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, " " g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, " " g.ai_is_holding, g.ai_correct_type, g.ai_rationale, g.ai_generated_at, " " h.rule_statement, h.supporting_quote, h.reasoning_summary, " " h.rule_type, h.confidence, h.quality_flags, h.review_status, " " cl.case_number, cl.case_name, cl.source_type, cl.precedent_level " "FROM halacha_goldset g JOIN halachot h ON h.id = g.halacha_id " "LEFT JOIN case_law cl ON cl.id = h.case_law_id " "WHERE g.batch = $1 ORDER BY g.created_at, g.id", batch, ) out = [] for r in rows: d = dict(r) if d.get("tagged_at") is not None: d["tagged_at"] = d["tagged_at"].isoformat() if d.get("ai_generated_at") is not None: d["ai_generated_at"] = d["ai_generated_at"].isoformat() if d.get("confidence") is not None: d["confidence"] = 
float(d["confidence"]) # authority is DERIVED from the source, never stored (INV-DM7) d["authority"] = halacha_quality.derive_authority(d.get("precedent_level")) out.append(d) return out async def goldset_set_ai_recommendation( goldset_id: UUID, *, ai_is_holding: bool | None, ai_correct_type: str = "", ai_rationale: str = "", ) -> None: """Store the independent AI second-opinion for a gold-set item (QA aid).""" pool = await get_pool() await pool.execute( "UPDATE halacha_goldset SET ai_is_holding = $2, ai_correct_type = $3, " "ai_rationale = $4, ai_generated_at = now() WHERE id = $1", goldset_id, ai_is_holding, ai_correct_type, ai_rationale, ) async def goldset_tag( goldset_id: UUID, *, is_holding: bool | None = None, correct_type: str | None = None, quote_complete: bool | None = None, tagged_by: str = "chair", ) -> dict | None: """Save one human tag (partial — only provided fields change).""" pool = await get_pool() sets = ["tagged_by = $2", "tagged_at = now()"] params: list = [goldset_id, tagged_by] i = 3 if is_holding is not None: sets.append(f"is_holding = ${i}"); params.append(is_holding); i += 1 if correct_type is not None: sets.append(f"correct_type = ${i}"); params.append(correct_type); i += 1 if quote_complete is not None: sets.append(f"quote_complete = ${i}"); params.append(quote_complete); i += 1 row = await pool.fetchrow( f"UPDATE halacha_goldset SET {', '.join(sets)} WHERE id = $1 RETURNING *", *params, ) return dict(row) if row else None async def goldset_score(batch: str = "default") -> dict: """Measure each extraction validator against the human tags (#81.8). A validator flag predicts "NOT a clean holding"; ground truth is is_holding == false. 
truncated_quote is scored against quote_complete.""" items = await goldset_list(batch) labeled = [r for r in items if r.get("is_holding") is not None] from collections import defaultdict counters: dict = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0, "tn": 0}) def tally(name: str, predicted_bad: bool, truly_bad: bool) -> None: c = counters[name] key = ("tp" if truly_bad else "fp") if predicted_bad else ("fn" if truly_bad else "tn") c[key] += 1 for r in labeled: rule = r.get("rule_statement") or "" quote = r.get("supporting_quote") or "" rtype = r.get("rule_type") or "interpretive" qc = r["quote_complete"] if r["quote_complete"] is not None else True truly_bad = r["is_holding"] is False flags = halacha_quality.compute_quality_flags(rule, quote, "", qc, rtype) tally("any_flag", bool(flags), truly_bad) tally("application", halacha_quality.FLAG_APPLICATION in flags, truly_bad) tally("non_decision", halacha_quality.FLAG_NON_DECISION in flags, truly_bad) tally("thin_restatement", halacha_quality.FLAG_THIN_RESTATEMENT in flags, truly_bad) tally("truncated_quote", halacha_quality.is_quote_truncated(quote), qc is False) def prf(c: dict) -> dict: p = c["tp"] / (c["tp"] + c["fp"]) if (c["tp"] + c["fp"]) else 0.0 rec = c["tp"] / (c["tp"] + c["fn"]) if (c["tp"] + c["fn"]) else 0.0 f1 = 2 * p * rec / (p + rec) if (p + rec) else 0.0 return {"precision": round(p, 3), "recall": round(rec, 3), "f1": round(f1, 3), **c} return { "batch": batch, "total": len(items), "labeled": len(labeled), "validators": {name: prf(c) for name, c in counters.items()}, } async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]: """Return all corroboration rows for one halacha, ordered by match_score DESC.""" pool = await get_pool() rows = await pool.fetch( "SELECT treatment, match_score, match_context, citing_case_law_id::text, " " citing_decision_id::text, created_at " "FROM halacha_citation_corroboration WHERE halacha_id = $1 " "ORDER BY match_score DESC", halacha_id, ) return [ { 
"treatment": r["treatment"], "match_score": float(r["match_score"]) if r["match_score"] is not None else None, "match_context": r["match_context"], "citing_case_law_id": r["citing_case_law_id"], "citing_decision_id": r["citing_decision_id"], "created_at": r["created_at"].isoformat() if r["created_at"] else None, } for r in rows ] async def search_precedent_library_semantic( query_embedding: list[float], practice_area: str = "", court: str = "", precedent_level: str = "", appeal_subtype: str = "", is_binding: bool | None = None, subject_tag: str = "", limit: int = 10, include_halachot: bool = True, source_kind: str = "external_upload", district: str = "", chair_name: str = "", ) -> list[dict]: """Semantic search over precedents filtered by source_kind. source_kind='external_upload' → court rulings (default) source_kind='internal_committee' → appeals-committee decisions Returns merged halachot + chunks. Halachot are pre-distilled rules, so they get a small score boost. Only ``approved`` / ``published`` halachot are visible (per chair-review policy). Chunks are visible regardless of halacha review status. 
""" pool = await get_pool() halacha_filters = [ "h.review_status IN ('approved', 'published')", f"cl.source_kind = '{source_kind}'", "cl.searchable = true", ] chunk_filters = [f"cl.source_kind = '{source_kind}'", "cl.searchable = true"] h_params: list = [query_embedding, limit] c_params: list = [query_embedding, limit] h_idx = 3 c_idx = 3 if practice_area: halacha_filters.append(f"${h_idx} = ANY(h.practice_areas)") h_params.append(practice_area) h_idx += 1 chunk_filters.append(f"cl.practice_area = ${c_idx}") c_params.append(practice_area) c_idx += 1 if court: halacha_filters.append(f"cl.court ILIKE ${h_idx}") h_params.append(f"%{court}%") h_idx += 1 chunk_filters.append(f"cl.court ILIKE ${c_idx}") c_params.append(f"%{court}%") c_idx += 1 if precedent_level: halacha_filters.append(f"cl.precedent_level = ${h_idx}") h_params.append(precedent_level) h_idx += 1 chunk_filters.append(f"cl.precedent_level = ${c_idx}") c_params.append(precedent_level) c_idx += 1 if appeal_subtype: halacha_filters.append(f"cl.appeal_subtype = ${h_idx}") h_params.append(appeal_subtype) h_idx += 1 chunk_filters.append(f"cl.appeal_subtype = ${c_idx}") c_params.append(appeal_subtype) c_idx += 1 if is_binding is not None: halacha_filters.append(f"cl.is_binding = ${h_idx}") h_params.append(is_binding) h_idx += 1 chunk_filters.append(f"cl.is_binding = ${c_idx}") c_params.append(is_binding) c_idx += 1 if subject_tag: halacha_filters.append(f"${h_idx} = ANY(h.subject_tags)") h_params.append(subject_tag) h_idx += 1 if district: halacha_filters.append(f"cl.district = ${h_idx}") h_params.append(district) h_idx += 1 chunk_filters.append(f"cl.district = ${c_idx}") c_params.append(district) c_idx += 1 if chair_name: halacha_filters.append(f"cl.chair_name = ${h_idx}") h_params.append(chair_name) h_idx += 1 chunk_filters.append(f"cl.chair_name = ${c_idx}") c_params.append(chair_name) c_idx += 1 halacha_sql = f""" SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement, h.reasoning_summary, 
h.supporting_quote, h.page_reference, h.practice_areas, h.subject_tags, h.confidence, h.rule_type, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.chair_name, cl.district, 1 - (h.embedding <=> $1) AS score FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id WHERE {' AND '.join(halacha_filters)} AND h.embedding IS NOT NULL ORDER BY h.embedding <=> $1 LIMIT $2 """ # Parent-doc retrieval (V17 / TaskMaster #48): the LEFT JOIN # surfaces each chunk's parent_chunk's content alongside it. When # ``config.PARENT_DOC_RETRIEVAL_ENABLED`` is true *and* the row has # a non-null parent, the post-processing loop swaps in the parent's # content so the writer sees the broader passage instead of the # 300-token sliver that matched. Legacy rows (parent_chunk_id NULL) # are unaffected — the JOIN returns NULL parent_* and the swap is a # no-op. Index ``idx_precedent_chunks_role`` is not used here # intentionally: filtering on chunk_role='child' would exclude # legacy single-tier rows that default to 'child' but have no # parent; an embedding-IS-NOT-NULL filter is equivalent because # parents store NULL embeddings. chunk_sql = f""" SELECT pc.id AS chunk_id, pc.case_law_id, pc.content, pc.section_type, pc.page_number, pc.parent_chunk_id, parent.content AS parent_content, parent.section_type AS parent_section_type, parent.page_number AS parent_page_number, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.practice_area, cl.chair_name, cl.district, 1 - (pc.embedding <=> $1) AS score FROM precedent_chunks pc JOIN case_law cl ON cl.id = pc.case_law_id LEFT JOIN precedent_chunks parent ON parent.id = pc.parent_chunk_id WHERE {' AND '.join(chunk_filters)} AND pc.embedding IS NOT NULL -- #55: exclude tiny fragment chunks (artifacts of pre-fix -- mid-sentence header splits) that carry no retrievable signal. 
AND length(trim(pc.content)) >= 50 ORDER BY pc.embedding <=> $1 LIMIT $2 """ results: list[dict] = [] if include_halachot: rows = await pool.fetch(halacha_sql, *h_params) for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() # Dynamic rule-level boost: scales with extractor confidence # so high-conf halachot rank higher than low-conf ones. # conf=0.78 → +0.047, conf=0.90 → +0.054, conf=0.95 → +0.057 # Calibrated so the average (≈0.85) stays at +0.05 (legacy value). _conf = float(d.get("confidence") or 0.0) d["score"] = float(d["score"]) + max(_conf * 0.06, 0.0) d["type"] = "halacha" # authority is DERIVED from the source, never stored (INV-DM7) d["authority"] = halacha_quality.derive_authority(d.get("precedent_level")) results.append(d) rows = await pool.fetch(chunk_sql, *c_params) for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() d["score"] = float(d["score"]) d["type"] = "passage" _maybe_swap_parent(d) results.append(d) results.sort(key=lambda x: x["score"], reverse=True) # Dedupe: when multiple child hits share the same parent, we'd # otherwise return duplicate parent content. Keep the highest- # scoring hit per parent (skip if parent swap disabled or row has # no parent — chunk_id alone remains unique). return _dedupe_by_parent(results, limit) def _maybe_swap_parent(row: dict) -> None: """Promote parent content into ``content`` when the flag is on and the row has a non-NULL parent. Mutates ``row`` in place. Adds debug fields ``child_content`` / ``child_section_type`` / ``child_page_number`` so callers can see what originally matched. Strips the ``parent_*`` keys that come back from the LEFT JOIN — they're an implementation detail of the swap. 
""" parent_content = row.pop("parent_content", None) parent_section = row.pop("parent_section_type", None) parent_page = row.pop("parent_page_number", None) if ( config.PARENT_DOC_RETRIEVAL_ENABLED and row.get("parent_chunk_id") is not None and parent_content ): row["child_content"] = row.get("content") row["child_section_type"] = row.get("section_type") row["child_page_number"] = row.get("page_number") row["content"] = parent_content # Parent's section_type is authoritative for the swapped row # (children inherit from their parent, but a parent that spans # a boundary uses its first section's type — same convention). if parent_section: row["section_type"] = parent_section if parent_page is not None: row["page_number"] = parent_page row["parent_swap"] = True def _dedupe_by_parent(rows: list[dict], limit: int) -> list[dict]: """When parent-doc swap is active, multiple children sharing a parent collapse to one parent row (the highest-scored child wins). Rows without a parent (legacy chunks, halachot) pass through unchanged. """ if not config.PARENT_DOC_RETRIEVAL_ENABLED: return rows[:limit] seen_parents: set = set() out: list[dict] = [] for r in rows: pid = r.get("parent_chunk_id") if pid and r.get("parent_swap"): if pid in seen_parents: continue seen_parents.add(pid) out.append(r) if len(out) >= limit: break return out async def search_precedent_library_lexical( *, query: str, practice_area: str = "", court: str = "", precedent_level: str = "", appeal_subtype: str = "", is_binding: bool | None = None, subject_tag: str = "", source_kind: str = "external_upload", district: str = "", chair_name: str = "", limit: int = 30, include_halachot: bool = True, ) -> list[dict]: """Lexical (BM25-like) search via ``ts_rank_cd`` over ``content_tsv`` and ``rule_tsv`` (V12 columns). Mirrors the filter set of :func:`search_precedent_library_semantic` so the two layers can be fused 1:1 by rank in :mod:`hybrid_search` via RRF. 
Why ``plainto_tsquery``: it accepts free-text input, lowercases, and AND-joins the terms — matches the bi-encoder's "all words contribute" assumption better than ``websearch_to_tsquery`` (which inserts ORs). Empty / stopword-only queries return zero rows (no error). Why ``ts_rank_cd``: cover density variant — rewards documents where the query terms appear close together (e.g. "1461/20 אנטרים" matches the same paragraph). Higher is more relevant. """ if not (query or "").strip(): return [] pool = await get_pool() halacha_filters = [ "h.review_status IN ('approved', 'published')", f"cl.source_kind = '{source_kind}'", "cl.searchable = true", ] chunk_filters = [f"cl.source_kind = '{source_kind}'", "cl.searchable = true"] # $1 = query, $2 = limit. Filters append starting at $3. h_params: list = [query, limit] c_params: list = [query, limit] h_idx = 3 c_idx = 3 if practice_area: halacha_filters.append(f"${h_idx} = ANY(h.practice_areas)") h_params.append(practice_area) h_idx += 1 chunk_filters.append(f"cl.practice_area = ${c_idx}") c_params.append(practice_area) c_idx += 1 if court: halacha_filters.append(f"cl.court ILIKE ${h_idx}") h_params.append(f"%{court}%") h_idx += 1 chunk_filters.append(f"cl.court ILIKE ${c_idx}") c_params.append(f"%{court}%") c_idx += 1 if precedent_level: halacha_filters.append(f"cl.precedent_level = ${h_idx}") h_params.append(precedent_level) h_idx += 1 chunk_filters.append(f"cl.precedent_level = ${c_idx}") c_params.append(precedent_level) c_idx += 1 if appeal_subtype: halacha_filters.append(f"cl.appeal_subtype = ${h_idx}") h_params.append(appeal_subtype) h_idx += 1 chunk_filters.append(f"cl.appeal_subtype = ${c_idx}") c_params.append(appeal_subtype) c_idx += 1 if is_binding is not None: halacha_filters.append(f"cl.is_binding = ${h_idx}") h_params.append(is_binding) h_idx += 1 chunk_filters.append(f"cl.is_binding = ${c_idx}") c_params.append(is_binding) c_idx += 1 if subject_tag: halacha_filters.append(f"${h_idx} = ANY(h.subject_tags)") 
h_params.append(subject_tag) h_idx += 1 if district: halacha_filters.append(f"cl.district = ${h_idx}") h_params.append(district) h_idx += 1 chunk_filters.append(f"cl.district = ${c_idx}") c_params.append(district) c_idx += 1 if chair_name: halacha_filters.append(f"cl.chair_name = ${h_idx}") h_params.append(chair_name) h_idx += 1 chunk_filters.append(f"cl.chair_name = ${c_idx}") c_params.append(chair_name) c_idx += 1 halacha_sql = f""" SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement, h.reasoning_summary, h.supporting_quote, h.page_reference, h.practice_areas, h.subject_tags, h.confidence, h.rule_type, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.chair_name, cl.district, GREATEST( ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)), ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1)) ) + CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1) THEN 1.0 ELSE 0.0 END AS score FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id WHERE {' AND '.join(halacha_filters)} AND (h.rule_tsv @@ plainto_tsquery('simple', $1) OR cl.meta_tsv @@ plainto_tsquery('simple', $1)) ORDER BY score DESC LIMIT $2 """ # Parent-doc retrieval (V17) — same LEFT JOIN strategy as the # semantic side. The tsvector match still runs over the child's # ``content_tsv``; only the *returned* content is promoted to the # parent when the flag is on and a parent exists. See # :func:`search_precedent_library_semantic` for the rationale. # We intentionally restrict matching to chunks with an embedding # (i.e. children + legacy single-tier rows). Hierarchical parents # store NULL embeddings, so even though their ``content_tsv`` is # populated they're excluded here — preventing a parent from # matching directly and then being "swapped" with itself. 
chunk_sql = f""" SELECT pc.id AS chunk_id, pc.case_law_id, pc.content, pc.section_type, pc.page_number, pc.parent_chunk_id, parent.content AS parent_content, parent.section_type AS parent_section_type, parent.page_number AS parent_page_number, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.practice_area, cl.chair_name, cl.district, GREATEST( ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)), ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1)) ) + CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1) THEN 1.0 ELSE 0.0 END AS score FROM precedent_chunks pc JOIN case_law cl ON cl.id = pc.case_law_id LEFT JOIN precedent_chunks parent ON parent.id = pc.parent_chunk_id WHERE {' AND '.join(chunk_filters)} AND pc.embedding IS NOT NULL -- #55: exclude tiny fragment chunks (see semantic query above). AND length(trim(pc.content)) >= 50 AND (pc.content_tsv @@ plainto_tsquery('simple', $1) OR cl.meta_tsv @@ plainto_tsquery('simple', $1)) ORDER BY score DESC LIMIT $2 """ results: list[dict] = [] if include_halachot: rows = await pool.fetch(halacha_sql, *h_params) for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() d["score"] = float(d["score"]) d["type"] = "halacha" results.append(d) rows = await pool.fetch(chunk_sql, *c_params) for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() d["score"] = float(d["score"]) d["type"] = "passage" _maybe_swap_parent(d) results.append(d) results.sort(key=lambda x: x["score"], reverse=True) return _dedupe_by_parent(results, limit) async def precedent_library_stats() -> dict: """Aggregate stats for the /precedents stats tab.""" pool = await get_pool() async with pool.acquire() as conn: total = await conn.fetchval( "SELECT COUNT(*) FROM case_law" ) by_practice = await conn.fetch( """SELECT practice_area, COUNT(*) AS n FROM case_law GROUP BY practice_area ORDER BY n 
# ── V8: extraction request queue helpers ─────────────────────────


async def request_metadata_extraction(case_law_id: UUID) -> bool:
    """Stamp ``metadata_extraction_requested_at`` for the local MCP worker
    to pick up. Returns False if the row is missing.

    Originally restricted to ``source_kind='external_upload'`` (see git
    blame). Opened to all source kinds 2026-05-06 — internal_committee
    rows can also need re-extraction (e.g. corrupted subject_tags from
    an early ingest pipeline). The extractor itself preserves user values
    (``precedent_metadata_extractor.extract_and_apply`` only fills empty
    fields), so this is safe.
    """
    pool = await get_pool()
    # Reset the status to 'pending' alongside the timestamp so a re-request
    # after a prior 'completed'/'failed' run shows the "queued" badge again
    # in the UI instead of a stale terminal badge.
    result = await pool.execute(
        "UPDATE case_law SET metadata_extraction_requested_at = now(), "
        "metadata_extraction_status = 'pending' "
        "WHERE id = $1",
        case_law_id,
    )
    return result == "UPDATE 1"


async def request_halacha_extraction(case_law_id: UUID) -> bool:
    """Same but for halacha extraction. Returns False if the row is missing.

    See note on :func:`request_metadata_extraction` re: opening to all
    source kinds.
    """
    pool = await get_pool()
    result = await pool.execute(
        "UPDATE case_law SET halacha_extraction_requested_at = now() "
        "WHERE id = $1",
        case_law_id,
    )
    return result == "UPDATE 1"


async def list_pending_extraction_requests(
    kind: str = "metadata",  # 'metadata' | 'halacha'
    limit: int = 20,
) -> list[dict]:
    """Return rows requesting extraction, oldest request first.

    The MCP worker drains the queue in order: process → clear timestamp.
    Any ``kind`` other than ``'metadata'`` selects the halacha queue.
    ``date`` and ``requested_at`` are returned as ISO-8601 strings.
    """
    # The column name comes from this fixed pair, never from caller text,
    # so interpolating it into the SQL below is safe.
    col = (
        "metadata_extraction_requested_at"
        if kind == "metadata"
        else "halacha_extraction_requested_at"
    )
    pool = await get_pool()
    # Drop the legacy ``source_kind = 'external_upload'`` filter — without it
    # internal_committee rows could be stamped (we opened that gate in
    # request_metadata_extraction / request_halacha_extraction) but stayed
    # invisible to the worker forever.
    rows = await pool.fetch(
        f"""SELECT id, case_number, case_name, court, date,
                   practice_area, is_binding, {col} AS requested_at
            FROM case_law
            WHERE {col} IS NOT NULL
            ORDER BY {col} ASC
            LIMIT $1""",
        limit,
    )
    out = []
    for r in rows:
        d = dict(r)
        if d.get("date") is not None:
            d["date"] = d["date"].isoformat()
        if d.get("requested_at") is not None:
            d["requested_at"] = d["requested_at"].isoformat()
        out.append(d)
    return out


async def extraction_queue_status() -> dict:
    """Pending-extraction queue depth per kind (INV-TOOL4 visibility / GAP-45).

    Surfaces the otherwise-hidden queue that ``process_pending_extractions``
    drains: how many case_law rows still carry a metadata/halacha extraction
    request, and the age of the oldest one. Read-only — does not drain.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        meta = await conn.fetchrow(
            "SELECT COUNT(*) AS n, MIN(metadata_extraction_requested_at) AS oldest "
            "FROM case_law WHERE metadata_extraction_requested_at IS NOT NULL"
        )
        hal = await conn.fetchrow(
            "SELECT COUNT(*) AS n, MIN(halacha_extraction_requested_at) AS oldest "
            "FROM case_law WHERE halacha_extraction_requested_at IS NOT NULL"
        )

    def _fmt(r: dict) -> dict:
        oldest = r["oldest"]
        return {"pending": r["n"], "oldest_request": oldest.isoformat() if oldest else None}

    return {"metadata": _fmt(meta), "halacha": _fmt(hal)}


async def clear_extraction_request(
    case_law_id: UUID,
    kind: str = "metadata",
) -> None:
    """Clear the extraction-request timestamp of one case_law row.

    Any ``kind`` other than ``'metadata'`` clears the halacha timestamp.
    """
    col = (
        "metadata_extraction_requested_at"
        if kind == "metadata"
        else "halacha_extraction_requested_at"
    )
    pool = await get_pool()
    await pool.execute(
        f"UPDATE case_law SET {col} = NULL WHERE id = $1",
        case_law_id,
    )


# ── V9: Multimodal page image embeddings ─────────────────────────


async def store_document_image_embeddings(
    document_id: UUID,
    case_id: UUID | None,
    page_records: list[dict],
    model_name: str = "voyage-multimodal-3",
) -> int:
    """Replace per-page image embeddings for a document.

    Each ``page_records`` entry: ``{page_number, embedding, image_thumbnail_path}``.
    Embeddings should already be 1024-dim lists (or None for skipped pages).

    The delete and the inserts run in one transaction, so a failure leaves
    the previously stored embeddings untouched instead of a partial set.

    Returns:
        The number of page records written.

    Raises:
        KeyError: if a record lacks ``page_number`` (raised before any write).
    """
    # Build the rows first so a malformed record fails before the DELETE.
    records = [
        (
            document_id, case_id, r["page_number"],
            r.get("embedding"), r.get("image_thumbnail_path"), model_name,
        )
        for r in page_records
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM document_image_embeddings WHERE document_id = $1",
                document_id,
            )
            if records:
                await conn.executemany(
                    """INSERT INTO document_image_embeddings
                           (document_id, case_id, page_number, embedding,
                            image_thumbnail_path, model_name)
                       VALUES ($1, $2, $3, $4, $5, $6)""",
                    records,
                )
    return len(page_records)


async def store_precedent_image_embeddings(
    case_law_id: UUID,
    page_records: list[dict],
    model_name: str = "voyage-multimodal-3",
) -> int:
    """Same pattern as store_document_image_embeddings but for precedents.

    Replaces all per-page image embeddings of one ``case_law`` row inside a
    single transaction and returns the number of page records written.
    """
    records = [
        (
            case_law_id, r["page_number"],
            r.get("embedding"), r.get("image_thumbnail_path"), model_name,
        )
        for r in page_records
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM precedent_image_embeddings WHERE case_law_id = $1",
                case_law_id,
            )
            if records:
                await conn.executemany(
                    """INSERT INTO precedent_image_embeddings
                           (case_law_id, page_number, embedding,
                            image_thumbnail_path, model_name)
                       VALUES ($1, $2, $3, $4, $5)""",
                    records,
                )
    return len(page_records)
die.image_thumbnail_path, d.title AS document_title, c.case_number, 1 - (die.embedding <=> $1) AS score FROM document_image_embeddings die JOIN documents d ON d.id = die.document_id JOIN cases c ON c.id = die.case_id {where} ORDER BY die.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, *params) return [dict(r) for r in rows] async def search_precedent_images_similar( query_embedding: list[float], limit: int = 10, practice_area: str = "", court: str = "", precedent_level: str = "", appeal_subtype: str = "", is_binding: bool | None = None, ) -> list[dict]: """Cosine search over per-page image embeddings of precedent rulings.""" pool = await get_pool() conditions: list[str] = ["cl.source_kind = 'external_upload'"] params: list = [query_embedding, limit] idx = 3 if practice_area: conditions.append(f"cl.practice_area = ${idx}") params.append(practice_area); idx += 1 if court: conditions.append(f"cl.court ILIKE ${idx}") params.append(f"%{court}%"); idx += 1 if precedent_level: conditions.append(f"cl.precedent_level = ${idx}") params.append(precedent_level); idx += 1 if appeal_subtype: conditions.append(f"cl.appeal_subtype = ${idx}") params.append(appeal_subtype); idx += 1 if is_binding is not None: conditions.append(f"cl.is_binding = ${idx}") params.append(is_binding); idx += 1 where = " AND ".join(conditions) sql = f""" SELECT pie.case_law_id, pie.page_number, pie.image_thumbnail_path, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.practice_area, 1 - (pie.embedding <=> $1) AS score FROM precedent_image_embeddings pie JOIN case_law cl ON cl.id = pie.case_law_id WHERE {where} ORDER BY pie.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, *params) out = [] for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() out.append(d) return out async def search_similar_hybrid( 
query_text_embedding: list[float], query_image_embedding: list[float], limit: int = 10, fetch_k: int = 30, text_weight: float = 0.65, case_id: UUID | None = None, section_type: str | None = None, practice_area: str | None = None, appeal_subtype: str | None = None, ) -> list[dict]: """Weighted merge of text-chunk and per-page image search. Same (document_id, page_number) → boost text chunk by image score on that page. Image-only pages with no overlapping text chunk are surfaced as ``match_type='image'`` so dense scanned content still appears in results. """ img_weight = 1.0 - text_weight text_rows = await search_similar( query_text_embedding, limit=fetch_k, case_id=case_id, section_type=section_type, practice_area=practice_area, appeal_subtype=appeal_subtype, ) img_rows = await search_document_images_similar( query_image_embedding, limit=fetch_k, case_id=case_id, practice_area=practice_area, appeal_subtype=appeal_subtype, ) img_by_page: dict[tuple, dict] = { (str(r["document_id"]), r["page_number"]): r for r in img_rows } seen: set = set() merged: list[dict] = [] for r in text_rows: page = r.get("page_number") key = (str(r["document_id"]), page) if page is not None else None img_hit = img_by_page.get(key) if key else None text_score = float(r["score"]) image_score = float(img_hit["score"]) if img_hit else 0.0 d = dict(r) d["text_score"] = text_score d["image_score"] = image_score d["score"] = text_score * text_weight + image_score * img_weight d["match_type"] = "text+image" if img_hit else "text" if img_hit: d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path") merged.append(d) if key: seen.add(key) for r in img_rows: key = (str(r["document_id"]), r["page_number"]) if key in seen: continue d = dict(r) d["text_score"] = 0.0 d["image_score"] = float(r["score"]) d["score"] = float(r["score"]) * img_weight d["match_type"] = "image" d["content"] = "" d["section_type"] = "image" merged.append(d) merged.sort(key=lambda x: -x["score"]) return merged[:limit] async 
async def search_precedent_library_hybrid(
    query_text_embedding: list[float],
    query_image_embedding: list[float],
    limit: int = 10,
    fetch_k: int = 30,
    text_weight: float = 0.65,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
    subject_tag: str = "",
    include_halachot: bool = True,
) -> list[dict]:
    """Hybrid variant of search_precedent_library_semantic.

    Runs the text search and the per-page image search (each capped at
    ``fetch_k`` rows) and merges them into one ranking, where
    ``score = text_score * text_weight + image_score * (1 - text_weight)``.

    * A text row on the same (case_law_id, page_number) as an image hit is
      boosted by that page's image score and gets its thumbnail path.
    * Halachot have no ``page_number`` — they're boosted by the max image
      score from any page in the same case_law row.
    * Image pages with no matching text row are appended with
      ``type='image_page'``, empty ``content`` and ``section_type='image'``.

    Returns the top ``limit`` merged rows, highest score first. Every row
    carries ``text_score``, ``image_score`` and the combined ``score``.
    """
    img_weight = 1.0 - text_weight

    text_results = await search_precedent_library_semantic(
        query_text_embedding,
        practice_area=practice_area,
        court=court,
        precedent_level=precedent_level,
        appeal_subtype=appeal_subtype,
        is_binding=is_binding,
        subject_tag=subject_tag,
        limit=fetch_k,
        include_halachot=include_halachot,
    )
    img_results = await search_precedent_images_similar(
        query_image_embedding,
        limit=fetch_k,
        practice_area=practice_area,
        court=court,
        precedent_level=precedent_level,
        appeal_subtype=appeal_subtype,
        is_binding=is_binding,
    )

    # Index image hits two ways: by exact page, and by best score per case
    # (the latter is what page-less halachot are boosted with).
    img_by_page: dict[tuple, dict] = {}
    img_by_case: dict[str, float] = {}
    for r in img_results:
        cid = str(r["case_law_id"])
        img_by_page[(cid, r["page_number"])] = r
        img_by_case[cid] = max(img_by_case.get(cid, 0.0), float(r["score"]))

    seen: set = set()
    merged: list[dict] = []
    for r in text_results:
        cid = str(r["case_law_id"])
        page = r.get("page_number")
        key = (cid, page) if page is not None else None
        img_hit = img_by_page.get(key) if key else None

        if img_hit:
            image_score = float(img_hit["score"])
        elif r.get("type") == "halacha":
            image_score = img_by_case.get(cid, 0.0)
        else:
            image_score = 0.0
        text_score = float(r["score"])

        d = dict(r)
        d["text_score"] = text_score
        d["image_score"] = image_score
        d["score"] = text_score * text_weight + image_score * img_weight
        if img_hit:
            d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
        if key:
            # Remember the page so the image-only pass below skips it.
            seen.add(key)
        merged.append(d)

    # Surface image pages that no text row covered.
    for r in img_results:
        key = (str(r["case_law_id"]), r["page_number"])
        if key in seen:
            continue
        d = dict(r)
        d["text_score"] = 0.0
        d["image_score"] = float(r["score"])
        d["score"] = float(r["score"]) * img_weight
        d["type"] = "image_page"
        d["content"] = ""
        d["section_type"] = "image"
        merged.append(d)

    merged.sort(key=lambda x: -x["score"])
    return merged[:limit]


# ── Missing precedents (V13) ───────────────────────────────────────
# Track citations from party briefs that aren't yet in the corpus.
# Lifecycle: 'open' → researcher logs gap → chair uploads decision
# → status='uploaded' (file ingested) → status='closed' (linked to
# case_law row). 'irrelevant' = chair decided the citation isn't worth
# adding to the library.

ALLOWED_MP_PARTIES = {
    "appellant",
    "respondent",
    "committee",
    "permit_applicant",
    "unknown",
}
ALLOWED_MP_STATUS = {"open", "uploaded", "closed", "irrelevant"}


def _row_to_missing_precedent(row: asyncpg.Record) -> dict:
    """Convert a missing_precedents row to a dict with string identifiers.

    ``id`` is always stringified; the three optional reference columns are
    stringified only when present and not None.
    """
    d = dict(row)
    d["id"] = str(d["id"])
    for ref_key in (
        "cited_in_case_id",
        "cited_in_document_id",
        "linked_case_law_id",
    ):
        if d.get(ref_key) is not None:
            d[ref_key] = str(d[ref_key])
    return d


async def create_missing_precedent(
    citation: str,
    case_name: str | None = None,
    cited_in_case_id: UUID | None = None,
    cited_in_document_id: UUID | None = None,
    cited_by_party: str | None = None,
    cited_by_party_name: str | None = None,
    legal_topic: str | None = None,
    legal_issue: str | None = None,
    claim_quote: str | None = None,
    notes: str | None = None,
) -> dict:
    """Create a new missing-precedent row (status='open' by default).

    The citation is stored stripped of surrounding whitespace.

    Raises:
        ValueError: if ``citation`` is blank, or ``cited_by_party`` is given
            and is not one of ``ALLOWED_MP_PARTIES``.
    """
    if not citation.strip():
        raise ValueError("citation is required")
    if cited_by_party and cited_by_party not in ALLOWED_MP_PARTIES:
        raise ValueError(
            f"cited_by_party must be one of {sorted(ALLOWED_MP_PARTIES)}"
        )
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO missing_precedents (
                   citation, case_name, cited_in_case_id, cited_in_document_id,
                   cited_by_party, cited_by_party_name, legal_topic,
                   legal_issue, claim_quote, notes
               ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
               RETURNING *""",
            citation.strip(),
            case_name,
            cited_in_case_id,
            cited_in_document_id,
            cited_by_party,
            cited_by_party_name,
            legal_topic,
            legal_issue,
            claim_quote,
            notes,
        )
    return _row_to_missing_precedent(row)


async def list_missing_precedents(
    status: str | None = None,
    case_id: UUID | None = None,
    legal_topic: str | None = None,
    limit: int = 200,
    offset: int = 0,
) -> list[dict]:
    """List missing precedents, joining the cited-in case_number for display.

    Optional filters are ANDed together: exact ``status``, exact cited-in
    ``case_id``, and a case-insensitive substring match on ``legal_topic``.
    Rows are ordered open → uploaded → closed → irrelevant, newest first
    within each status, then paginated with ``limit``/``offset``.
    """
    pool = await get_pool()
    conditions: list[str] = []
    params: list = []
    idx = 1  # next free positional placeholder
    if status:
        conditions.append(f"mp.status = ${idx}")
        params.append(status)
        idx += 1
    if case_id:
        conditions.append(f"mp.cited_in_case_id = ${idx}")
        params.append(case_id)
        idx += 1
    if legal_topic:
        conditions.append(f"mp.legal_topic ILIKE ${idx}")
        params.append(f"%{legal_topic}%")
        idx += 1
    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    # LIMIT and OFFSET take the two placeholders after the filters.
    params.append(limit)
    params.append(offset)
    sql = f"""
        SELECT mp.*,
               c.case_number AS cited_in_case_number,
               cl.case_number AS linked_case_law_number,
               cl.case_name AS linked_case_law_name
        FROM missing_precedents mp
        LEFT JOIN cases c ON c.id = mp.cited_in_case_id
        LEFT JOIN case_law cl ON cl.id = mp.linked_case_law_id
        {where}
        ORDER BY
            CASE mp.status
                WHEN 'open' THEN 0
                WHEN 'uploaded' THEN 1
                WHEN 'closed' THEN 2
                WHEN 'irrelevant' THEN 3
            END,
            mp.created_at DESC
        LIMIT ${idx} OFFSET ${idx + 1}
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)
    return [_row_to_missing_precedent(r) for r in rows]


async def get_missing_precedent(mp_id: UUID) -> dict | None:
    """Fetch one missing-precedent row with its joined display columns.

    Returns None when no row has the given id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            SELECT mp.*,
                   c.case_number AS cited_in_case_number,
                   cl.case_number AS linked_case_law_number,
                   cl.case_name AS linked_case_law_name
            FROM missing_precedents mp
            LEFT JOIN cases c ON c.id = mp.cited_in_case_id
            LEFT JOIN case_law cl ON cl.id = mp.linked_case_law_id
            WHERE mp.id = $1
            """,
            mp_id,
        )
    return _row_to_missing_precedent(row) if row else None


async def update_missing_precedent(mp_id: UUID, **fields) -> dict | None:
    """Patch a missing-precedent row.

    Allowed fields: legal_topic, legal_issue, notes, cited_by_party,
    cited_by_party_name, case_name, status, linked_case_law_id, closed_at,
    claim_quote, citation. Any other keyword is silently ignored; when no
    allowed field remains the row is returned unchanged.

    Returns the updated row, or None when no row has the given id.

    Raises:
        ValueError: if ``status`` is not in ``ALLOWED_MP_STATUS`` or a
            non-empty ``cited_by_party`` is not in ``ALLOWED_MP_PARTIES``.
    """
    if not fields:
        return await get_missing_precedent(mp_id)
    allowed = {
        "legal_topic",
        "legal_issue",
        "notes",
        "cited_by_party",
        "cited_by_party_name",
        "case_name",
        "status",
        "linked_case_law_id",
        "closed_at",
        "claim_quote",
        "citation",
    }
    clean = {k: v for k, v in fields.items() if k in allowed}
    if not clean:
        return await get_missing_precedent(mp_id)
    if "status" in clean and clean["status"] not in ALLOWED_MP_STATUS:
        raise ValueError(
            f"status must be one of {sorted(ALLOWED_MP_STATUS)}"
        )
    if (
        "cited_by_party" in clean
        and clean["cited_by_party"]
        and clean["cited_by_party"] not in ALLOWED_MP_PARTIES
    ):
        raise ValueError(
            f"cited_by_party must be one of {sorted(ALLOWED_MP_PARTIES)}"
        )

    # Column names come only from the whitelist above, so interpolating them
    # is safe; values always travel as bind parameters ($1 is the row id).
    set_clauses = []
    values = []
    for i, (key, val) in enumerate(clean.items(), start=2):
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    set_clauses.append("updated_at = now()")
    sql = (
        f"UPDATE missing_precedents SET {', '.join(set_clauses)} "
        f"WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(sql, mp_id, *values)
    return _row_to_missing_precedent(row) if row else None


async def close_missing_precedent(
    mp_id: UUID,
    linked_case_law_id: UUID | None = None,
    notes: str | None = None,
    status: str = "closed",
) -> dict | None:
    """Mark a missing-precedent row as closed (or 'uploaded'/'irrelevant')
    and link it to a case_law row if provided.

    ``closed_at`` and ``updated_at`` are always set to now();
    ``linked_case_law_id`` and ``notes`` are written only when not None.
    Returns the updated row, or None when no row has the given id.

    Raises:
        ValueError: if ``status`` is not in ``ALLOWED_MP_STATUS``.
    """
    if status not in ALLOWED_MP_STATUS:
        raise ValueError(
            f"status must be one of {sorted(ALLOWED_MP_STATUS)}"
        )
    sets = ["status = $2", "closed_at = now()", "updated_at = now()"]
    params: list = [mp_id, status]
    idx = 3  # $1 = id, $2 = status
    if linked_case_law_id is not None:
        sets.append(f"linked_case_law_id = ${idx}")
        params.append(linked_case_law_id)
        idx += 1
    if notes is not None:
        sets.append(f"notes = ${idx}")
        params.append(notes)
        idx += 1
    sql = (
        f"UPDATE missing_precedents SET {', '.join(sets)} "
        f"WHERE id = $1 RETURNING *"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(sql, *params)
    return _row_to_missing_precedent(row) if row else None


async def find_missing_precedent_by_citation(
    citation: str,
    case_id: UUID | None = None,
) -> dict | None:
    """Look up an existing row by citation string (exact match) and optionally
    cited-in case_id. Used to deduplicate auto-creation by the researcher.

    The citation is stripped before comparison. Returns None on no match.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        if case_id is not None:
            row = await conn.fetchrow(
                "SELECT * FROM missing_precedents "
                "WHERE citation = $1 AND cited_in_case_id = $2 LIMIT 1",
                citation.strip(),
                case_id,
            )
        else:
            row = await conn.fetchrow(
                "SELECT * FROM missing_precedents WHERE citation = $1 LIMIT 1",
                citation.strip(),
            )
    return _row_to_missing_precedent(row) if row else None


# ── X13 — Court Verdict Fetch jobs ───────────────────────────────────────
# CRUD for the auto-fetch queue (docs/spec/X13-court-fetch.md). Status is
# always explicit; failures are recorded, never swallowed (INV-CF2). Upsert
# is keyed on the canonical case number (INV-CF5).


def _row_to_court_fetch_job(row) -> dict | None:
    """Convert a court_fetch_jobs row to a plain dict; None stays None."""
    return dict(row) if row is not None else None


async def court_fetch_job_upsert(
    case_number_norm: str,
    citation_raw: str = "",
    tier: str = "",
    court: str = "",
    digest_id: UUID | None = None,
) -> dict:
    """Idempotent create-or-get of a fetch job by canonical case number.

    Re-requesting the same case number returns the existing row (with
    ``_existing`` set to True) rather than creating a duplicate — the
    canonical number is a UNIQUE key. A job that already reached a terminal
    state is returned as-is so callers can decide whether to retry. A newly
    inserted row is returned with ``_existing`` set to False.

    Raises:
        ValueError: if ``case_number_norm`` is empty or blank.
        RuntimeError: if the insert conflicted but no row for this case
            number could be read back afterwards.
    """
    if not (case_number_norm or "").strip():
        raise ValueError("case_number_norm is required")
    select_sql = "SELECT * FROM court_fetch_jobs WHERE case_number_norm = $1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(select_sql, case_number_norm)
        if existing is None:
            # A concurrent caller may insert the same case number between the
            # SELECT above and this INSERT. ON CONFLICT DO NOTHING turns that
            # unique violation into "no row returned" instead of an exception.
            inserted = await conn.fetchrow(
                """INSERT INTO court_fetch_jobs
                       (case_number_norm, citation_raw, tier, court, digest_id)
                   VALUES ($1, $2, $3, $4, $5)
                   ON CONFLICT DO NOTHING
                   RETURNING *""",
                case_number_norm,
                citation_raw,
                tier,
                court,
                digest_id,
            )
            if inserted is not None:
                out = _row_to_court_fetch_job(inserted)
                out["_existing"] = False
                return out
            # Lost the race: the other caller's row is committed by now.
            existing = await conn.fetchrow(select_sql, case_number_norm)
            if existing is None:
                raise RuntimeError(
                    "court_fetch_jobs insert conflicted but no row was found "
                    f"for case_number_norm={case_number_norm!r}"
                )
        out = _row_to_court_fetch_job(existing)
        out["_existing"] = True
        return out


async def court_fetch_job_update(
    job_id: UUID,
    *,
    status: str | None = None,
    error: str | None = None,
    case_law_id: UUID | None = None,
    source_url: str | None = None,
    bump_attempts: bool = False,
) -> dict | None:
    """Patch a job row. Only provided fields change; ``updated_at`` always does.

    ``bump_attempts`` increments the ``attempts`` counter by one.
    Returns the updated row, or None when no job has the given id.
    """
    sets = ["updated_at = now()"]
    args: list = []
    # Each provided value is appended first so that len(args) is its
    # 1-based placeholder number.
    if status is not None:
        args.append(status)
        sets.append(f"status = ${len(args)}")
    if error is not None:
        args.append(error)
        sets.append(f"error = ${len(args)}")
    if case_law_id is not None:
        args.append(case_law_id)
        sets.append(f"case_law_id = ${len(args)}")
    if source_url is not None:
        args.append(source_url)
        sets.append(f"source_url = ${len(args)}")
    if bump_attempts:
        sets.append("attempts = attempts + 1")
    args.append(job_id)  # the id takes the last placeholder
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            f"UPDATE court_fetch_jobs SET {', '.join(sets)} "
            f"WHERE id = ${len(args)} RETURNING *",
            *args,
        )
    return _row_to_court_fetch_job(row)


async def court_fetch_job_get(case_number_norm: str) -> dict | None:
    """Fetch the job for a canonical case number, or None if there is none."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM court_fetch_jobs WHERE case_number_norm = $1",
            case_number_norm,
        )
    return _row_to_court_fetch_job(row) if row else None


async def court_fetch_job_list(
    status: str | None = None, limit: int = 100
) -> list[dict]:
    """List fetch jobs, newest first, optionally filtered by exact status."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        if status:
            rows = await conn.fetch(
                "SELECT * FROM court_fetch_jobs WHERE status = $1 "
                "ORDER BY created_at DESC LIMIT $2",
                status,
                limit,
            )
        else:
            rows = await conn.fetch(
                "SELECT * FROM court_fetch_jobs ORDER BY created_at DESC LIMIT $1",
                limit,
            )
    return [_row_to_court_fetch_job(r) for r in rows]