"""Database service - asyncpg connection pool and queries.""" from __future__ import annotations import asyncio import json import logging from datetime import date from uuid import UUID, uuid4 import asyncpg from pgvector.asyncpg import register_vector from legal_mcp import config logger = logging.getLogger(__name__) _pool: asyncpg.Pool | None = None _schema_ready: bool = False _init_lock: asyncio.Lock = asyncio.Lock() async def get_pool() -> asyncpg.Pool: """Return the connection pool, creating it (and running schema init) lazily. The MCP server's `lifespan` no longer blocks on schema init — it's done here on first DB access. This keeps the `initialize`/`tools/list` MCP handshake immediate so Claude Code never sees a stale "No such tool". """ global _pool, _schema_ready if _pool is not None and _schema_ready: return _pool async with _init_lock: if _pool is None: # First, ensure pgvector extension exists (before registering type codec) conn = await asyncpg.connect(config.POSTGRES_URL) try: await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"') finally: await conn.close() _pool = await asyncpg.create_pool( config.POSTGRES_URL, min_size=2, max_size=10, init=_init_connection, ) if not _schema_ready: await _run_schema_migrations(_pool) _schema_ready = True return _pool async def _init_connection(conn: asyncpg.Connection) -> None: await register_vector(conn) async def close_pool() -> None: global _pool if _pool: await _pool.close() _pool = None # ── Schema ────────────────────────────────────────────────────────── SCHEMA_SQL = """ CREATE TABLE IF NOT EXISTS cases ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_number TEXT UNIQUE NOT NULL, title TEXT NOT NULL, appellants JSONB DEFAULT '[]', respondents JSONB DEFAULT '[]', subject TEXT DEFAULT '', property_address TEXT DEFAULT '', permit_number TEXT DEFAULT '', committee_type TEXT DEFAULT 'ועדה מקומית', status TEXT DEFAULT 'new', hearing_date DATE, decision_date DATE, tags JSONB DEFAULT '[]', notes TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS documents ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE CASCADE, doc_type TEXT NOT NULL, title TEXT NOT NULL, file_path TEXT NOT NULL, extracted_text TEXT DEFAULT '', extraction_status TEXT DEFAULT 'pending', page_count INTEGER, metadata JSONB DEFAULT '{}', created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS document_chunks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID REFERENCES documents(id) ON DELETE CASCADE, case_id UUID REFERENCES cases(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, content TEXT NOT NULL, section_type TEXT DEFAULT 'other', embedding vector(1024), page_number INTEGER, created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS style_corpus ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID REFERENCES documents(id) ON DELETE SET NULL, decision_number TEXT, decision_date DATE, subject_categories JSONB DEFAULT '[]', full_text TEXT NOT NULL, summary TEXT DEFAULT '', outcome TEXT DEFAULT '', key_principles JSONB DEFAULT '[]', practice_area TEXT DEFAULT 'appeals_committee', appeal_subtype TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS style_patterns ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), pattern_type TEXT NOT NULL, pattern_text TEXT NOT NULL, frequency INTEGER DEFAULT 1, context TEXT DEFAULT '', examples JSONB DEFAULT '[]', appeal_subtype TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id); CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id); CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id); CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status); CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number); """ MIGRATIONS_SQL = """ ALTER TABLE cases ADD COLUMN IF NOT EXISTS expected_outcome TEXT DEFAULT ''; CREATE TABLE IF NOT EXISTS audit_log ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), action TEXT NOT NULL, case_id UUID REFERENCES cases(id) ON DELETE SET NULL, document_id UUID REFERENCES documents(id) ON DELETE SET NULL, details JSONB DEFAULT '{}', actor TEXT DEFAULT 'system', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_audit_case ON audit_log(case_id); CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_log(action); CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_log(created_at DESC); """ # ── Phase 3: Workflow expansion ──────────────────────────────────── SCHEMA_V3_SQL = """ -- הרחבת decisions עם שדות חדשים ALTER TABLE decisions ADD COLUMN IF NOT EXISTS direction_doc JSONB DEFAULT NULL; ALTER TABLE decisions ADD COLUMN IF NOT EXISTS outcome_reasoning TEXT DEFAULT ''; -- הרחבת cases עם appeal_type (אם לא קיים) ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_type TEXT DEFAULT ''; ALTER TABLE cases ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee'; ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; -- active_draft_path = path to the DOCX that is the current source of truth -- for this case's decision text. Set to the latest טיוטה-v*.docx after export, -- or the latest עריכה-v*.docx after user upload. Used by revise_draft to know -- what file to base Track Changes revisions on. ALTER TABLE cases ADD COLUMN IF NOT EXISTS active_draft_path TEXT; -- הרחבת style_corpus עם practice_area / appeal_subtype ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee'; ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; -- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; -- טבלת qa_results CREATE TABLE IF NOT EXISTS qa_results ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, case_id UUID REFERENCES cases(id) ON DELETE CASCADE, check_name TEXT NOT NULL, passed BOOLEAN NOT NULL, severity TEXT DEFAULT 'warning', errors JSONB DEFAULT '[]', details TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_qa_results_decision ON qa_results(decision_id); CREATE INDEX IF NOT EXISTS idx_qa_results_case ON qa_results(case_id); -- טבלת decision_definitions (אם לא קיימת) CREATE TABLE IF NOT EXISTS decision_definitions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, term TEXT NOT NULL, definition TEXT NOT NULL, block_id TEXT DEFAULT 'block-he', created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_definitions_decision ON decision_definitions(decision_id); -- טבלת appeal_type_rules (אם לא קיימת) CREATE TABLE IF NOT EXISTS appeal_type_rules ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), appeal_type TEXT NOT NULL, rule_category TEXT NOT NULL, rule_key TEXT NOT NULL, rule_value JSONB NOT NULL, description TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(appeal_type, rule_category, rule_key) ); -- image_placeholders על decision_blocks ALTER TABLE decision_blocks ADD COLUMN IF NOT EXISTS image_placeholders JSONB DEFAULT '[]'; """ # ── Phase 2: Decision + Knowledge + RAG layers ──────────────────── SCHEMA_V2_SQL = """ -- ═══════════════════════════════════════════════════════════════════ -- Layer 2: Decision -- ═══════════════════════════════════════════════════════════════════ -- decisions: מטאדטה של החלטה (גרסה אחת = רשומה אחת) CREATE TABLE IF NOT EXISTS decisions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE CASCADE, version INTEGER DEFAULT 1, status TEXT DEFAULT 'draft', -- draft/review/final/published outcome TEXT DEFAULT '', -- rejected/accepted/partial outcome_summary TEXT DEFAULT '', -- תמצית תוצאה (שורה אחת) total_paragraphs INTEGER DEFAULT 0, total_words INTEGER DEFAULT 0, decision_date DATE, author TEXT DEFAULT 'דפנה תמיר', panel_members JSONB DEFAULT '[]', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now(), UNIQUE(case_id, version) ); -- decision_blocks: 12 בלוקים לפי block-schema.md CREATE TABLE IF NOT EXISTS decision_blocks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, block_id TEXT NOT NULL, -- block-alef, block-bet, ... block-yod-bet block_index INTEGER NOT NULL, -- 1-12 title TEXT DEFAULT '', -- כותרת הבלוק (ריק לבלוקים ללא כותרת) content TEXT DEFAULT '', -- תוכן מלא (markdown) word_count INTEGER DEFAULT 0, weight_percent NUMERIC(5,2) DEFAULT 0, -- משקל בפועל (%) generation_type TEXT DEFAULT '', -- template-fill/reproduction/paraphrase/... model_used TEXT DEFAULT '', -- sonnet/opus/script temperature NUMERIC(3,2) DEFAULT 0, status TEXT DEFAULT 'empty', -- empty/draft/review/final notes TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now(), UNIQUE(decision_id, block_id) ); -- decision_paragraphs: סעיפים בודדים עם מעקב ציטוטים CREATE TABLE IF NOT EXISTS decision_paragraphs ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), block_id UUID REFERENCES decision_blocks(id) ON DELETE CASCADE, paragraph_number INTEGER NOT NULL, -- מספור רציף בתוך ההחלטה content TEXT NOT NULL, word_count INTEGER DEFAULT 0, citations JSONB DEFAULT '[]', -- [{case_law_id, text, type}] cross_references JSONB DEFAULT '[]', -- הפניות לסעיפים אחרים ["סעיף 5 לעיל"] created_at TIMESTAMPTZ DEFAULT now() ); -- claims: טענות צדדים (בלוק ז) CREATE TABLE IF NOT EXISTS claims ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE CASCADE, party_role TEXT NOT NULL, -- appellant/respondent/permit_applicant/committee party_name TEXT DEFAULT '', claim_text TEXT NOT NULL, claim_index INTEGER DEFAULT 0, -- סדר הופעה source_document TEXT DEFAULT '', -- מאיזה מסמך חולצה הטענה addressed_in_paragraph INTEGER, -- באיזה סעיף בדיון נענתה created_at TIMESTAMPTZ DEFAULT now() ); -- ═══════════════════════════════════════════════════════════════════ -- Layer 3: Legal Knowledge -- ═══════════════════════════════════════════════════════════════════ -- case_law: פסיקה (תקדימים) CREATE TABLE IF NOT EXISTS case_law ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_number TEXT UNIQUE NOT NULL, -- עע"מ 3975/22 או ערר 1011-03-25 case_name TEXT NOT NULL, -- שם קצר: "ב. קרן-נכסים" court TEXT DEFAULT '', -- בג"ץ / עליון / מנהלי / ועדת ערר date DATE, subject_tags JSONB DEFAULT '[]', -- ["proprietary_claims", "parking"] summary TEXT DEFAULT '', -- תמצית 2-3 משפטים key_quote TEXT DEFAULT '', -- ציטוט מרכזי full_text TEXT DEFAULT '', -- טקסט מלא אם זמין source_url TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); -- case_law_citations: קשרים בין פסיקה להחלטות שלנו CREATE TABLE IF NOT EXISTS case_law_citations ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, decision_id UUID REFERENCES decisions(id) ON DELETE CASCADE, paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE SET NULL, citation_type TEXT DEFAULT 'support', -- support/distinguish/overrule/obiter context_text TEXT DEFAULT '', -- ההקשר שבו צוטט created_at TIMESTAMPTZ DEFAULT now() ); -- statutory_provisions: חקיקה נפוצה CREATE TABLE IF NOT EXISTS statutory_provisions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), statute_name TEXT NOT NULL, -- "חוק התכנון והבנייה" section_number TEXT NOT NULL, -- "152(א)(2)" section_title TEXT DEFAULT '', -- "זכות ערר" full_text TEXT DEFAULT '', -- נוסח הסעיף common_usage TEXT DEFAULT '', -- מתי משתמשים subject_tags JSONB DEFAULT '[]', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(statute_name, section_number) ); -- transition_phrases: ביטויי מעבר של דפנה CREATE TABLE IF NOT EXISTS transition_phrases ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), phrase TEXT UNIQUE NOT NULL, -- "ועל מנת לא לצאת בחסר" usage_context TEXT DEFAULT '', -- מתי להשתמש block_types JSONB DEFAULT '[]', -- באילו בלוקים: ["block-yod"] frequency INTEGER DEFAULT 1, -- כמה פעמים ראינו source_decision TEXT DEFAULT '', -- מאיזו החלטה created_at TIMESTAMPTZ DEFAULT now() ); -- lessons_learned: לקחים מהשוואת טיוטות לגרסאות סופיות CREATE TABLE IF NOT EXISTS lessons_learned ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), lesson_title TEXT NOT NULL, -- "Discussion = continuous essay, no sub-headers" lesson_text TEXT NOT NULL, -- תיאור מלא category TEXT DEFAULT '', -- structure/style/content/process applies_to JSONB DEFAULT '[]', -- ["block-yod", "all"] source_case TEXT DEFAULT '', -- "הכט 1180-1181" severity TEXT DEFAULT 'important', -- critical/important/nice-to-have created_at TIMESTAMPTZ DEFAULT now() ); -- ═══════════════════════════════════════════════════════════════════ -- Layer 4: Extended RAG -- ═══════════════════════════════════════════════════════════════════ -- paragraph_embeddings: embeddings של סעיפים בהחלטות CREATE TABLE IF NOT EXISTS paragraph_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), paragraph_id UUID REFERENCES decision_paragraphs(id) ON DELETE CASCADE, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); -- case_law_embeddings: embeddings של פסיקה CREATE TABLE IF NOT EXISTS case_law_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, chunk_text TEXT NOT NULL, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); -- ═══════════════════════════════════════════════════════════════════ -- Chair Feedback (הערות דפנה על טיוטות) -- ═══════════════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS chair_feedback ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID REFERENCES cases(id) ON DELETE SET NULL, block_id TEXT DEFAULT '', -- block-yod, block-vav, etc. feedback_text TEXT NOT NULL, -- ההערה של דפנה category TEXT DEFAULT 'other', -- missing_content/wrong_tone/wrong_structure/factual_error/style/other lesson_extracted TEXT DEFAULT '', -- הלקח שהופק applied_to TEXT[] DEFAULT '{}', -- לאילו קבצים/כללים הלקח יושם resolved BOOLEAN DEFAULT FALSE, -- האם הלקח יושם created_at TIMESTAMPTZ DEFAULT now() ); CREATE TABLE IF NOT EXISTS tag_company_mappings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), tag TEXT NOT NULL, -- appeal_subtype value (e.g. building_permit) tag_label TEXT NOT NULL DEFAULT '', -- Hebrew display label company_id TEXT NOT NULL, -- Paperclip company UUID company_name TEXT NOT NULL DEFAULT '', -- cached company name for display created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(tag, company_id) ); -- ═══════════════════════════════════════════════════════════════════ -- Indexes -- ═══════════════════════════════════════════════════════════════════ CREATE INDEX IF NOT EXISTS idx_decisions_case ON decisions(case_id); CREATE INDEX IF NOT EXISTS idx_decisions_status ON decisions(status); CREATE INDEX IF NOT EXISTS idx_decision_blocks_decision ON decision_blocks(decision_id); CREATE INDEX IF NOT EXISTS idx_decision_blocks_block_id ON decision_blocks(block_id); CREATE INDEX IF NOT EXISTS idx_decision_paragraphs_block ON decision_paragraphs(block_id); CREATE INDEX IF NOT EXISTS idx_claims_case ON claims(case_id); CREATE INDEX IF NOT EXISTS idx_claims_role ON claims(party_role); CREATE INDEX IF NOT EXISTS idx_case_law_subject ON case_law USING gin(subject_tags); CREATE INDEX IF NOT EXISTS idx_case_law_citations_decision ON case_law_citations(decision_id); CREATE INDEX IF NOT EXISTS idx_statutory_provisions_statute ON statutory_provisions(statute_name); CREATE INDEX IF NOT EXISTS idx_transition_phrases_block ON transition_phrases USING gin(block_types); CREATE INDEX IF NOT EXISTS idx_lessons_category ON lessons_learned(category); CREATE INDEX IF NOT EXISTS idx_paragraph_embeddings_vec ON paragraph_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); CREATE INDEX IF NOT EXISTS idx_case_law_embeddings_vec ON case_law_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); """ # ── Phase 4: Methodology alignment ────────────────────────────── SCHEMA_V4_SQL = """ -- ═══════════════════════════════════════════════════════════════════ -- V4: Methodology alignment (decision-methodology.md) -- ═══════════════════════════════════════════════════════════════════ -- claims: טיפול בטענות (bundle/skip) + סוג טענה ALTER TABLE claims ADD COLUMN IF NOT EXISTS claim_type TEXT DEFAULT 'claim'; -- claim / response / reply ALTER TABLE claims ADD COLUMN IF NOT EXISTS claim_handling TEXT DEFAULT 'address'; -- address (דיון מלא) / bundle (קיבוץ) / skip (דילוג) ALTER TABLE claims ADD COLUMN IF NOT EXISTS bundle_group TEXT DEFAULT ''; -- שם הקבוצה לקיבוץ (למשל "פגמים פרוצדורליים") ALTER TABLE claims ADD COLUMN IF NOT EXISTS handling_reason TEXT DEFAULT ''; -- נימוק לדילוג/קיבוץ (למשל "נבחנה ולא מצאנו ממש") -- cases: תקן ביקורת + קטגוריות נושא ALTER TABLE cases ADD COLUMN IF NOT EXISTS standard_of_review TEXT DEFAULT ''; -- "שיקול דעת תכנוני עצמאי" / "בחינת שומה מכרעת" / ... ALTER TABLE cases ADD COLUMN IF NOT EXISTS subject_categories JSONB DEFAULT '[]'; -- ["חניה", "קווי בניין", "גובה", "שימוש חורג", ...] -- case_law: רמת תקדים + מעמד ALTER TABLE case_law ADD COLUMN IF NOT EXISTS precedent_level TEXT DEFAULT ''; -- עליון / מנהלי / ועדת ערר ארצית / ועדת ערר מחוזית ALTER TABLE case_law ADD COLUMN IF NOT EXISTS is_binding BOOLEAN DEFAULT TRUE; -- הלכה מחייבת (true) / אמרת אגב (false) ALTER TABLE case_law ADD COLUMN IF NOT EXISTS creac_role TEXT DEFAULT ''; -- rule (הנחה עליונה) / explanation (הרחבה) / analogy (אנלוגיה) -- decisions: סדר סוגיות + תקן ביקורת ALTER TABLE decisions ADD COLUMN IF NOT EXISTS issue_order JSONB DEFAULT '[]'; -- סדר הסוגיות שנקבע ע"י המנצח: [{"title": "...", "type": "threshold/dispositive/secondary"}] ALTER TABLE decisions ADD COLUMN IF NOT EXISTS claim_handling JSONB DEFAULT '{}'; -- {"overrides": [{"claim_id": "...", "handling": "bundle", "group": "..."}]} -- indexes CREATE INDEX IF NOT EXISTS idx_claims_handling ON claims(claim_handling); CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type); CREATE INDEX IF NOT EXISTS idx_case_law_level ON case_law(precedent_level); """ # ── Phase 5: Interim draft (appraiser facts + post-hearing flag) ─── SCHEMA_V5_SQL = """ -- appraiser_facts: תכניות והיתרים שצוינו ע"י כל שמאי בנפרד. -- בשונה מ-claims (שהוא טענה משפטית), כאן מאוחסנת עובדה עניינית מתוך השומה. -- שימוש ראשי: זיהוי סתירות בין שמאים על איזו תכנית או היתר חל בנכס. CREATE TABLE IF NOT EXISTS appraiser_facts ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_id UUID NOT NULL REFERENCES cases(id) ON DELETE CASCADE, document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, appraiser_name TEXT NOT NULL, fact_type TEXT NOT NULL CHECK (fact_type IN ('plan', 'permit')), identifier TEXT NOT NULL, details JSONB NOT NULL DEFAULT '{}', page_number INTEGER, created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_appraiser_facts_case ON appraiser_facts(case_id, fact_type); CREATE INDEX IF NOT EXISTS idx_appraiser_facts_identifier ON appraiser_facts(case_id, identifier); -- V5.1: appraiser_side — which party this appraiser represents. -- Values: 'committee' (הוועדה), 'appellant' (העורר), 'deciding' (מכריע). -- Required by extract_appraiser_facts; the chair tags it via the UI before extraction. -- Set via documents.metadata.appraiser_side at upload/edit time, then propagated here -- so that conflict rendering in block-tet can label each entry with its side. ALTER TABLE appraiser_facts ADD COLUMN IF NOT EXISTS appraiser_side TEXT DEFAULT ''; CREATE INDEX IF NOT EXISTS idx_appraiser_facts_side ON appraiser_facts(case_id, appraiser_side); -- documents.metadata.is_post_hearing: flag for materials submitted after the hearing -- (השלמות טיעון, הצעות פשרה). Used by block-chet to include them in the proceedings narrative. -- documents.metadata.appraiser_side: which side the appraiser represents (see above). -- No schema change needed — uses existing JSONB metadata column. """ # ── V6: Case archiving ──────────────────────────────────────────── SCHEMA_V6_SQL = """ -- archived_at: timestamp when the case was moved to the archive screen. -- NULL = active (default). Set via POST /api/cases/{case_number}/archive. -- Cleared via POST /api/cases/{case_number}/restore. -- The /api/cases endpoint filters out archived cases by default; -- pass ?include_archived=true (or use /api/cases/archived) to see them. ALTER TABLE cases ADD COLUMN IF NOT EXISTS archived_at TIMESTAMPTZ; CREATE INDEX IF NOT EXISTS idx_cases_archived ON cases(archived_at) WHERE archived_at IS NOT NULL; """ # ── V7: External Precedent Library + halacha extraction ────────── # Chair-uploaded external court rulings and other appeals committee decisions # become an authoritative law corpus. Distinct from style_corpus (Daphna's # style) and case_precedents (chair-attached quotes scoped to a single case). SCHEMA_V7_SQL = """ -- case_law extensions: distinguish chair-uploaded full rulings from -- auto-extracted citation stubs, and track ingestion progress. ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_kind TEXT DEFAULT 'cited_only'; -- 'external_upload' (chair uploaded full ruling) | 'cited_only' (stub from -- references_extractor) | 'nevo_seed' (future: auto-fetched from Nevo). ALTER TABLE case_law ADD COLUMN IF NOT EXISTS document_id UUID REFERENCES documents(id) ON DELETE SET NULL; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS extraction_status TEXT DEFAULT 'pending'; -- 'pending' | 'processing' | 'completed' | 'failed' ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_status TEXT DEFAULT 'pending'; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT ''; -- chair-editable abstract shown in search results. ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT ''; -- 'court_ruling' | 'appeals_committee' -- practice_area is closed to the three appeals committee domains. DO $$ BEGIN ALTER TABLE case_law ADD CONSTRAINT case_law_practice_area_check CHECK (practice_area IN ('', 'rishuy_uvniya', 'betterment_levy', 'compensation_197')); EXCEPTION WHEN duplicate_object THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind); CREATE INDEX IF NOT EXISTS idx_case_law_practice ON case_law(practice_area, appeal_subtype); -- precedent_chunks: full-text chunks of an uploaded ruling, with embeddings. -- Analog of document_chunks for case_law rows where source_kind='external_upload'. CREATE TABLE IF NOT EXISTS precedent_chunks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, content TEXT NOT NULL, section_type TEXT DEFAULT 'other', -- intro | facts | legal_analysis | ruling | conclusion | other page_number INTEGER, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_case_law ON precedent_chunks(case_law_id); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_section ON precedent_chunks(case_law_id, section_type); CREATE INDEX IF NOT EXISTS idx_precedent_chunks_vec ON precedent_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); -- halachot: extracted binding rules. One halacha = one rule + verbatim quote. -- Embedded separately for rule-precision semantic match (chunks centroid is -- dominated by surrounding context). All halachot start as pending_review; -- only approved/published rows are visible to search_precedent_library. CREATE TABLE IF NOT EXISTS halachot ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, halacha_index INTEGER NOT NULL, rule_statement TEXT NOT NULL, rule_type TEXT DEFAULT 'binding', -- binding | interpretive | procedural | obiter reasoning_summary TEXT DEFAULT '', supporting_quote TEXT NOT NULL, page_reference TEXT DEFAULT '', practice_areas TEXT[] DEFAULT '{}', subject_tags TEXT[] DEFAULT '{}', cites TEXT[] DEFAULT '{}', confidence NUMERIC(3,2) DEFAULT 0.0, quote_verified BOOLEAN DEFAULT FALSE, review_status TEXT DEFAULT 'pending_review', -- pending_review | approved | rejected | published reviewer TEXT DEFAULT '', reviewed_at TIMESTAMPTZ, embedding vector(1024), created_at TIMESTAMPTZ DEFAULT now(), updated_at TIMESTAMPTZ DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id); CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status); CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas); CREATE INDEX IF NOT EXISTS idx_halachot_tags ON halachot USING gin(subject_tags); CREATE INDEX IF NOT EXISTS idx_halachot_vec ON halachot USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); """ # ── V8: Extraction request queue ───────────────────────────────── # Web UI buttons ("Sparkles" = request metadata extraction; "Refresh" = # request halacha extraction) run inside the FastAPI container, which has # no `claude` CLI. They can't run the LLM extractor directly. Instead they # stamp a request timestamp here, and the chair (or me) runs the MCP tool # `precedent_process_pending_extractions` from local Claude Code, where the # CLI is available, to drain the queue. See claude_session.py for the rule. SCHEMA_V8_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS metadata_extraction_requested_at TIMESTAMPTZ; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_requested_at TIMESTAMPTZ; CREATE INDEX IF NOT EXISTS idx_case_law_metadata_requested ON case_law(metadata_extraction_requested_at) WHERE metadata_extraction_requested_at IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_case_law_halacha_requested ON case_law(halacha_extraction_requested_at) WHERE halacha_extraction_requested_at IS NOT NULL; """ # ── V9: Multimodal page-image embeddings ───────────────────────── # voyage-multimodal-3 (1024-dim) embeds the whole page as an image: # captures table layout, scanned content, signatures, plans — content # that text-OCR loses. Ingestion is gated by config.MULTIMODAL_ENABLED; # search_*_hybrid() merge text-cosine + image-cosine when present. # image_thumbnail_path is a relative path under DATA_DIR/cases/{case}/ # thumbnails/ or DATA_DIR/precedent-library/thumbnails/ — a small JPEG # rendered at config.MULTIMODAL_THUMB_DPI for UI preview, distinct from # the higher-DPI render fed to the embedder (which is not persisted). SCHEMA_V9_SQL = """ CREATE TABLE IF NOT EXISTS document_image_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID REFERENCES documents(id) ON DELETE CASCADE, case_id UUID REFERENCES cases(id) ON DELETE CASCADE, page_number INTEGER NOT NULL, image_thumbnail_path TEXT, embedding vector(1024), model_name TEXT DEFAULT 'voyage-multimodal-3', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(document_id, page_number) ); CREATE INDEX IF NOT EXISTS idx_doc_img_emb_vec ON document_image_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); CREATE INDEX IF NOT EXISTS idx_doc_img_emb_doc ON document_image_embeddings(document_id); CREATE INDEX IF NOT EXISTS idx_doc_img_emb_case ON document_image_embeddings(case_id); CREATE TABLE IF NOT EXISTS precedent_image_embeddings ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE, page_number INTEGER NOT NULL, image_thumbnail_path TEXT, embedding vector(1024), model_name TEXT DEFAULT 'voyage-multimodal-3', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(case_law_id, page_number) ); CREATE INDEX IF NOT EXISTS idx_prec_img_emb_vec ON precedent_image_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50); CREATE INDEX IF NOT EXISTS idx_prec_img_emb_case_law ON precedent_image_embeddings(case_law_id); """ SCHEMA_V10_SQL = """ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS chair_name TEXT DEFAULT ''; ALTER TABLE case_law ADD COLUMN IF NOT EXISTS district TEXT DEFAULT ''; ALTER TABLE cases ADD COLUMN IF NOT EXISTS chair_name TEXT DEFAULT ''; CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind); CREATE INDEX IF NOT EXISTS idx_case_law_chair ON case_law(chair_name) WHERE chair_name <> ''; CREATE INDEX IF NOT EXISTS idx_case_law_district ON case_law(district) WHERE district <> ''; """ SCHEMA_V11_SQL = """ CREATE TABLE IF NOT EXISTS case_law_relations ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, related_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE, relation_type TEXT NOT NULL DEFAULT 'same_case_chain', created_at TIMESTAMPTZ DEFAULT now(), UNIQUE(case_law_id, related_id), CHECK (case_law_id <> related_id) ); CREATE INDEX IF NOT EXISTS idx_clr_a ON case_law_relations(case_law_id); CREATE INDEX IF NOT EXISTS idx_clr_b ON case_law_relations(related_id); """ async def _run_schema_migrations(pool: asyncpg.Pool) -> None: async with pool.acquire() as conn: await conn.execute(SCHEMA_SQL) await conn.execute(MIGRATIONS_SQL) await conn.execute(SCHEMA_V2_SQL) await conn.execute(SCHEMA_V3_SQL) await conn.execute(SCHEMA_V4_SQL) await conn.execute(SCHEMA_V5_SQL) await conn.execute(SCHEMA_V6_SQL) await conn.execute(SCHEMA_V7_SQL) await conn.execute(SCHEMA_V8_SQL) await conn.execute(SCHEMA_V9_SQL) await conn.execute(SCHEMA_V10_SQL) await conn.execute(SCHEMA_V11_SQL) logger.info("Database schema initialized (v1-v11)") async def init_schema() -> None: """Backward-compatible wrapper. Schema init now runs lazily inside get_pool().""" await get_pool() # ── Case CRUD ─────────────────────────────────────────────────────── async def create_case( case_number: str, title: str, appellants: list[str] | None = None, respondents: list[str] | None = None, subject: str = "", property_address: str = "", permit_number: str = "", committee_type: str = "ועדה מקומית", hearing_date: date | None = None, notes: str = "", expected_outcome: str = "", practice_area: str = "appeals_committee", appeal_subtype: str = "", ) -> dict: pool = await get_pool() case_id = uuid4() async with pool.acquire() as conn: await conn.execute( """INSERT INTO cases (id, case_number, title, appellants, respondents, subject, property_address, permit_number, committee_type, hearing_date, notes, expected_outcome, practice_area, appeal_subtype) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)""", case_id, case_number, title, json.dumps(appellants or []), json.dumps(respondents or []), subject, property_address, permit_number, committee_type, hearing_date, notes, expected_outcome, practice_area, appeal_subtype, ) return await get_case(case_id) async def get_case(case_id: UUID) -> dict | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id) if row is None: return None return _row_to_case(row) async def set_active_draft_path(case_id: UUID, path: str | None) -> None: """Update the case's active_draft_path (the DOCX that is source of truth).""" pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "UPDATE cases SET active_draft_path = $1, updated_at = now() WHERE id = $2", path, case_id, ) async def get_active_draft_path(case_id: UUID) -> str | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT active_draft_path FROM cases WHERE id = $1", case_id, ) return row["active_draft_path"] if row else None async def get_case_by_number(case_number: str) -> dict | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT * FROM cases WHERE case_number = $1", case_number ) if row is None: return None return _row_to_case(row) async def list_cases( status: str | None = None, limit: int = 50, include_archived: bool = False, archived_only: bool = False, ) -> list[dict]: pool = await get_pool() where = [] args: list = [] if status: where.append(f"status = ${len(args) + 1}") args.append(status) if archived_only: where.append("archived_at IS NOT NULL") elif not include_archived: where.append("archived_at IS NULL") where_clause = f"WHERE {' AND '.join(where)}" if where else "" args.append(limit) sql = f"SELECT * FROM cases {where_clause} ORDER BY updated_at DESC LIMIT ${len(args)}" async with pool.acquire() as conn: rows = await conn.fetch(sql, *args) return [_row_to_case(r) for r in rows] async def update_case(case_id: UUID, **fields) -> dict | None: if not fields: return await get_case(case_id) pool = await get_pool() set_clauses = [] values = [] for i, (key, val) in enumerate(fields.items(), start=2): if key in ("appellants", "respondents", "tags"): val = json.dumps(val) set_clauses.append(f"{key} = ${i}") values.append(val) set_clauses.append("updated_at = now()") sql = f"UPDATE cases SET {', '.join(set_clauses)} WHERE id = $1" async with pool.acquire() as conn: await conn.execute(sql, case_id, *values) return await get_case(case_id) def _row_to_case(row: asyncpg.Record) -> dict: d = dict(row) for field in ("appellants", "respondents", "tags"): if isinstance(d.get(field), str): d[field] = json.loads(d[field]) d["id"] = str(d["id"]) return d async def archive_case(case_id: UUID) -> dict | None: """Mark a case as archived. Returns updated row, or None if not found.""" pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "UPDATE cases SET archived_at = now(), updated_at = now() " "WHERE id = $1 RETURNING *", case_id, ) return _row_to_case(row) if row else None async def restore_case(case_id: UUID) -> dict | None: """Clear the archived_at timestamp. Returns updated row, or None if not found.""" pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "UPDATE cases SET archived_at = NULL, updated_at = now() " "WHERE id = $1 RETURNING *", case_id, ) return _row_to_case(row) if row else None async def delete_case(case_id: UUID) -> bool: """Delete a case row. Returns True if a row was actually removed. All dependent rows are removed automatically by FK constraints: • CASCADE: documents, document_chunks, claims, appraiser_facts, decisions, qa_results, case_precedents • SET NULL: audit_log.case_id, chair_feedback.case_id NOTE: this only touches the legal-ai database. The Paperclip project (issues, comments, runs) and Gitea repo for the case live in other systems and are NOT cleaned up here — call sites that need a full reset must handle those separately. """ pool = await get_pool() async with pool.acquire() as conn: result = await conn.execute("DELETE FROM cases WHERE id = $1", case_id) # asyncpg execute returns "DELETE " — extract count. return int(result.split()[-1]) > 0 # ── Document CRUD ─────────────────────────────────────────────────── async def create_document( case_id: UUID, doc_type: str, title: str, file_path: str, page_count: int | None = None, ) -> dict: pool = await get_pool() doc_id = uuid4() async with pool.acquire() as conn: await conn.execute( """INSERT INTO documents (id, case_id, doc_type, title, file_path, page_count) VALUES ($1, $2, $3, $4, $5, $6)""", doc_id, case_id, doc_type, title, file_path, page_count, ) row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id) return _row_to_doc(row) async def update_document(doc_id: UUID, **fields) -> None: if not fields: return pool = await get_pool() set_clauses = [] values = [] for i, (key, val) in enumerate(fields.items(), start=2): if key == "metadata": val = json.dumps(val) set_clauses.append(f"{key} = ${i}") values.append(val) sql = f"UPDATE documents SET {', '.join(set_clauses)} WHERE id = $1" async with pool.acquire() as conn: await conn.execute(sql, doc_id, *values) async def get_document(doc_id: UUID) -> dict | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id) return _row_to_doc(row) if row else None async def list_documents(case_id: UUID) -> list[dict]: pool = await get_pool() async with pool.acquire() as conn: rows = await conn.fetch( "SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at", case_id ) return [_row_to_doc(r) for r in rows] async def get_document_text(doc_id: UUID) -> str: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT extracted_text FROM documents WHERE id = $1", doc_id ) return row["extracted_text"] if row else "" def _row_to_doc(row: asyncpg.Record) -> dict: d = dict(row) d["id"] = str(d["id"]) d["case_id"] = str(d["case_id"]) if isinstance(d.get("metadata"), str): d["metadata"] = json.loads(d["metadata"]) return d # ── Claims ───────────────────────────────────────────────────────── async def store_claims(case_id: UUID, claims: list[dict], source_document: str = "") -> int: """Store extracted claims. Replaces existing claims from same source. Each claim dict: party_role, claim_text, claim_index, party_name (optional) """ pool = await get_pool() async with pool.acquire() as conn: if source_document: await conn.execute( "DELETE FROM claims WHERE case_id = $1 AND source_document = $2", case_id, source_document, ) for claim in claims: await conn.execute( """INSERT INTO claims (case_id, party_role, party_name, claim_text, claim_index, source_document, claim_type) VALUES ($1, $2, $3, $4, $5, $6, $7)""", case_id, claim["party_role"], claim.get("party_name", ""), claim["claim_text"], claim.get("claim_index", 0), source_document, claim.get("claim_type", "claim"), ) return len(claims) async def get_claims(case_id: UUID, party_role: str | None = None) -> list[dict]: """Get claims for a case, optionally filtered by party role.""" pool = await get_pool() async with pool.acquire() as conn: if party_role: rows = await conn.fetch( "SELECT * FROM claims WHERE case_id = $1 AND party_role = $2 ORDER BY claim_index", case_id, party_role, ) else: rows = await conn.fetch( "SELECT * FROM claims WHERE case_id = $1 ORDER BY party_role, claim_index", case_id, ) return [dict(r) for r in rows] # ── Decisions ────────────────────────────────────────────────────── async def create_decision( case_id: UUID, outcome: str = "", outcome_summary: str = "", outcome_reasoning: str = "", direction_doc: dict | None = None, ) -> dict: """Create a decision record for a case.""" pool = await get_pool() decision_id = uuid4() async with pool.acquire() as conn: # Check if a decision already exists for this case existing = await conn.fetchrow( "SELECT id, version FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1", case_id, ) version = (existing["version"] + 1) if existing else 1 await conn.execute( """INSERT INTO decisions (id, case_id, version, outcome, outcome_summary, outcome_reasoning, direction_doc) VALUES ($1, $2, $3, $4, $5, $6, $7)""", decision_id, case_id, version, outcome, outcome_summary, outcome_reasoning, json.dumps(direction_doc) if direction_doc else None, ) return await get_decision(decision_id) async def get_decision(decision_id: UUID) -> dict | None: pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow("SELECT * FROM decisions WHERE id = $1", decision_id) if not row: return None d = dict(row) d["id"] = str(d["id"]) d["case_id"] = str(d["case_id"]) if isinstance(d.get("direction_doc"), str): d["direction_doc"] = json.loads(d["direction_doc"]) if isinstance(d.get("panel_members"), str): d["panel_members"] = json.loads(d["panel_members"]) return d async def get_decision_by_case(case_id: UUID) -> dict | None: """Get the latest decision for a case.""" pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT * FROM decisions WHERE case_id = $1 ORDER BY version DESC LIMIT 1", case_id, ) if not row: return None d = dict(row) d["id"] = str(d["id"]) d["case_id"] = str(d["case_id"]) if isinstance(d.get("direction_doc"), str): d["direction_doc"] = json.loads(d["direction_doc"]) if isinstance(d.get("panel_members"), str): d["panel_members"] = json.loads(d["panel_members"]) return d async def update_decision(decision_id: UUID, **fields) -> None: if not fields: return pool = await get_pool() set_clauses = [] values = [] for i, (key, val) in enumerate(fields.items(), start=2): if key in ("direction_doc", "panel_members") and isinstance(val, (dict, list)): val = json.dumps(val) set_clauses.append(f"{key} = ${i}") values.append(val) set_clauses.append("updated_at = now()") sql = f"UPDATE decisions SET {', '.join(set_clauses)} WHERE id = $1" async with pool.acquire() as conn: await conn.execute(sql, decision_id, *values) # ── Document deletion ────────────────────────────────────────────── async def delete_document(doc_id: UUID) -> bool: """Delete a document and all its chunks. Returns True if deleted.""" pool = await get_pool() async with pool.acquire() as conn: async with conn.transaction(): await conn.execute( "DELETE FROM document_chunks WHERE document_id = $1", doc_id ) result = await conn.execute( "DELETE FROM documents WHERE id = $1", doc_id ) return int(result.split()[-1]) > 0 # ── Chunks & Vectors ─────────────────────────────────────────────── async def delete_document_chunks(document_id: UUID) -> int: """Delete all chunks for a document (used before reprocessing).""" pool = await get_pool() async with pool.acquire() as conn: result = await conn.execute( "DELETE FROM document_chunks WHERE document_id = $1", document_id ) return int(result.split()[-1]) # e.g. "DELETE 5" -> 5 async def store_chunks( document_id: UUID, case_id: UUID | None, chunks: list[dict], ) -> int: """Store document chunks with embeddings. Each chunk dict has: content, section_type, embedding (list[float]), page_number, chunk_index """ pool = await get_pool() async with pool.acquire() as conn: # Delete existing chunks for this document await conn.execute( "DELETE FROM document_chunks WHERE document_id = $1", document_id ) for chunk in chunks: await conn.execute( """INSERT INTO document_chunks (document_id, case_id, chunk_index, content, section_type, embedding, page_number) VALUES ($1, $2, $3, $4, $5, $6, $7)""", document_id, case_id, chunk["chunk_index"], chunk["content"], chunk.get("section_type", "other"), chunk["embedding"], chunk.get("page_number"), ) return len(chunks) async def search_similar( query_embedding: list[float], limit: int = 10, case_id: UUID | None = None, section_type: str | None = None, practice_area: str | None = None, appeal_subtype: str | None = None, ) -> list[dict]: """Cosine similarity search on document chunks.""" pool = await get_pool() conditions = [] params: list = [query_embedding, limit] param_idx = 3 if case_id: conditions.append(f"dc.case_id = ${param_idx}") params.append(case_id) param_idx += 1 if section_type: conditions.append(f"dc.section_type = ${param_idx}") params.append(section_type) param_idx += 1 if practice_area: conditions.append(f"c.practice_area = ${param_idx}") params.append(practice_area) param_idx += 1 if appeal_subtype: conditions.append(f"c.appeal_subtype = ${param_idx}") params.append(appeal_subtype) param_idx += 1 where = f"WHERE {' AND '.join(conditions)}" if conditions else "" sql = f""" SELECT dc.content, dc.section_type, dc.page_number, dc.document_id, dc.case_id, d.title AS document_title, c.case_number, 1 - (dc.embedding <=> $1) AS score FROM document_chunks dc JOIN documents d ON d.id = dc.document_id JOIN cases c ON c.id = dc.case_id {where} ORDER BY dc.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, *params) return [dict(r) for r in rows] # ── Style corpus ──────────────────────────────────────────────────── async def add_to_style_corpus( document_id: UUID | None, decision_number: str, decision_date: date | None, subject_categories: list[str], full_text: str, summary: str = "", outcome: str = "", key_principles: list[str] | None = None, practice_area: str = "appeals_committee", appeal_subtype: str = "", ) -> UUID: pool = await get_pool() corpus_id = uuid4() async with pool.acquire() as conn: await conn.execute( """INSERT INTO style_corpus (id, document_id, decision_number, decision_date, subject_categories, full_text, summary, outcome, key_principles, practice_area, appeal_subtype) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""", corpus_id, document_id, decision_number, decision_date, json.dumps(subject_categories), full_text, summary, outcome, json.dumps(key_principles or []), practice_area, appeal_subtype, ) return corpus_id async def delete_from_style_corpus(corpus_id: UUID) -> dict: """Remove a decision from style_corpus + related documents (cascades chunks). Also tries to delete the [קורפוס] document associated by title match, since the current training pipeline inserts style_corpus with document_id=NULL. """ pool = await get_pool() async with pool.acquire() as conn: async with conn.transaction(): row = await conn.fetchrow( "DELETE FROM style_corpus WHERE id = $1 " "RETURNING decision_number, document_id", corpus_id, ) if not row: return {"deleted": False, "reason": "not found"} docs_deleted = 0 if row["document_id"]: await conn.execute( "DELETE FROM documents WHERE id = $1", row["document_id"] ) docs_deleted = 1 else: # Best-effort: match a [קורפוס] document by the decision_number # in its title. Only for single, unambiguous matches. if row["decision_number"]: docs = await conn.fetch( "SELECT id FROM documents " "WHERE case_id IS NULL AND title LIKE $1", f"%{row['decision_number']}%", ) if len(docs) == 1: await conn.execute( "DELETE FROM documents WHERE id = $1", docs[0]["id"] ) docs_deleted = 1 return { "deleted": True, "decision_number": row["decision_number"], "docs_deleted": docs_deleted, } async def get_style_patterns(pattern_type: str | None = None) -> list[dict]: pool = await get_pool() async with pool.acquire() as conn: if pattern_type: rows = await conn.fetch( "SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC", pattern_type, ) else: rows = await conn.fetch( "SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC" ) return [dict(r) for r in rows] async def upsert_style_pattern( pattern_type: str, pattern_text: str, context: str = "", examples: list[str] | None = None, appeal_subtype: str = "", ) -> None: pool = await get_pool() async with pool.acquire() as conn: existing = await conn.fetchrow( "SELECT id, frequency FROM style_patterns " "WHERE pattern_type = $1 AND pattern_text = $2 AND appeal_subtype = $3", pattern_type, pattern_text, appeal_subtype, ) if existing: await conn.execute( "UPDATE style_patterns SET frequency = frequency + 1 WHERE id = $1", existing["id"], ) else: await conn.execute( """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples, appeal_subtype) VALUES ($1, $2, $3, $4, $5)""", pattern_type, pattern_text, context, json.dumps(examples or []), appeal_subtype, ) async def clear_style_patterns(appeal_subtype: str = "") -> None: """Delete style patterns, optionally filtered by appeal_subtype. Empty appeal_subtype = delete ALL patterns. """ pool = await get_pool() async with pool.acquire() as conn: if appeal_subtype: await conn.execute( "DELETE FROM style_patterns WHERE appeal_subtype = $1", appeal_subtype ) else: await conn.execute("DELETE FROM style_patterns") # ── Semantic Search (V2 — decision blocks & case law) ───────────── async def search_similar_paragraphs( query_embedding: list[float], limit: int = 10, block_type: str | None = None, ) -> list[dict]: """Search decision paragraphs by semantic similarity.""" pool = await get_pool() conditions = [] params: list = [query_embedding, limit] param_idx = 3 if block_type: conditions.append(f"db.block_id = ${param_idx}") params.append(block_type) param_idx += 1 where = f"WHERE {' AND '.join(conditions)}" if conditions else "" sql = f""" SELECT dp.content, dp.word_count, dp.paragraph_number, db.block_id AS block_type, db.title AS block_title, c.case_number, c.title AS case_title, d.outcome, d.author, 1 - (pe.embedding <=> $1) AS score FROM paragraph_embeddings pe JOIN decision_paragraphs dp ON dp.id = pe.paragraph_id JOIN decision_blocks db ON db.id = dp.block_id JOIN decisions d ON d.id = db.decision_id JOIN cases c ON c.id = d.case_id {where} ORDER BY pe.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, *params) return [dict(r) for r in rows] async def search_similar_case_law( query_embedding: list[float], limit: int = 5, ) -> list[dict]: """Search case law by semantic similarity.""" pool = await get_pool() sql = """ SELECT cl.case_number, cl.case_name, cl.court, cl.summary, cl.key_quote, cl.subject_tags, cle.chunk_text, 1 - (cle.embedding <=> $1) AS score FROM case_law_embeddings cle JOIN case_law cl ON cl.id = cle.case_law_id ORDER BY cle.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, query_embedding, limit) results = [] for r in rows: d = dict(r) if isinstance(d.get("subject_tags"), str): d["subject_tags"] = json.loads(d["subject_tags"]) results.append(d) return results async def search_precedents( query_embedding: list[float], limit: int = 10, ) -> list[dict]: """Combined search: paragraphs + case law, ranked by score.""" paragraphs = await search_similar_paragraphs(query_embedding, limit=limit) case_law = await search_similar_case_law(query_embedding, limit=limit) # Combine and sort by score results = [] for p in paragraphs: results.append({ "type": "decision_paragraph", "score": float(p["score"]), "case_number": p["case_number"], "case_title": p["case_title"], "block_type": p["block_type"], "content": p["content"][:500], "author": p["author"], }) for c in case_law: results.append({ "type": "case_law", "score": float(c["score"]), "case_number": c["case_number"], "case_name": c["case_name"], "court": c["court"], "content": c["summary"], }) results.sort(key=lambda x: x["score"], reverse=True) return results[:limit] # ── Case precedents (CRUD) ──────────────────────────────────────── async def create_case_precedent( case_id: UUID, quote: str, citation: str, section_id: str | None = None, chair_note: str = "", pdf_document_id: UUID | None = None, practice_area: str | None = None, ) -> dict: """Insert a new precedent attached to a case.""" pool = await get_pool() row = await pool.fetchrow( """ INSERT INTO case_precedents (case_id, section_id, quote, citation, chair_note, pdf_document_id, practice_area) VALUES ($1, $2, $3, $4, $5, $6, $7) RETURNING * """, case_id, section_id, quote, citation, chair_note, pdf_document_id, practice_area, ) return dict(row) async def list_case_precedents(case_id: UUID) -> list[dict]: """List all precedents attached to a case, ordered by section then creation time.""" pool = await get_pool() rows = await pool.fetch( """ SELECT id, case_id, section_id, quote, citation, chair_note, pdf_document_id, practice_area, created_at, updated_at FROM case_precedents WHERE case_id = $1 ORDER BY section_id NULLS LAST, created_at """, case_id, ) return [dict(r) for r in rows] async def delete_case_precedent(precedent_id: UUID) -> bool: """Delete a precedent attachment by ID. Returns True if deleted.""" pool = await get_pool() result = await pool.execute( "DELETE FROM case_precedents WHERE id = $1", precedent_id ) return result == "DELETE 1" async def search_precedent_library( query: str, practice_area: str = "", limit: int = 10, ) -> list[dict]: """Search all precedents across cases by citation or quote text.""" pool = await get_pool() pattern = f"%{query}%" if practice_area: rows = await pool.fetch( """ SELECT id, case_id, section_id, quote, citation, chair_note, practice_area, created_at FROM case_precedents WHERE (citation ILIKE $1 OR quote ILIKE $1) AND practice_area = $2 ORDER BY created_at DESC LIMIT $3 """, pattern, practice_area, limit, ) else: rows = await pool.fetch( """ SELECT id, case_id, section_id, quote, citation, chair_note, practice_area, created_at FROM case_precedents WHERE citation ILIKE $1 OR quote ILIKE $1 ORDER BY created_at DESC LIMIT $2 """, pattern, limit, ) return [dict(r) for r in rows] # ── Chair feedback ──────────────────────────────────────────────── async def record_chair_feedback( case_id: UUID | None, block_id: str, feedback_text: str, category: str = "other", lesson_extracted: str = "", ) -> UUID: """Record feedback from the chair (Dafna) on a draft block.""" pool = await get_pool() feedback_id = uuid4() async with pool.acquire() as conn: await conn.execute( """INSERT INTO chair_feedback (id, case_id, block_id, feedback_text, category, lesson_extracted) VALUES ($1, $2, $3, $4, $5, $6)""", feedback_id, case_id, block_id, feedback_text, category, lesson_extracted, ) return feedback_id async def list_chair_feedback( case_id: UUID | None = None, category: str | None = None, unresolved_only: bool = False, ) -> list[dict]: """List chair feedback, optionally filtered.""" pool = await get_pool() conditions = [] params: list = [] idx = 1 if case_id: conditions.append(f"case_id = ${idx}") params.append(case_id) idx += 1 if category: conditions.append(f"category = ${idx}") params.append(category) idx += 1 if unresolved_only: conditions.append("resolved = FALSE") where = f"WHERE {' AND '.join(conditions)}" if conditions else "" async with pool.acquire() as conn: rows = await conn.fetch( f"SELECT * FROM chair_feedback {where} ORDER BY created_at DESC", *params, ) return [dict(r) for r in rows] async def resolve_chair_feedback( feedback_id: UUID, applied_to: list[str], ) -> None: """Mark feedback as resolved and record where it was applied.""" pool = await get_pool() async with pool.acquire() as conn: await conn.execute( """UPDATE chair_feedback SET resolved = TRUE, applied_to = $2 WHERE id = $1""", feedback_id, applied_to, ) # ── Appraiser facts (V5 — for interim drafts) ───────────────────── async def replace_appraiser_facts( case_id: UUID, document_id: UUID, facts: list[dict], ) -> int: """Replace all appraiser_facts for a given document. Each fact dict: appraiser_name, appraiser_side, fact_type ('plan'|'permit'), identifier, details (dict), page_number (optional). """ pool = await get_pool() async with pool.acquire() as conn: async with conn.transaction(): await conn.execute( "DELETE FROM appraiser_facts WHERE document_id = $1", document_id, ) for f in facts: await conn.execute( """INSERT INTO appraiser_facts (case_id, document_id, appraiser_name, appraiser_side, fact_type, identifier, details, page_number) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)""", case_id, document_id, f["appraiser_name"], f.get("appraiser_side", ""), f["fact_type"], f["identifier"], json.dumps(f.get("details", {}), ensure_ascii=False), f.get("page_number"), ) return len(facts) async def list_appraiser_facts( case_id: UUID, fact_type: str | None = None, ) -> list[dict]: """List appraiser_facts for a case, optionally filtered by fact_type.""" pool = await get_pool() async with pool.acquire() as conn: if fact_type: rows = await conn.fetch( """SELECT * FROM appraiser_facts WHERE case_id = $1 AND fact_type = $2 ORDER BY identifier, appraiser_name""", case_id, fact_type, ) else: rows = await conn.fetch( """SELECT * FROM appraiser_facts WHERE case_id = $1 ORDER BY fact_type, identifier, appraiser_name""", case_id, ) results = [] for r in rows: d = dict(r) d["id"] = str(d["id"]) d["case_id"] = str(d["case_id"]) d["document_id"] = str(d["document_id"]) if isinstance(d.get("details"), str): d["details"] = json.loads(d["details"]) results.append(d) return results async def detect_appraiser_conflicts(case_id: UUID) -> list[dict]: """Detect conflicts: identifiers cited by 2+ different appraisers in this case. A conflict exists when the SAME identifier (e.g., "תמ"א 38") was reported differently by two appraisers — different details, or one cited it and the other did not. Returns list of conflict groups. Each entry in a group carries the appraiser's side so the caller can label it as committee / appellant / deciding. """ pool = await get_pool() async with pool.acquire() as conn: rows = await conn.fetch( """SELECT identifier, fact_type, json_agg(jsonb_build_object( 'appraiser_name', appraiser_name, 'appraiser_side', appraiser_side, 'details', details, 'page_number', page_number, 'document_id', document_id ) ORDER BY CASE appraiser_side WHEN 'committee' THEN 1 WHEN 'appellant' THEN 2 WHEN 'deciding' THEN 3 ELSE 4 END, appraiser_name ) AS entries, COUNT(DISTINCT appraiser_name) AS n_appraisers FROM appraiser_facts WHERE case_id = $1 GROUP BY identifier, fact_type HAVING COUNT(DISTINCT appraiser_name) > 1""", case_id, ) conflicts = [] for r in rows: entries = r["entries"] if isinstance(entries, str): entries = json.loads(entries) # Parse nested details if still strings for e in entries: if isinstance(e.get("details"), str): e["details"] = json.loads(e["details"]) conflicts.append({ "identifier": r["identifier"], "fact_type": r["fact_type"], "n_appraisers": r["n_appraisers"], "entries": entries, }) return conflicts # ── V7: External precedent library + halachot ───────────────────── def _row_to_case_law(row: asyncpg.Record) -> dict: """Normalize a case_law row, parsing subject_tags JSONB to list.""" d = dict(row) if isinstance(d.get("subject_tags"), str): try: d["subject_tags"] = json.loads(d["subject_tags"]) except (TypeError, ValueError): d["subject_tags"] = [] if d.get("date") is not None: d["date"] = d["date"].isoformat() return d async def get_case_law(case_law_id: UUID) -> dict | None: pool = await get_pool() row = await pool.fetchrow( "SELECT * FROM case_law WHERE id = $1", case_law_id, ) return _row_to_case_law(row) if row else None async def add_case_law_relation( a_id: UUID, b_id: UUID, relation_type: str = "same_case_chain" ) -> None: """Link two case_law records bidirectionally. Idempotent (ON CONFLICT DO NOTHING).""" pool = await get_pool() async with pool.acquire() as conn: await conn.executemany( """ INSERT INTO case_law_relations(case_law_id, related_id, relation_type) VALUES($1, $2, $3) ON CONFLICT (case_law_id, related_id) DO NOTHING """, [(a_id, b_id, relation_type), (b_id, a_id, relation_type)], ) async def remove_case_law_relation(a_id: UUID, b_id: UUID) -> None: """Remove a bidirectional link between two case_law records.""" pool = await get_pool() await pool.execute( """ DELETE FROM case_law_relations WHERE (case_law_id = $1 AND related_id = $2) OR (case_law_id = $2 AND related_id = $1) """, a_id, b_id, ) async def get_case_law_relations(case_law_id: UUID) -> list[dict]: """Return all case_law records linked to case_law_id, ordered by date asc.""" pool = await get_pool() rows = await pool.fetch( """ SELECT cl.*, r.relation_type FROM case_law_relations r JOIN case_law cl ON cl.id = r.related_id WHERE r.case_law_id = $1 ORDER BY cl.date ASC NULLS LAST """, case_law_id, ) results = [] for row in rows: d = dict(row) relation_type = d.pop("relation_type") normalized = _row_to_case_law(d) normalized["relation_type"] = relation_type results.append(normalized) return results async def get_case_law_by_citation(case_number: str) -> dict | None: pool = await get_pool() row = await pool.fetchrow( "SELECT * FROM case_law WHERE case_number = $1", case_number, ) return _row_to_case_law(row) if row else None async def create_external_case_law( case_number: str, case_name: str, full_text: str, court: str = "", decision_date: date | None = None, practice_area: str = "", appeal_subtype: str = "", subject_tags: list[str] | None = None, summary: str = "", headnote: str = "", key_quote: str = "", source_url: str = "", source_type: str = "", precedent_level: str = "", is_binding: bool = True, document_id: UUID | None = None, ) -> dict: """Insert a chair-uploaded external precedent into case_law. If a row with this ``case_number`` already exists with source_kind='cited_only' (auto-discovered), promote it to source_kind='external_upload' and fill in the missing fields. """ pool = await get_pool() tags_json = json.dumps(subject_tags or [], ensure_ascii=False) async with pool.acquire() as conn: existing = await conn.fetchrow( "SELECT id, source_kind FROM case_law WHERE case_number = $1", case_number, ) if existing: row = await conn.fetchrow( """ UPDATE case_law SET case_name = $2, court = COALESCE(NULLIF($3, ''), court), date = COALESCE($4, date), practice_area = $5, appeal_subtype = $6, subject_tags = $7, summary = COALESCE(NULLIF($8, ''), summary), headnote = $9, key_quote = COALESCE(NULLIF($10, ''), key_quote), full_text = $11, source_url = COALESCE(NULLIF($12, ''), source_url), source_type = $13, precedent_level = $14, is_binding = $15, document_id = COALESCE($16, document_id), source_kind = 'external_upload', extraction_status = 'processing', halacha_extraction_status = 'pending' WHERE id = $1 RETURNING * """, existing["id"], case_name, court, decision_date, practice_area, appeal_subtype, tags_json, summary, headnote, key_quote, full_text, source_url, source_type, precedent_level, is_binding, document_id, ) else: row = await conn.fetchrow( """ INSERT INTO case_law ( case_number, case_name, court, date, subject_tags, summary, key_quote, full_text, source_url, source_kind, document_id, extraction_status, halacha_extraction_status, practice_area, appeal_subtype, headnote, source_type, precedent_level, is_binding ) VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, 'external_upload', $10, 'processing', 'pending', $11, $12, $13, $14, $15, $16 ) RETURNING * """, case_number, case_name, court, decision_date, tags_json, summary, key_quote, full_text, source_url, document_id, practice_area, appeal_subtype, headnote, source_type, precedent_level, is_binding, ) return _row_to_case_law(row) async def create_internal_committee_decision( case_number: str, case_name: str, full_text: str, court: str = "", decision_date: date | None = None, chair_name: str = "", district: str = "", practice_area: str = "", appeal_subtype: str = "", subject_tags: list[str] | None = None, summary: str = "", is_binding: bool = True, document_id: UUID | None = None, ) -> dict: """Upsert an appeals-committee decision as source_kind='internal_committee'. If a row with this case_number already exists as cited_only, promotes it. Idempotent: calling again updates metadata in-place. """ pool = await get_pool() tags_json = json.dumps(subject_tags or [], ensure_ascii=False) async with pool.acquire() as conn: existing = await conn.fetchrow( "SELECT id FROM case_law WHERE case_number = $1", case_number, ) if existing: row = await conn.fetchrow( """ UPDATE case_law SET case_name = $2, court = COALESCE(NULLIF($3, ''), court), date = COALESCE($4, date), chair_name = COALESCE(NULLIF($5, ''), chair_name), district = COALESCE(NULLIF($6, ''), district), practice_area = $7, appeal_subtype = $8, subject_tags = $9, summary = COALESCE(NULLIF($10, ''), summary), full_text = $11, source_type = 'appeals_committee', source_kind = 'internal_committee', is_binding = $12, document_id = COALESCE($13, document_id), extraction_status = 'processing', halacha_extraction_status = 'pending' WHERE id = $1 RETURNING * """, existing["id"], case_name, court, decision_date, chair_name, district, practice_area, appeal_subtype, tags_json, summary, full_text, is_binding, document_id, ) else: row = await conn.fetchrow( """ INSERT INTO case_law ( case_number, case_name, court, date, chair_name, district, subject_tags, summary, full_text, source_kind, source_type, document_id, extraction_status, halacha_extraction_status, practice_area, appeal_subtype, is_binding ) VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, 'internal_committee', 'appeals_committee', $10, 'processing', 'pending', $11, $12, $13 ) RETURNING * """, case_number, case_name, court, decision_date, chair_name, district, tags_json, summary, full_text, document_id, practice_area, appeal_subtype, is_binding, ) return _row_to_case_law(row) async def update_case_law(case_law_id: UUID, **fields) -> dict | None: """Patch metadata fields on a case_law row. Allowed fields: case_name, court, date, practice_area, appeal_subtype, subject_tags, summary, headnote, key_quote, source_url, source_type, precedent_level, is_binding. """ allowed = { "case_number", "case_name", "court", "date", "practice_area", "appeal_subtype", "subject_tags", "summary", "headnote", "key_quote", "source_url", "source_type", "precedent_level", "is_binding", } updates = {k: v for k, v in fields.items() if k in allowed} if not updates: return await get_case_law(case_law_id) pool = await get_pool() set_parts = [] params: list = [case_law_id] for i, (k, v) in enumerate(updates.items(), start=2): if k == "subject_tags": v = json.dumps(v or [], ensure_ascii=False) set_parts.append(f"{k} = ${i}") params.append(v) sql = f"UPDATE case_law SET {', '.join(set_parts)} WHERE id = $1 RETURNING *" row = await pool.fetchrow(sql, *params) return _row_to_case_law(row) if row else None async def set_case_law_extraction_status(case_law_id: UUID, status: str) -> None: """Set text-extraction status. When transitioning to a terminal state ('completed'/'failed') we also NULL ``metadata_extraction_requested_at`` so the local-MCP queue (`process_pending_extractions`, which scans by ``WHERE *_requested_at IS NOT NULL``) doesn't re-pick the row forever and leave the row blocked in the UI's `isPrecedentActive` check.""" pool = await get_pool() if status in ("completed", "failed"): await pool.execute( "UPDATE case_law SET extraction_status = $2, " "metadata_extraction_requested_at = NULL WHERE id = $1", case_law_id, status, ) else: await pool.execute( "UPDATE case_law SET extraction_status = $2 WHERE id = $1", case_law_id, status, ) async def set_case_law_halacha_status(case_law_id: UUID, status: str) -> None: """Set halacha-extraction status. Mirrors ``set_case_law_extraction_status``: on terminal states we also clear ``halacha_extraction_requested_at`` so the queue and UI don't see a stale request flag.""" pool = await get_pool() if status in ("completed", "failed"): await pool.execute( "UPDATE case_law SET halacha_extraction_status = $2, " "halacha_extraction_requested_at = NULL WHERE id = $1", case_law_id, status, ) else: await pool.execute( "UPDATE case_law SET halacha_extraction_status = $2 WHERE id = $1", case_law_id, status, ) async def list_external_case_law( practice_area: str = "", court: str = "", precedent_level: str = "", source_type: str = "", search: str = "", limit: int = 100, offset: int = 0, source_kind: str = "external_upload", ) -> list[dict]: """List chair-uploaded precedents, with simple filters. source_kind="all_committees" expands to: source_kind='internal_committee' OR (source_kind='external_upload' AND source_type='appeals_committee'). """ pool = await get_pool() if source_kind == "all_committees": conditions = [ "(source_kind = 'internal_committee' OR " "(source_kind = 'external_upload' AND source_type = 'appeals_committee'))" ] else: conditions = [f"source_kind = '{source_kind}'"] params: list = [] idx = 1 if practice_area: conditions.append(f"practice_area = ${idx}") params.append(practice_area) idx += 1 if court: conditions.append(f"court ILIKE ${idx}") params.append(f"%{court}%") idx += 1 if precedent_level: conditions.append(f"precedent_level = ${idx}") params.append(precedent_level) idx += 1 if source_type: conditions.append(f"source_type = ${idx}") params.append(source_type) idx += 1 if search: conditions.append( f"(case_number ILIKE ${idx} OR case_name ILIKE ${idx} " f"OR summary ILIKE ${idx} OR headnote ILIKE ${idx})" ) params.append(f"%{search}%") idx += 1 where_sql = " AND ".join(conditions) params.extend([limit, offset]) sql = f""" SELECT id, case_number, case_name, court, date, practice_area, appeal_subtype, source_type, precedent_level, is_binding, summary, headnote, subject_tags, source_kind, chair_name, district, extraction_status, halacha_extraction_status, metadata_extraction_requested_at, halacha_extraction_requested_at, created_at, (SELECT COUNT(*) FROM halachot h WHERE h.case_law_id = case_law.id) AS halachot_count, (SELECT COUNT(*) FROM halachot h WHERE h.case_law_id = case_law.id AND h.review_status IN ('approved', 'published')) AS approved_count FROM case_law WHERE {where_sql} ORDER BY created_at DESC LIMIT ${idx} OFFSET ${idx + 1} """ rows = await pool.fetch(sql, *params) out = [] for r in rows: d = _row_to_case_law(r) # Render timestamps as ISO strings so the JSON layer stays simple for k in ("metadata_extraction_requested_at", "halacha_extraction_requested_at"): if d.get(k) is not None: d[k] = d[k].isoformat() out.append(d) return out async def delete_case_law(case_law_id: UUID) -> bool: """Delete a precedent and cascade chunks + halachot.""" pool = await get_pool() result = await pool.execute( "DELETE FROM case_law WHERE id = $1", case_law_id, ) return result == "DELETE 1" async def store_precedent_chunks( case_law_id: UUID, chunks: list[dict], ) -> int: """Replace precedent chunks for a case_law row. Each chunk dict has: chunk_index, content, section_type, page_number, embedding (list[float] or None). """ pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "DELETE FROM precedent_chunks WHERE case_law_id = $1", case_law_id, ) for c in chunks: await conn.execute( """INSERT INTO precedent_chunks (case_law_id, chunk_index, content, section_type, page_number, embedding) VALUES ($1, $2, $3, $4, $5, $6)""", case_law_id, c["chunk_index"], c["content"], c.get("section_type", "other"), c.get("page_number"), c.get("embedding"), ) return len(chunks) async def list_precedent_chunks( case_law_id: UUID, section_types: tuple[str, ...] | None = None, ) -> list[dict]: pool = await get_pool() if section_types: rows = await pool.fetch( """SELECT id, chunk_index, content, section_type, page_number FROM precedent_chunks WHERE case_law_id = $1 AND section_type = ANY($2::text[]) ORDER BY chunk_index""", case_law_id, list(section_types), ) else: rows = await pool.fetch( """SELECT id, chunk_index, content, section_type, page_number FROM precedent_chunks WHERE case_law_id = $1 ORDER BY chunk_index""", case_law_id, ) return [dict(r) for r in rows] async def delete_halachot(case_law_id: UUID) -> int: pool = await get_pool() result = await pool.execute( "DELETE FROM halachot WHERE case_law_id = $1", case_law_id, ) # result is e.g. "DELETE 5" — extract the number. try: return int(result.split()[-1]) except (ValueError, IndexError): return 0 async def store_halachot(case_law_id: UUID, halachot: list[dict]) -> int: """Bulk-insert extracted halachot. Each halacha enters with review_status determined by extractor confidence vs ``config.HALACHA_AUTO_APPROVE_THRESHOLD``: - confidence >= threshold → 'approved' (visible to search immediately) - else → 'pending_review' (chair must approve manually) The auto-approval reviewer is recorded as 'auto' for traceability. """ if not halachot: return 0 threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD pool = await get_pool() async with pool.acquire() as conn: for i, h in enumerate(halachot): confidence = float(h.get("confidence", 0.0)) auto_approve = confidence >= threshold review_status = "approved" if auto_approve else "pending_review" reviewer = ( f"auto-approved (confidence ≥ {threshold:.2f})" if auto_approve else None ) reviewed_at_clause = "now()" if auto_approve else "NULL" await conn.execute( f"""INSERT INTO halachot (case_law_id, halacha_index, rule_statement, rule_type, reasoning_summary, supporting_quote, page_reference, practice_areas, subject_tags, cites, confidence, quote_verified, embedding, review_status, reviewer, reviewed_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, {reviewed_at_clause})""", case_law_id, i, h["rule_statement"], h.get("rule_type", "binding"), h.get("reasoning_summary", ""), h["supporting_quote"], h.get("page_reference", ""), h.get("practice_areas", []), h.get("subject_tags", []), h.get("cites", []), confidence, h.get("quote_verified", False), h.get("embedding"), review_status, reviewer, ) return len(halachot) async def list_halachot( case_law_id: UUID | None = None, review_status: str | None = None, practice_area: str | None = None, limit: int = 200, offset: int = 0, ) -> list[dict]: pool = await get_pool() conditions = [] params: list = [] idx = 1 if case_law_id is not None: conditions.append(f"h.case_law_id = ${idx}") params.append(case_law_id) idx += 1 if review_status: conditions.append(f"h.review_status = ${idx}") params.append(review_status) idx += 1 if practice_area: conditions.append(f"${idx} = ANY(h.practice_areas)") params.append(practice_area) idx += 1 where_sql = f"WHERE {' AND '.join(conditions)}" if conditions else "" params.extend([limit, offset]) sql = f""" SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement, h.rule_type, h.reasoning_summary, h.supporting_quote, h.page_reference, h.practice_areas, h.subject_tags, h.cites, h.confidence, h.quote_verified, h.review_status, h.reviewer, h.reviewed_at, h.created_at, h.updated_at, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level FROM halachot h LEFT JOIN case_law cl ON cl.id = h.case_law_id {where_sql} ORDER BY h.case_law_id, h.halacha_index LIMIT ${idx} OFFSET ${idx + 1} """ rows = await pool.fetch(sql, *params) out = [] for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() out.append(d) return out async def update_halacha( halacha_id: UUID, review_status: str | None = None, reviewer: str = "", rule_statement: str | None = None, reasoning_summary: str | None = None, subject_tags: list[str] | None = None, practice_areas: list[str] | None = None, ) -> dict | None: """Update a halacha — used by the chair to approve/reject/edit.""" pool = await get_pool() set_parts: list[str] = [] params: list = [halacha_id] idx = 2 if review_status is not None: set_parts.append(f"review_status = ${idx}") params.append(review_status) idx += 1 if review_status in ("approved", "rejected", "published"): set_parts.append(f"reviewed_at = now()") set_parts.append(f"reviewer = ${idx}") params.append(reviewer) idx += 1 if rule_statement is not None: set_parts.append(f"rule_statement = ${idx}") params.append(rule_statement) idx += 1 if reasoning_summary is not None: set_parts.append(f"reasoning_summary = ${idx}") params.append(reasoning_summary) idx += 1 if subject_tags is not None: set_parts.append(f"subject_tags = ${idx}") params.append(subject_tags) idx += 1 if practice_areas is not None: set_parts.append(f"practice_areas = ${idx}") params.append(practice_areas) idx += 1 if not set_parts: return None set_parts.append("updated_at = now()") # Exclude `embedding` — it's a numpy.ndarray of np.float32 that breaks # FastAPI's jsonable_encoder downstream (PATCH /api/halachot/{id}). # Callers that need it (none today) can re-fetch with get_halacha. sql = f""" UPDATE halachot SET {', '.join(set_parts)} WHERE id = $1 RETURNING id, case_law_id, halacha_index, rule_statement, rule_type, reasoning_summary, supporting_quote, page_reference, practice_areas, subject_tags, cites, confidence, quote_verified, review_status, reviewer, reviewed_at, created_at, updated_at """ row = await pool.fetchrow(sql, *params) return dict(row) if row else None async def search_precedent_library_semantic( query_embedding: list[float], practice_area: str = "", court: str = "", precedent_level: str = "", appeal_subtype: str = "", is_binding: bool | None = None, subject_tag: str = "", limit: int = 10, include_halachot: bool = True, source_kind: str = "external_upload", district: str = "", chair_name: str = "", ) -> list[dict]: """Semantic search over precedents filtered by source_kind. source_kind='external_upload' → court rulings (default) source_kind='internal_committee' → appeals-committee decisions Returns merged halachot + chunks. Halachot are pre-distilled rules, so they get a small score boost. Only ``approved`` / ``published`` halachot are visible (per chair-review policy). Chunks are visible regardless of halacha review status. """ pool = await get_pool() halacha_filters = ["h.review_status IN ('approved', 'published')"] chunk_filters = [f"cl.source_kind = '{source_kind}'"] h_params: list = [query_embedding, limit] c_params: list = [query_embedding, limit] h_idx = 3 c_idx = 3 if practice_area: halacha_filters.append(f"${h_idx} = ANY(h.practice_areas)") h_params.append(practice_area) h_idx += 1 chunk_filters.append(f"cl.practice_area = ${c_idx}") c_params.append(practice_area) c_idx += 1 if court: halacha_filters.append(f"cl.court ILIKE ${h_idx}") h_params.append(f"%{court}%") h_idx += 1 chunk_filters.append(f"cl.court ILIKE ${c_idx}") c_params.append(f"%{court}%") c_idx += 1 if precedent_level: halacha_filters.append(f"cl.precedent_level = ${h_idx}") h_params.append(precedent_level) h_idx += 1 chunk_filters.append(f"cl.precedent_level = ${c_idx}") c_params.append(precedent_level) c_idx += 1 if appeal_subtype: halacha_filters.append(f"cl.appeal_subtype = ${h_idx}") h_params.append(appeal_subtype) h_idx += 1 chunk_filters.append(f"cl.appeal_subtype = ${c_idx}") c_params.append(appeal_subtype) c_idx += 1 if is_binding is not None: halacha_filters.append(f"cl.is_binding = ${h_idx}") h_params.append(is_binding) h_idx += 1 chunk_filters.append(f"cl.is_binding = ${c_idx}") c_params.append(is_binding) c_idx += 1 if subject_tag: halacha_filters.append(f"${h_idx} = ANY(h.subject_tags)") h_params.append(subject_tag) h_idx += 1 if district: halacha_filters.append(f"cl.district = ${h_idx}") h_params.append(district) h_idx += 1 chunk_filters.append(f"cl.district = ${c_idx}") c_params.append(district) c_idx += 1 if chair_name: halacha_filters.append(f"cl.chair_name = ${h_idx}") h_params.append(chair_name) h_idx += 1 chunk_filters.append(f"cl.chair_name = ${c_idx}") c_params.append(chair_name) c_idx += 1 halacha_sql = f""" SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement, h.reasoning_summary, h.supporting_quote, h.page_reference, h.practice_areas, h.subject_tags, h.confidence, h.rule_type, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.chair_name, cl.district, 1 - (h.embedding <=> $1) AS score FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id WHERE {' AND '.join(halacha_filters)} AND h.embedding IS NOT NULL ORDER BY h.embedding <=> $1 LIMIT $2 """ chunk_sql = f""" SELECT pc.id AS chunk_id, pc.case_law_id, pc.content, pc.section_type, pc.page_number, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.practice_area, cl.chair_name, cl.district, 1 - (pc.embedding <=> $1) AS score FROM precedent_chunks pc JOIN case_law cl ON cl.id = pc.case_law_id WHERE {' AND '.join(chunk_filters)} AND pc.embedding IS NOT NULL ORDER BY pc.embedding <=> $1 LIMIT $2 """ results: list[dict] = [] if include_halachot: rows = await pool.fetch(halacha_sql, *h_params) for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() d["score"] = float(d["score"]) + 0.05 # rule-level boost d["type"] = "halacha" results.append(d) rows = await pool.fetch(chunk_sql, *c_params) for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() d["score"] = float(d["score"]) d["type"] = "passage" results.append(d) results.sort(key=lambda x: x["score"], reverse=True) return results[:limit] async def precedent_library_stats() -> dict: """Aggregate stats for the /precedents stats tab.""" pool = await get_pool() async with pool.acquire() as conn: total = await conn.fetchval( "SELECT COUNT(*) FROM case_law" ) by_practice = await conn.fetch( """SELECT practice_area, COUNT(*) AS n FROM case_law GROUP BY practice_area ORDER BY n DESC""" ) by_level = await conn.fetch( """SELECT precedent_level, COUNT(*) AS n FROM case_law GROUP BY precedent_level ORDER BY n DESC""" ) halachot_total = await conn.fetchval( "SELECT COUNT(*) FROM halachot" ) halachot_pending = await conn.fetchval( "SELECT COUNT(*) FROM halachot WHERE review_status = 'pending_review'" ) halachot_approved = await conn.fetchval( "SELECT COUNT(*) FROM halachot WHERE review_status IN ('approved', 'published')" ) return { "precedents_total": int(total or 0), "by_practice_area": [ {"practice_area": r["practice_area"], "count": int(r["n"])} for r in by_practice ], "by_precedent_level": [ {"precedent_level": r["precedent_level"], "count": int(r["n"])} for r in by_level ], "halachot_total": int(halachot_total or 0), "halachot_pending": int(halachot_pending or 0), "halachot_approved": int(halachot_approved or 0), } # ── V8: extraction request queue helpers ───────────────────────── async def request_metadata_extraction(case_law_id: UUID) -> bool: """Stamp ``metadata_extraction_requested_at`` for the local MCP worker to pick up. Returns False if the row is missing. Originally restricted to ``source_kind='external_upload'`` (see git blame). Opened to all source kinds 2026-05-06 — internal_committee rows can also need re-extraction (e.g. corrupted subject_tags from an early ingest pipeline). The extractor itself preserves user values (``precedent_metadata_extractor.extract_and_apply`` only fills empty fields), so this is safe. """ pool = await get_pool() result = await pool.execute( "UPDATE case_law SET metadata_extraction_requested_at = now() " "WHERE id = $1", case_law_id, ) return result == "UPDATE 1" async def request_halacha_extraction(case_law_id: UUID) -> bool: """Same but for halacha extraction. See note on :func:`request_metadata_extraction` re: opening to all source kinds.""" pool = await get_pool() result = await pool.execute( "UPDATE case_law SET halacha_extraction_requested_at = now() " "WHERE id = $1", case_law_id, ) return result == "UPDATE 1" async def list_pending_extraction_requests( kind: str = "metadata", # 'metadata' | 'halacha' limit: int = 20, ) -> list[dict]: """Return rows requesting extraction, oldest request first. The MCP worker drains the queue in order: process → clear timestamp. """ col = ( "metadata_extraction_requested_at" if kind == "metadata" else "halacha_extraction_requested_at" ) pool = await get_pool() # Drop the legacy ``source_kind = 'external_upload'`` filter — without it # internal_committee rows could be stamped (we opened that gate in # request_metadata_extraction / request_halacha_extraction) but stayed # invisible to the worker forever. rows = await pool.fetch( f"""SELECT id, case_number, case_name, court, date, practice_area, is_binding, {col} AS requested_at FROM case_law WHERE {col} IS NOT NULL ORDER BY {col} ASC LIMIT $1""", limit, ) out = [] for r in rows: d = dict(r) if d.get("date") is not None: d["date"] = d["date"].isoformat() if d.get("requested_at") is not None: d["requested_at"] = d["requested_at"].isoformat() out.append(d) return out async def clear_extraction_request( case_law_id: UUID, kind: str = "metadata", ) -> None: col = ( "metadata_extraction_requested_at" if kind == "metadata" else "halacha_extraction_requested_at" ) pool = await get_pool() await pool.execute( f"UPDATE case_law SET {col} = NULL WHERE id = $1", case_law_id, ) # ── V9: Multimodal page image embeddings ───────────────────────── async def store_document_image_embeddings( document_id: UUID, case_id: UUID | None, page_records: list[dict], model_name: str = "voyage-multimodal-3", ) -> int: """Replace per-page image embeddings for a document. Each ``page_records`` entry: ``{page_number, embedding, image_thumbnail_path}``. Embeddings should already be 1024-dim lists (or None for skipped pages). """ pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "DELETE FROM document_image_embeddings WHERE document_id = $1", document_id, ) for r in page_records: await conn.execute( """INSERT INTO document_image_embeddings (document_id, case_id, page_number, embedding, image_thumbnail_path, model_name) VALUES ($1, $2, $3, $4, $5, $6)""", document_id, case_id, r["page_number"], r.get("embedding"), r.get("image_thumbnail_path"), model_name, ) return len(page_records) async def store_precedent_image_embeddings( case_law_id: UUID, page_records: list[dict], model_name: str = "voyage-multimodal-3", ) -> int: """Same pattern as store_document_image_embeddings but for precedents.""" pool = await get_pool() async with pool.acquire() as conn: await conn.execute( "DELETE FROM precedent_image_embeddings WHERE case_law_id = $1", case_law_id, ) for r in page_records: await conn.execute( """INSERT INTO precedent_image_embeddings (case_law_id, page_number, embedding, image_thumbnail_path, model_name) VALUES ($1, $2, $3, $4, $5)""", case_law_id, r["page_number"], r.get("embedding"), r.get("image_thumbnail_path"), model_name, ) return len(page_records) async def search_document_images_similar( query_embedding: list[float], limit: int = 10, case_id: UUID | None = None, practice_area: str | None = None, appeal_subtype: str | None = None, ) -> list[dict]: """Cosine search over per-page image embeddings of case documents.""" pool = await get_pool() conditions: list[str] = [] params: list = [query_embedding, limit] idx = 3 if case_id: conditions.append(f"die.case_id = ${idx}") params.append(case_id); idx += 1 if practice_area: conditions.append(f"c.practice_area = ${idx}") params.append(practice_area); idx += 1 if appeal_subtype: conditions.append(f"c.appeal_subtype = ${idx}") params.append(appeal_subtype); idx += 1 where = f"WHERE {' AND '.join(conditions)}" if conditions else "" sql = f""" SELECT die.document_id, die.case_id, die.page_number, die.image_thumbnail_path, d.title AS document_title, c.case_number, 1 - (die.embedding <=> $1) AS score FROM document_image_embeddings die JOIN documents d ON d.id = die.document_id JOIN cases c ON c.id = die.case_id {where} ORDER BY die.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, *params) return [dict(r) for r in rows] async def search_precedent_images_similar( query_embedding: list[float], limit: int = 10, practice_area: str = "", court: str = "", precedent_level: str = "", appeal_subtype: str = "", is_binding: bool | None = None, ) -> list[dict]: """Cosine search over per-page image embeddings of precedent rulings.""" pool = await get_pool() conditions: list[str] = ["cl.source_kind = 'external_upload'"] params: list = [query_embedding, limit] idx = 3 if practice_area: conditions.append(f"cl.practice_area = ${idx}") params.append(practice_area); idx += 1 if court: conditions.append(f"cl.court ILIKE ${idx}") params.append(f"%{court}%"); idx += 1 if precedent_level: conditions.append(f"cl.precedent_level = ${idx}") params.append(precedent_level); idx += 1 if appeal_subtype: conditions.append(f"cl.appeal_subtype = ${idx}") params.append(appeal_subtype); idx += 1 if is_binding is not None: conditions.append(f"cl.is_binding = ${idx}") params.append(is_binding); idx += 1 where = " AND ".join(conditions) sql = f""" SELECT pie.case_law_id, pie.page_number, pie.image_thumbnail_path, cl.case_number, cl.case_name, cl.court, cl.date AS decision_date, cl.precedent_level, cl.practice_area, 1 - (pie.embedding <=> $1) AS score FROM precedent_image_embeddings pie JOIN case_law cl ON cl.id = pie.case_law_id WHERE {where} ORDER BY pie.embedding <=> $1 LIMIT $2 """ async with pool.acquire() as conn: rows = await conn.fetch(sql, *params) out = [] for r in rows: d = dict(r) if d.get("decision_date") is not None: d["decision_date"] = d["decision_date"].isoformat() out.append(d) return out async def search_similar_hybrid( query_text_embedding: list[float], query_image_embedding: list[float], limit: int = 10, fetch_k: int = 30, text_weight: float = 0.65, case_id: UUID | None = None, section_type: str | None = None, practice_area: str | None = None, appeal_subtype: str | None = None, ) -> list[dict]: """Weighted merge of text-chunk and per-page image search. Same (document_id, page_number) → boost text chunk by image score on that page. Image-only pages with no overlapping text chunk are surfaced as ``match_type='image'`` so dense scanned content still appears in results. """ img_weight = 1.0 - text_weight text_rows = await search_similar( query_text_embedding, limit=fetch_k, case_id=case_id, section_type=section_type, practice_area=practice_area, appeal_subtype=appeal_subtype, ) img_rows = await search_document_images_similar( query_image_embedding, limit=fetch_k, case_id=case_id, practice_area=practice_area, appeal_subtype=appeal_subtype, ) img_by_page: dict[tuple, dict] = { (str(r["document_id"]), r["page_number"]): r for r in img_rows } seen: set = set() merged: list[dict] = [] for r in text_rows: page = r.get("page_number") key = (str(r["document_id"]), page) if page is not None else None img_hit = img_by_page.get(key) if key else None text_score = float(r["score"]) image_score = float(img_hit["score"]) if img_hit else 0.0 d = dict(r) d["text_score"] = text_score d["image_score"] = image_score d["score"] = text_score * text_weight + image_score * img_weight d["match_type"] = "text+image" if img_hit else "text" if img_hit: d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path") merged.append(d) if key: seen.add(key) for r in img_rows: key = (str(r["document_id"]), r["page_number"]) if key in seen: continue d = dict(r) d["text_score"] = 0.0 d["image_score"] = float(r["score"]) d["score"] = float(r["score"]) * img_weight d["match_type"] = "image" d["content"] = "" d["section_type"] = "image" merged.append(d) merged.sort(key=lambda x: -x["score"]) return merged[:limit] async def search_precedent_library_hybrid( query_text_embedding: list[float], query_image_embedding: list[float], limit: int = 10, fetch_k: int = 30, text_weight: float = 0.65, practice_area: str = "", court: str = "", precedent_level: str = "", appeal_subtype: str = "", is_binding: bool | None = None, subject_tag: str = "", include_halachot: bool = True, ) -> list[dict]: """Hybrid variant of search_precedent_library_semantic. Halachot have no ``page_number`` — they're boosted by the max image score from any page in the same case_law row. """ img_weight = 1.0 - text_weight text_results = await search_precedent_library_semantic( query_text_embedding, practice_area=practice_area, court=court, precedent_level=precedent_level, appeal_subtype=appeal_subtype, is_binding=is_binding, subject_tag=subject_tag, limit=fetch_k, include_halachot=include_halachot, ) img_results = await search_precedent_images_similar( query_image_embedding, limit=fetch_k, practice_area=practice_area, court=court, precedent_level=precedent_level, appeal_subtype=appeal_subtype, is_binding=is_binding, ) img_by_page: dict[tuple, dict] = {} img_by_case: dict[str, float] = {} for r in img_results: cid = str(r["case_law_id"]) img_by_page[(cid, r["page_number"])] = r img_by_case[cid] = max(img_by_case.get(cid, 0.0), float(r["score"])) seen: set = set() merged: list[dict] = [] for r in text_results: cid = str(r["case_law_id"]) page = r.get("page_number") key = (cid, page) if page is not None else None img_hit = img_by_page.get(key) if key else None if img_hit: image_score = float(img_hit["score"]) elif r.get("type") == "halacha": image_score = img_by_case.get(cid, 0.0) else: image_score = 0.0 text_score = float(r["score"]) d = dict(r) d["text_score"] = text_score d["image_score"] = image_score d["score"] = text_score * text_weight + image_score * img_weight if img_hit: d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path") if key: seen.add(key) merged.append(d) for r in img_results: key = (str(r["case_law_id"]), r["page_number"]) if key in seen: continue d = dict(r) d["text_score"] = 0.0 d["image_score"] = float(r["score"]) d["score"] = float(r["score"]) * img_weight d["type"] = "image_page" d["content"] = "" d["section_type"] = "image" merged.append(d) merged.sort(key=lambda x: -x["score"]) return merged[:limit]