Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions

View File

@@ -0,0 +1,440 @@
"""Database service - asyncpg connection pool and queries."""
from __future__ import annotations
import json
import logging
from datetime import date
from uuid import UUID, uuid4
import asyncpg
from pgvector.asyncpg import register_vector
from legal_mcp import config
logger = logging.getLogger(__name__)
_pool: asyncpg.Pool | None = None
async def get_pool() -> asyncpg.Pool:
global _pool
if _pool is None:
# First, ensure pgvector extension exists (before registering type codec)
conn = await asyncpg.connect(config.POSTGRES_URL)
await conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')
await conn.close()
_pool = await asyncpg.create_pool(
config.POSTGRES_URL,
min_size=2,
max_size=10,
init=_init_connection,
)
return _pool
async def _init_connection(conn: asyncpg.Connection) -> None:
await register_vector(conn)
async def close_pool() -> None:
global _pool
if _pool:
await _pool.close()
_pool = None
# ── Schema ──────────────────────────────────────────────────────────
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS cases (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_number TEXT UNIQUE NOT NULL,
title TEXT NOT NULL,
appellants JSONB DEFAULT '[]',
respondents JSONB DEFAULT '[]',
subject TEXT DEFAULT '',
property_address TEXT DEFAULT '',
permit_number TEXT DEFAULT '',
committee_type TEXT DEFAULT 'ועדה מקומית',
status TEXT DEFAULT 'new',
hearing_date DATE,
decision_date DATE,
tags JSONB DEFAULT '[]',
notes TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS documents (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
doc_type TEXT NOT NULL,
title TEXT NOT NULL,
file_path TEXT NOT NULL,
extracted_text TEXT DEFAULT '',
extraction_status TEXT DEFAULT 'pending',
page_count INTEGER,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS document_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_type TEXT DEFAULT 'other',
embedding vector(1024),
page_number INTEGER,
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_corpus (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
decision_number TEXT,
decision_date DATE,
subject_categories JSONB DEFAULT '[]',
full_text TEXT NOT NULL,
summary TEXT DEFAULT '',
outcome TEXT DEFAULT '',
key_principles JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_patterns (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
pattern_type TEXT NOT NULL,
pattern_text TEXT NOT NULL,
frequency INTEGER DEFAULT 1,
context TEXT DEFAULT '',
examples JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_chunks_embedding
ON document_chunks USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id);
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id);
CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status);
CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number);
"""
async def init_schema() -> None:
pool = await get_pool()
async with pool.acquire() as conn:
await conn.execute(SCHEMA_SQL)
logger.info("Database schema initialized")
# ── Case CRUD ───────────────────────────────────────────────────────
async def create_case(
case_number: str,
title: str,
appellants: list[str] | None = None,
respondents: list[str] | None = None,
subject: str = "",
property_address: str = "",
permit_number: str = "",
committee_type: str = "ועדה מקומית",
hearing_date: date | None = None,
notes: str = "",
) -> dict:
pool = await get_pool()
case_id = uuid4()
async with pool.acquire() as conn:
await conn.execute(
"""INSERT INTO cases (id, case_number, title, appellants, respondents,
subject, property_address, permit_number, committee_type,
hearing_date, notes)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""",
case_id, case_number, title,
json.dumps(appellants or []),
json.dumps(respondents or []),
subject, property_address, permit_number, committee_type,
hearing_date, notes,
)
return await get_case(case_id)
async def get_case(case_id: UUID) -> dict | None:
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id)
if row is None:
return None
return _row_to_case(row)
async def get_case_by_number(case_number: str) -> dict | None:
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow(
"SELECT * FROM cases WHERE case_number = $1", case_number
)
if row is None:
return None
return _row_to_case(row)
async def list_cases(status: str | None = None, limit: int = 50) -> list[dict]:
pool = await get_pool()
async with pool.acquire() as conn:
if status:
rows = await conn.fetch(
"SELECT * FROM cases WHERE status = $1 ORDER BY updated_at DESC LIMIT $2",
status, limit,
)
else:
rows = await conn.fetch(
"SELECT * FROM cases ORDER BY updated_at DESC LIMIT $1", limit
)
return [_row_to_case(r) for r in rows]
async def update_case(case_id: UUID, **fields) -> dict | None:
if not fields:
return await get_case(case_id)
pool = await get_pool()
set_clauses = []
values = []
for i, (key, val) in enumerate(fields.items(), start=2):
if key in ("appellants", "respondents", "tags"):
val = json.dumps(val)
set_clauses.append(f"{key} = ${i}")
values.append(val)
set_clauses.append("updated_at = now()")
sql = f"UPDATE cases SET {', '.join(set_clauses)} WHERE id = $1"
async with pool.acquire() as conn:
await conn.execute(sql, case_id, *values)
return await get_case(case_id)
def _row_to_case(row: asyncpg.Record) -> dict:
d = dict(row)
for field in ("appellants", "respondents", "tags"):
if isinstance(d.get(field), str):
d[field] = json.loads(d[field])
d["id"] = str(d["id"])
return d
# ── Document CRUD ───────────────────────────────────────────────────
async def create_document(
case_id: UUID,
doc_type: str,
title: str,
file_path: str,
page_count: int | None = None,
) -> dict:
pool = await get_pool()
doc_id = uuid4()
async with pool.acquire() as conn:
await conn.execute(
"""INSERT INTO documents (id, case_id, doc_type, title, file_path, page_count)
VALUES ($1, $2, $3, $4, $5, $6)""",
doc_id, case_id, doc_type, title, file_path, page_count,
)
row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id)
return _row_to_doc(row)
async def update_document(doc_id: UUID, **fields) -> None:
if not fields:
return
pool = await get_pool()
set_clauses = []
values = []
for i, (key, val) in enumerate(fields.items(), start=2):
if key == "metadata":
val = json.dumps(val)
set_clauses.append(f"{key} = ${i}")
values.append(val)
sql = f"UPDATE documents SET {', '.join(set_clauses)} WHERE id = $1"
async with pool.acquire() as conn:
await conn.execute(sql, doc_id, *values)
async def get_document(doc_id: UUID) -> dict | None:
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id)
return _row_to_doc(row) if row else None
async def list_documents(case_id: UUID) -> list[dict]:
pool = await get_pool()
async with pool.acquire() as conn:
rows = await conn.fetch(
"SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at", case_id
)
return [_row_to_doc(r) for r in rows]
async def get_document_text(doc_id: UUID) -> str:
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow(
"SELECT extracted_text FROM documents WHERE id = $1", doc_id
)
return row["extracted_text"] if row else ""
def _row_to_doc(row: asyncpg.Record) -> dict:
d = dict(row)
d["id"] = str(d["id"])
d["case_id"] = str(d["case_id"])
if isinstance(d.get("metadata"), str):
d["metadata"] = json.loads(d["metadata"])
return d
# ── Chunks & Vectors ───────────────────────────────────────────────
async def store_chunks(
document_id: UUID,
case_id: UUID | None,
chunks: list[dict],
) -> int:
"""Store document chunks with embeddings. Each chunk dict has:
content, section_type, embedding (list[float]), page_number, chunk_index
"""
pool = await get_pool()
async with pool.acquire() as conn:
# Delete existing chunks for this document
await conn.execute(
"DELETE FROM document_chunks WHERE document_id = $1", document_id
)
for chunk in chunks:
await conn.execute(
"""INSERT INTO document_chunks
(document_id, case_id, chunk_index, content, section_type, embedding, page_number)
VALUES ($1, $2, $3, $4, $5, $6, $7)""",
document_id, case_id,
chunk["chunk_index"],
chunk["content"],
chunk.get("section_type", "other"),
chunk["embedding"],
chunk.get("page_number"),
)
return len(chunks)
async def search_similar(
query_embedding: list[float],
limit: int = 10,
case_id: UUID | None = None,
section_type: str | None = None,
) -> list[dict]:
"""Cosine similarity search on document chunks."""
pool = await get_pool()
conditions = []
params: list = [query_embedding, limit]
param_idx = 3
if case_id:
conditions.append(f"dc.case_id = ${param_idx}")
params.append(case_id)
param_idx += 1
if section_type:
conditions.append(f"dc.section_type = ${param_idx}")
params.append(section_type)
param_idx += 1
where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
sql = f"""
SELECT dc.content, dc.section_type, dc.page_number,
dc.document_id, dc.case_id,
d.title AS document_title,
c.case_number,
1 - (dc.embedding <=> $1) AS score
FROM document_chunks dc
JOIN documents d ON d.id = dc.document_id
JOIN cases c ON c.id = dc.case_id
{where}
ORDER BY dc.embedding <=> $1
LIMIT $2
"""
async with pool.acquire() as conn:
rows = await conn.fetch(sql, *params)
return [dict(r) for r in rows]
# ── Style corpus ────────────────────────────────────────────────────
async def add_to_style_corpus(
document_id: UUID | None,
decision_number: str,
decision_date: date | None,
subject_categories: list[str],
full_text: str,
summary: str = "",
outcome: str = "",
key_principles: list[str] | None = None,
) -> UUID:
pool = await get_pool()
corpus_id = uuid4()
async with pool.acquire() as conn:
await conn.execute(
"""INSERT INTO style_corpus
(id, document_id, decision_number, decision_date,
subject_categories, full_text, summary, outcome, key_principles)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
corpus_id, document_id, decision_number, decision_date,
json.dumps(subject_categories), full_text, summary, outcome,
json.dumps(key_principles or []),
)
return corpus_id
async def get_style_patterns(pattern_type: str | None = None) -> list[dict]:
pool = await get_pool()
async with pool.acquire() as conn:
if pattern_type:
rows = await conn.fetch(
"SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC",
pattern_type,
)
else:
rows = await conn.fetch(
"SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC"
)
return [dict(r) for r in rows]
async def upsert_style_pattern(
pattern_type: str,
pattern_text: str,
context: str = "",
examples: list[str] | None = None,
) -> None:
pool = await get_pool()
async with pool.acquire() as conn:
existing = await conn.fetchrow(
"SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2",
pattern_type, pattern_text,
)
if existing:
await conn.execute(
"UPDATE style_patterns SET frequency = frequency + 1 WHERE id = $1",
existing["id"],
)
else:
await conn.execute(
"""INSERT INTO style_patterns (pattern_type, pattern_text, context, examples)
VALUES ($1, $2, $3, $4)""",
pattern_type, pattern_text, context,
json.dumps(examples or []),
)