All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 2m7s
Six-phase upgrade of /training from a read-only dashboard into a full Style Studio for managing Daphna's style corpus. - Upload Sheet on /training: file → proofread preview → commit (no more CLI-only `upload-training` skill). - Rich corpus metadata: GET /api/training/corpus returns summary, outcome, key_principles, page_count, parties (regex), legal_citation, lessons_count. PATCH endpoint for chair edits. CorpusDetailDrawer with 4 tabs (details/content/lessons/patterns) replaces the bare table row. - LLM metadata enrichment: style_metadata_extractor + MCP tools (style_corpus_enrich, style_corpus_pending_enrichment) fill summary/outcome/key_principles via claude_session (free, host-side). - Per-decision lessons: new decision_lessons table + 4 REST endpoints + LessonsTab in drawer; hermes-curator now auto-posts findings as decision_lessons(source=curator). - Curator Portrait tab: prompt rendered with link to Gitea, recent curator findings, style_analyzer training prompts, propose-change form that writes proposals to data/curator-proposals/ for manual chair review (no auto-mutation of the agent file). - Style chat tab: SSE-streamed conversations with the style agent. New host-side pm2 service (legal-chat-service, port 8770) wraps claude CLI with stream-json + --resume continuation; FastAPI proxies via host.docker.internal. Zero API cost — uses chaim's claude.ai subscription. chat_conversations + chat_messages persist history. Architecture: keeps the existing rule that claude_session only runs on the host (not the container). The new legal-chat-service is the canonical bridge between the container and the local CLI for the chat feature; everything else (upload, metadata, lessons) stays within the container's existing capabilities. Audit script (scripts/audit_training_corpus.py) included for verifying which corpus rows still need enrichment. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
197 lines
6.9 KiB
Python
Executable File
197 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""Audit the style_corpus table — list each decision with what's populated and what's missing.
|
|
|
|
Produces a JSON report at data/audit/corpus-YYYY-MM-DD.json so we can see at a glance
|
|
which corpus entries lack summary/outcome/key_principles/appeal_subtype/chunks/embeddings.
|
|
|
|
Run with the mcp-server venv (has asyncpg):
|
|
POSTGRES_URL=postgres://... ./mcp-server/.venv/bin/python scripts/audit_training_corpus.py
|
|
|
|
Without POSTGRES_URL, falls back to the per-field env vars used by web/mcp-server config.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import UTC, date, datetime
|
|
from pathlib import Path
|
|
|
|
import asyncpg
|
|
|
|
|
|
def _build_dsn() -> str:
    """Return the PostgreSQL connection URI used by the audit.

    When ``POSTGRES_URL`` is set to a non-empty value it is returned
    verbatim. Otherwise the URI is assembled from ``POSTGRES_USER``,
    ``POSTGRES_PASSWORD``, ``POSTGRES_HOST``, ``POSTGRES_PORT`` and
    ``POSTGRES_DB``, whose defaults are ``legal_ai``, an empty password,
    ``127.0.0.1``, ``5433`` and ``legal_ai`` respectively.

    Returns:
        A ``postgres://user:password@host:port/database`` string.
    """
    from urllib.parse import quote

    if url := os.environ.get("POSTGRES_URL"):
        return url

    env = os.environ.get
    # Credentials are percent-encoded because reserved URI characters such as
    # "@", ":" or "/" in a password would otherwise be read as DSN delimiters
    # and silently change the host or credentials being used.
    user = quote(env("POSTGRES_USER", "legal_ai"), safe="")
    password = quote(env("POSTGRES_PASSWORD", ""), safe="")
    host = env("POSTGRES_HOST", "127.0.0.1")
    port = env("POSTGRES_PORT", "5433")
    database = env("POSTGRES_DB", "legal_ai")
    return f"postgres://{user}:{password}@{host}:{port}/{database}"
|
|
|
|
|
|
# Runs of three or more digits; used to match decision numbers against
# document titles regardless of how either one is formatted.
_DIGIT_RUN = re.compile(r"\d{3,}")

# Gap counters reported by audit(), in the order they appear in the report.
_GAP_FIELDS = (
    "summary", "outcome", "key_principles",
    "appeal_subtype", "subject_categories",
    "chunks", "embeddings", "document_id",
)


def _as_list(value):
    """Coerce a column value that may be a JSON string or an array to a list.

    Strings are decoded as JSON; undecodable strings and empty/NULL values
    become ``[]``. A decoded value that is not an array (for example a bare
    number or string) is wrapped as a single entry, so callers can always
    take ``len()`` of the result.
    """
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            return []
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _resolve_chunk_counts(row, by_doc_id, by_digit):
    """Return ``(chunks, chunks_with_emb)`` for one style_corpus row.

    The document referenced by ``row["document_id"]`` is preferred. When the
    row has no usable reference, the first digit run of the decision number
    that also occurs in a document title decides the match. ``(0, 0)`` is
    returned when nothing matches.
    """
    doc_id = row["document_id"]
    if doc_id and doc_id in by_doc_id:
        match = by_doc_id[doc_id]
        return match["chunks"], match["chunks_with_emb"]
    for tok in _DIGIT_RUN.findall(row["decision_number"] or ""):
        if tok in by_digit:
            match = by_digit[tok]
            return match["chunks"], match["chunks_with_emb"]
    return 0, 0


def _find_gaps(row, cats, kp, chunks, chunks_with_emb):
    """Return ``(counter_key, label)`` pairs for everything *row* is missing.

    ``counter_key`` names the entry of ``gaps_total`` to increment and
    ``label`` is the text stored in the decision's ``missing`` list. The two
    differ only for embeddings, whose label carries the covered/total ratio.
    """
    gaps = [
        (name, name)
        for name, present in (
            ("summary", row["summary"]),
            ("outcome", row["outcome"]),
            ("key_principles", kp),
            ("appeal_subtype", row["appeal_subtype"]),
            ("subject_categories", cats),
        )
        if not present
    ]
    # An embedding gap is only meaningful when there are chunks to embed.
    if chunks == 0:
        gaps.append(("chunks", "chunks"))
    elif chunks_with_emb < chunks:
        gaps.append(("embeddings", f"embeddings({chunks_with_emb}/{chunks})"))
    if row["document_id"] is None:
        gaps.append(("document_id", "document_id"))
    return gaps


async def audit() -> dict:
    """Collect a per-decision completeness report for the style_corpus table.

    Connects with the DSN from ``_build_dsn()``, reads every style_corpus row
    plus chunk/embedding counts of the related documents, and closes the
    connection before any processing happens.

    Returns:
        A dict with ``generated_at`` (UTC ISO timestamp), ``total_decisions``,
        ``gaps_total`` (number of decisions missing each field) and
        ``decisions`` (one JSON-serialisable entry per row, including its
        ``missing`` list).

    Connection and query errors from asyncpg are not caught here.
    """
    conn = await asyncpg.connect(_build_dsn())
    try:
        rows = await conn.fetch(
            """
            SELECT id, decision_number, decision_date, subject_categories,
                   length(full_text) AS chars,
                   summary,
                   outcome,
                   key_principles,
                   practice_area,
                   appeal_subtype,
                   document_id,
                   created_at
            FROM style_corpus
            ORDER BY decision_date NULLS LAST, decision_number
            """
        )

        # Chunk + embedding counts for each related document — by direct FK first,
        # then by title-match for legacy rows where style_corpus.document_id is NULL.
        chunk_counts = await conn.fetch(
            """
            SELECT d.id AS doc_id, d.title,
                   count(c.id) AS chunks,
                   count(c.embedding) FILTER (WHERE c.embedding IS NOT NULL) AS chunks_with_emb
            FROM documents d
            LEFT JOIN document_chunks c ON c.document_id = d.id
            WHERE d.title LIKE '[קורפוס]%' OR d.id IN (SELECT document_id FROM style_corpus WHERE document_id IS NOT NULL)
            GROUP BY d.id, d.title
            """
        )
    finally:
        await conn.close()

    by_doc_id = {r["doc_id"]: r for r in chunk_counts}

    # Index corpus documents by every digit run in their title so they can be
    # matched against style_corpus.decision_number regardless of formatting
    # (e.g. style_corpus has "1109-25" but the title may say "ARAR-25-1109").
    # The first document seen for a given digit run wins.
    by_digit: dict[str, dict] = {}
    for r in chunk_counts:
        for tok in _DIGIT_RUN.findall(r["title"] or ""):
            by_digit.setdefault(tok, r)

    gaps_total = dict.fromkeys(_GAP_FIELDS, 0)
    decisions = []

    for row in rows:
        cats = _as_list(row["subject_categories"])
        kp = _as_list(row["key_principles"])
        chunks, chunks_with_emb = _resolve_chunk_counts(row, by_doc_id, by_digit)

        gaps = _find_gaps(row, cats, kp, chunks, chunks_with_emb)
        for counter_key, _label in gaps:
            gaps_total[counter_key] += 1

        decisions.append({
            "id": str(row["id"]),
            "decision_number": row["decision_number"] or "",
            "decision_date": row["decision_date"].isoformat() if row["decision_date"] else None,
            "chars": row["chars"],
            "subject_categories": cats,
            "practice_area": row["practice_area"] or "",
            "appeal_subtype": row["appeal_subtype"] or "",
            "summary_len": len(row["summary"] or ""),
            "outcome_len": len(row["outcome"] or ""),
            "key_principles_count": len(kp),
            "chunks": chunks,
            "chunks_with_embeddings": chunks_with_emb,
            "document_id": str(row["document_id"]) if row["document_id"] else None,
            "missing": [label for _key, label in gaps],
            "created_at": row["created_at"].isoformat() if row["created_at"] else None,
        })

    return {
        "generated_at": datetime.now(UTC).isoformat(),
        "total_decisions": len(decisions),
        "gaps_total": gaps_total,
        "decisions": decisions,
    }
|
|
|
|
|
|
async def main() -> int:
    """Run the audit, save the JSON report, and print a console summary.

    The report is written to ``data/audit/corpus-<today>.json`` under the
    parent of this script's directory; the directory is created if needed.

    Returns:
        ``0``, intended for use as the process exit status.
    """
    report = await audit()

    audit_dir = Path(__file__).resolve().parents[1] / "data" / "audit"
    audit_dir.mkdir(parents=True, exist_ok=True)
    report_path = audit_dir / f"corpus-{date.today().isoformat()}.json"
    payload = json.dumps(report, ensure_ascii=False, indent=2)
    report_path.write_text(payload, encoding="utf-8")

    # Console summary: one row per field with a bar capped at 60 cells.
    lines = [
        f"Total decisions: {report['total_decisions']}",
        "Gaps by field (count of decisions missing it):",
    ]
    for field, count in report["gaps_total"].items():
        bar = "█" * min(count, 60)
        lines.append(f" {field:25s} {count:3d} {bar}")
    lines.append(f"\nReport written to {report_path}")
    print("\n".join(lines))

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the async entry point and use its return value as the exit status.
    raise SystemExit(asyncio.run(main()))
|