feat(halachot): canonical lookup-before-insert + MCP tools (Phase 3+4) #299

Merged
chaim merged 1 commit from worktree-canonical-phase34 into main 2026-06-18 08:14:46 +00:00
4 changed files with 203 additions and 10 deletions
Showing only changes of commit 7c39c685e5 - Show all commits

View File

@@ -205,6 +205,11 @@ HALACHA_CONSOLIDATE_ENABLED = os.environ.get("HALACHA_CONSOLIDATE_ENABLED", "tru
HALACHA_CONSOLIDATE_MODEL = os.environ.get("HALACHA_CONSOLIDATE_MODEL", HALACHA_EXTRACT_MODEL) HALACHA_CONSOLIDATE_MODEL = os.environ.get("HALACHA_CONSOLIDATE_MODEL", HALACHA_EXTRACT_MODEL)
HALACHA_CONSOLIDATE_EFFORT = os.environ.get("HALACHA_CONSOLIDATE_EFFORT", "high") HALACHA_CONSOLIDATE_EFFORT = os.environ.get("HALACHA_CONSOLIDATE_EFFORT", "high")
# V41 canonical lookup-before-insert: cosine gate for reusing an existing canonical
# instead of creating a new one. 0.85 is tuned to the embedding space (1024-dim voyage).
#
# HALACHA_CANONICAL_LOOKUP_ENABLED: feature switch. Only the string "true"
# (compared case-insensitively) turns it on; any other value turns it off.
# store_halachot_for_chunk checks this flag both before looking up an existing
# canonical and before maintaining canonical_halachot after the insert.
HALACHA_CANONICAL_LOOKUP_ENABLED = os.environ.get("HALACHA_CANONICAL_LOOKUP_ENABLED", "true").lower() == "true"
# HALACHA_CANONICAL_THRESHOLD: minimum similarity for reuse. The nearest canonical
# is reused when `1 - (embedding <=> query)` is >= this value; otherwise a new
# canonical is created. A non-numeric environment value makes float() raise
# ValueError when this module is loaded.
HALACHA_CANONICAL_THRESHOLD = float(os.environ.get("HALACHA_CANONICAL_THRESHOLD", "0.85"))
# Google Cloud Vision (OCR for scanned PDFs) # Google Cloud Vision (OCR for scanned PDFs)
GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "") GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")

View File

@@ -427,18 +427,42 @@ async def halacha_review(
reasoning_summary: str = "", reasoning_summary: str = "",
subject_tags: list[str] | None = None, subject_tags: list[str] | None = None,
practice_areas: list[str] | None = None, practice_areas: list[str] | None = None,
canonical_statement: str = "",
) -> str: ) -> str:
"""אישור / דחייה / עריכה של הלכה שחולצה אוטומטית. status: pending_review / approved / rejected / published.""" """אישור / דחייה / עריכה של הלכה שחולצה אוטומטית. status: pending_review / approved / rejected / published.
canonical_statement: עריכת ניסוח העיקרון הקנוני הרחב (V41)."""
return await plib.halacha_review( return await plib.halacha_review(
halacha_id, status, reviewer, rule_statement, reasoning_summary, halacha_id, status, reviewer, rule_statement, reasoning_summary,
subject_tags, practice_areas, subject_tags, practice_areas, canonical_statement,
) )
@mcp.tool() @mcp.tool()
async def halachot_pending(limit: int = 100) -> str: async def halachot_pending(
"""תור ההלכות הממתינות לאישור.""" limit: int = 100,
return await plib.halachot_pending(_clamp_limit(limit)) include_low_quality: bool = False,
instance_type: str = "original",
) -> str:
"""תור ההלכות הממתינות לאישור. V41: ברירת-מחדל instance_type='original' (עקרונות חדשים בלבד, לא ציטוטים)."""
return await plib.halachot_pending(_clamp_limit(limit), include_low_quality, instance_type)
@mcp.tool()
async def canonical_halacha_list(
    practice_area: str = "",
    review_status: str = "",
    limit: int = 50,
    offset: int = 0,
) -> str:
    """רשימת עקרונות קנוניים (canonical_halachot). V41.
    practice_area: סינון תחום עיסוק. review_status: pending_synthesis/pending_review/approved/published."""
    # MCP entry point for listing canonical principles: forwards the four
    # arguments unchanged to plib.canonical_halacha_list and returns its string.
    # NOTE(review): the docstring above is presumably published by @mcp.tool()
    # as the tool description, so its text is deliberately left untouched.
    response = await plib.canonical_halacha_list(
        practice_area=practice_area,
        review_status=review_status,
        limit=limit,
        offset=offset,
    )
    return response
@mcp.tool()
async def canonical_halacha_get(canonical_id: str) -> str:
    """שלוף עיקרון קנוני + כל האינסטנסים שלו לפי פסיקה. V41."""
    # MCP entry point for fetching one canonical principle: hands the id as
    # received to plib.canonical_halacha_get, which does the validation.
    # NOTE(review): the docstring above is presumably published by @mcp.tool()
    # as the tool description, so its text is deliberately left untouched.
    response = await plib.canonical_halacha_get(canonical_id=canonical_id)
    return response
# Documents # Documents

View File

@@ -5320,6 +5320,24 @@ async def store_halachot_for_chunk(
and halacha_quality.FLAG_NEAR_DUPLICATE not in flags): and halacha_quality.FLAG_NEAR_DUPLICATE not in flags):
flags.append(halacha_quality.FLAG_NEAR_DUPLICATE) flags.append(halacha_quality.FLAG_NEAR_DUPLICATE)
# 3) V41 lookup-before-insert: does this principle already have a canonical?
# If yes → 'citation' instance linked to the existing canonical.
# If no → 'original' instance; a new canonical is created after INSERT.
canonical_id = None
instance_type = "original"
if emb is not None and config.HALACHA_CANONICAL_LOOKUP_ENABLED:
canon_match = await conn.fetchrow(
"SELECT id, 1 - (embedding <=> $1) AS sim "
"FROM canonical_halachot "
"WHERE embedding IS NOT NULL "
"ORDER BY embedding <=> $1 LIMIT 1",
emb,
)
if (canon_match
and float(canon_match["sim"]) >= config.HALACHA_CANONICAL_THRESHOLD):
canonical_id = canon_match["id"]
instance_type = "citation"
confidence = float(h.get("confidence", 0.0)) confidence = float(h.get("confidence", 0.0))
auto_approve = confidence >= threshold and not flags auto_approve = confidence >= threshold and not flags
review_status = "approved" if auto_approve else "pending_review" review_status = "approved" if auto_approve else "pending_review"
@@ -5334,18 +5352,50 @@ async def store_halachot_for_chunk(
reasoning_summary, supporting_quote, page_reference, reasoning_summary, supporting_quote, page_reference,
practice_areas, subject_tags, cites, confidence, practice_areas, subject_tags, cites, confidence,
quote_verified, quality_flags, embedding, review_status, quote_verified, quality_flags, embedding, review_status,
reviewer, reviewed_at) reviewer, reviewed_at, canonical_id, instance_type)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
$12, $13, $14, $15, $16, {reviewed_at_clause})""", $12, $13, $14, $15, $16, {reviewed_at_clause},
$17, $18)""",
case_law_id, base + inserted, h["rule_statement"], case_law_id, base + inserted, h["rule_statement"],
h.get("rule_type", "interpretive"), h.get("reasoning_summary", ""), h.get("rule_type", "interpretive"), h.get("reasoning_summary", ""),
h["supporting_quote"], h.get("page_reference", ""), h["supporting_quote"], h.get("page_reference", ""),
h.get("practice_areas", []), h.get("subject_tags", []), h.get("practice_areas", []), h.get("subject_tags", []),
h.get("cites", []), confidence, h.get("quote_verified", False), h.get("cites", []), confidence, h.get("quote_verified", False),
flags, h.get("embedding"), review_status, reviewer, flags, emb, review_status, reviewer,
canonical_id, instance_type,
) )
existing_quotes.add(norm_quote) existing_quotes.add(norm_quote)
inserted += 1 inserted += 1
# V41: maintain canonical_halachot after successful insert.
if config.HALACHA_CANONICAL_LOOKUP_ENABLED:
if instance_type == "original":
# New principle — create canonical and link back.
new_canon_id = await conn.fetchval(
"INSERT INTO canonical_halachot "
"(canonical_statement, rule_type, practice_areas, subject_tags, "
" embedding, first_established_in, review_status, instance_count) "
"VALUES ($1,$2,$3,$4,$5,$6,'pending_synthesis',1) RETURNING id",
h.get("rule_statement") or "",
h.get("rule_type", "interpretive"),
h.get("practice_areas") or [],
h.get("subject_tags") or [],
emb,
case_law_id,
)
await conn.execute(
"UPDATE halachot SET canonical_id=$1 "
"WHERE case_law_id=$2 AND halacha_index=$3",
new_canon_id, case_law_id, base + inserted - 1,
)
elif canonical_id is not None:
# Citation of existing canonical — bump its instance count.
await conn.execute(
"UPDATE canonical_halachot SET "
"instance_count = instance_count + 1, updated_at = now() "
"WHERE id = $1",
canonical_id,
)
await conn.execute( await conn.execute(
"UPDATE precedent_chunks SET halacha_extracted_at = now() " "UPDATE precedent_chunks SET halacha_extracted_at = now() "
"WHERE id = $1", chunk_id, "WHERE id = $1", chunk_id,
@@ -5362,6 +5412,7 @@ async def list_halachot(
case_law_id: UUID | None = None, case_law_id: UUID | None = None,
review_status: str | None = None, review_status: str | None = None,
practice_area: str | None = None, practice_area: str | None = None,
instance_type: str | None = None,
limit: int = 200, limit: int = 200,
offset: int = 0, offset: int = 0,
exclude_low_quality: bool = False, exclude_low_quality: bool = False,
@@ -5407,6 +5458,10 @@ async def list_halachot(
conditions.append(f"${idx} = ANY(h.practice_areas)") conditions.append(f"${idx} = ANY(h.practice_areas)")
params.append(practice_area) params.append(practice_area)
idx += 1 idx += 1
if instance_type:
conditions.append(f"h.instance_type = ${idx}")
params.append(instance_type)
idx += 1
if exclude_low_quality: if exclude_low_quality:
# a clean item has an empty/NULL quality_flags array # a clean item has an empty/NULL quality_flags array
conditions.append("COALESCE(array_length(h.quality_flags, 1), 0) = 0") conditions.append("COALESCE(array_length(h.quality_flags, 1), 0) = 0")
@@ -6034,6 +6089,51 @@ async def get_canonical_halacha(canonical_id: "UUID") -> "dict | None":
} }
async def list_canonical_halachot(
    practice_area: str | None = None,
    review_status: str | None = None,
    limit: int = 50,
    offset: int = 0,
) -> list[dict]:
    """List canonical principles, most-instanced first.

    Args:
        practice_area: when truthy, keep only rows whose ``practice_areas``
            array contains this value.
        review_status: when truthy, keep only rows with this exact
            ``review_status``.
        limit: maximum number of rows; negative values are treated as 0.
        offset: number of rows to skip; negative values are treated as 0.

    Returns:
        One dict per row with the keys id (as text), canonical_statement,
        rule_type, practice_areas, subject_tags, review_status,
        instance_count, created_at and updated_at.
    """
    filters: list[str] = []
    params: list = []
    # Placeholders are numbered by position in `params`, so each filter appends
    # its value first and then refers to it as $len(params).
    if practice_area:
        params.append(practice_area)
        filters.append(f"${len(params)} = ANY(practice_areas)")
    if review_status:
        params.append(review_status)
        filters.append(f"review_status = ${len(params)}")
    where_sql = f"WHERE {' AND '.join(filters)} " if filters else ""

    # PostgreSQL rejects negative LIMIT / OFFSET values with an error; clamp
    # them so a bad paging argument yields an empty or first page instead.
    params.append(max(limit, 0))
    limit_pos = len(params)
    params.append(max(offset, 0))
    offset_pos = len(params)

    pool = await get_pool()
    rows = await pool.fetch(
        "SELECT id::text, canonical_statement, rule_type, practice_areas, "
        "       subject_tags, review_status, instance_count, created_at, updated_at "
        "FROM canonical_halachot "
        f"{where_sql}"
        # The trailing id key makes the order total, so rows that tie on
        # instance_count and created_at cannot move between OFFSET pages.
        "ORDER BY instance_count DESC, created_at DESC, canonical_halachot.id "
        f"LIMIT ${limit_pos} OFFSET ${offset_pos}",
        *params,
    )
    return [dict(r) for r in rows]
async def update_canonical_statement(
    canonical_id: "UUID", canonical_statement: str,
) -> bool:
    """Overwrite the statement text of one canonical principle.

    Sets ``canonical_statement`` and refreshes ``updated_at`` on the
    ``canonical_halachot`` row whose id equals *canonical_id*.

    Returns:
        False when the last token of the status string returned by
        ``pool.execute`` is "0", True otherwise.
    """
    sql = (
        "UPDATE canonical_halachot SET canonical_statement=$2, updated_at=now() "
        "WHERE id=$1"
    )
    pool = await get_pool()
    status = await pool.execute(sql, canonical_id, canonical_statement)
    # Presumably a command tag such as "UPDATE 1", where the final token is the
    # affected-row count — TODO confirm against the driver in use.
    affected = status.split()[-1]
    return affected != "0"
async def _annotate_equivalents(pool, out: list[dict]) -> None: async def _annotate_equivalents(pool, out: list[dict]) -> None:
"""Attach an `equivalents` list to each row (#84.2) — parallel-authority links. """Attach an `equivalents` list to each row (#84.2) — parallel-authority links.

View File

@@ -320,6 +320,7 @@ async def halacha_review(
reasoning_summary: str = "", reasoning_summary: str = "",
subject_tags: list[str] | None = None, subject_tags: list[str] | None = None,
practice_areas: list[str] | None = None, practice_areas: list[str] | None = None,
canonical_statement: str = "",
) -> str: ) -> str:
"""אישור / דחייה / עריכה של הלכה שחולצה אוטומטית. """אישור / דחייה / עריכה של הלכה שחולצה אוטומטית.
@@ -331,6 +332,7 @@ async def halacha_review(
reasoning_summary: עריכת תמצית ההיגיון (ריק = ללא שינוי). reasoning_summary: עריכת תמצית ההיגיון (ריק = ללא שינוי).
subject_tags: עריכת תגיות (None = ללא שינוי). subject_tags: עריכת תגיות (None = ללא שינוי).
practice_areas: עריכת תחומים (None = ללא שינוי). practice_areas: עריכת תחומים (None = ללא שינוי).
canonical_statement: עריכת הניסוח הקנוני הרחב של העיקרון (ריק = ללא שינוי).
""" """
if status not in {"pending_review", "approved", "rejected", "published"}: if status not in {"pending_review", "approved", "rejected", "published"}:
return _err( return _err(
@@ -353,25 +355,87 @@ async def halacha_review(
) )
if row is None: if row is None:
return _err("הלכה לא נמצאה") return _err("הלכה לא נמצאה")
# V41: propagate canonical_statement edit to the canonical principle.
if canonical_statement and row.get("canonical_id"):
try:
await db.update_canonical_statement(
UUID(str(row["canonical_id"])), canonical_statement,
)
except Exception as e:
import logging
logging.getLogger(__name__).warning(
"halacha_review: failed to update canonical_statement: %s", e,
)
return _ok(row) return _ok(row)
async def halachot_pending(limit: int = 100, include_low_quality: bool = False) -> str: async def halachot_pending(
limit: int = 100,
include_low_quality: bool = False,
instance_type: str = "original",
) -> str:
"""תור ההלכות הממתינות לאישור (review_status='pending_review'). """תור ההלכות הממתינות לאישור (review_status='pending_review').
כברירת-מחדל (#84.1, #84.3) התור **מסונן** — הלכות עם דגל-איכות כלשהו כברירת-מחדל (#84.1, #84.3, V41) התור **מסונן** — הלכות עם דגל-איכות כלשהו
(application / ציטוט-לא-מאומת / קטוע / obiter / restatement דק / לא-נתמך / (application / ציטוט-לא-מאומת / קטוע / obiter / restatement דק / לא-נתמך /
near-duplicate) מוסתרות (הן שייכות ל'דורש תיקון-חילוץ', לא לתור-האישור), near-duplicate) מוסתרות (הן שייכות ל'דורש תיקון-חילוץ', לא לתור-האישור),
ו**ממוין לפי עדיפות** (טופלו-לרעה תחילה, אז הכי לא-ודאיים, אז הישנים). ו**ממוין לפי עדיפות** (טופלו-לרעה תחילה, אז הכי לא-ודאיים, אז הישנים).
V41: כברירת-מחדל מציג רק instance_type='original' (עקרונות חדשים, לא ציטוטים).
העברת instance_type='' מציגה הכל (כולל ציטוטים).
Args: Args:
limit: מספר מקסימלי. limit: מספר מקסימלי.
include_low_quality: True כדי לחשוף גם פריטים מסומני-איכות (בקט 'דורש תיקון'). include_low_quality: True כדי לחשוף גם פריטים מסומני-איכות (בקט 'דורש תיקון').
instance_type: 'original' (ברירת מחדל) / 'citation' / 'application' / '' (הכל).
""" """
rows = await db.list_halachot( rows = await db.list_halachot(
review_status="pending_review", review_status="pending_review",
instance_type=instance_type or None,
limit=limit, limit=limit,
exclude_low_quality=not include_low_quality, exclude_low_quality=not include_low_quality,
order_by_priority=True, order_by_priority=True,
) )
return _ok(rows) return _ok(rows)
async def canonical_halacha_list(
    practice_area: str = "",
    review_status: str = "",
    limit: int = 50,
    offset: int = 0,
) -> str:
    """List canonical principles (canonical_halachot) — a convenience query for the writing agents.

    Args:
        practice_area: filter by practice area (empty = all).
        review_status: pending_synthesis / pending_review / approved / published
            (empty = all).
        limit: maximum number of rows; clamped to the range 0..200.
        offset: number of rows to skip for pagination; negative values are
            treated as 0.

    Returns:
        The rows from ``db.list_canonical_halachot`` wrapped by ``_ok``.
    """
    # Cap the page size at 200 as before, and additionally floor both paging
    # values at 0 so a negative argument is not passed on to the query.
    page_size = max(0, min(limit, 200))
    start = max(0, offset)
    rows = await db.list_canonical_halachot(
        practice_area=practice_area or None,  # "" means "no filter"
        review_status=review_status or None,  # "" means "no filter"
        limit=page_size,
        offset=start,
    )
    return _ok(rows)
async def canonical_halacha_get(canonical_id: str) -> str:
    """Fetch a single canonical principle by id.

    NOTE(review): the original description says the result carries
    canonical_statement, practice_areas, subject_tags, review_status and
    instance_count plus all instances of the principle (per ruling); that
    shape comes from ``db.get_canonical_halacha`` and is not visible here.

    Args:
        canonical_id: identifier (UUID) of the canonical principle.
            Surrounding whitespace is ignored.

    Returns:
        ``_ok(row)`` when the principle exists; an ``_err(...)`` result when
        the id is not a valid UUID or no such principle is found.
    """
    try:
        # str() + strip() means None, numbers and padded strings all fail (or
        # parse) through the ValueError path instead of raising TypeError or
        # AttributeError out of the UUID constructor.
        cid = UUID(str(canonical_id).strip())
    except ValueError:
        return _err("canonical_id לא תקין")
    row = await db.get_canonical_halacha(cid)
    if row is None:
        return _err("עיקרון קנוני לא נמצא")
    return _ok(row)