Merge pull request 'feat(halacha): #82.4 provenance-union על dedup-skip + #82.6 over-merge guard' (#192) from worktree-halacha-dedup-provenance-guard into main
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
G12 Leak-Guard / leak-guard (push) Successful in 5s

This commit was merged in pull request #192.
This commit is contained in:
2026-06-11 16:35:05 +00:00
3 changed files with 106 additions and 7 deletions

View File

@@ -4299,22 +4299,39 @@ async def store_halachot_for_chunk(
flags = list(h.get("quality_flags") or [])
if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
neighbor = await conn.fetchrow(
"SELECT rule_statement, (embedding <=> $2) AS dist "
"SELECT id, rule_statement, cites, (embedding <=> $2) AS dist "
"FROM halachot WHERE case_law_id = $1 "
"AND embedding IS NOT NULL "
"ORDER BY embedding <=> $2 LIMIT 1",
case_law_id, emb,
)
if neighbor is not None:
dist = float(neighbor["dist"])
if dist <= dedup_distance:
# PAIRWISE decision vs the single nearest neighbor — no
# cluster closure, so a chain A~B~C can't over-merge to one
# row (#82.6 over-merge guard). See halacha_quality.dedup_action.
action = halacha_quality.dedup_action(
float(neighbor["dist"]), h["rule_statement"],
neighbor["rule_statement"], dedup_distance, band_distance,
)
if action == "skip":
# #82.4 — merge-with-provenance, not blind drop: fold the
# incoming row's cites into the surviving neighbor (the
# only provenance present at insert; full canonical-
# selection/merge lives in the offline reconciliation
# path, #82.7 / #84.2).
new_cites = [c for c in (h.get("cites") or []) if c]
if new_cites:
await conn.execute(
"UPDATE halachot SET cites = ARRAY(SELECT DISTINCT "
"unnest(COALESCE(cites, '{}') || $2::text[])), "
"updated_at = now() WHERE id = $1",
neighbor["id"], new_cites,
)
skipped += 1
continue
# tail band: below auto-skip but lexically near → flag.
if (dist <= band_distance
and halacha_quality.FLAG_NEAR_DUPLICATE not in flags
and halacha_quality.lexical_near_duplicate(
h["rule_statement"], neighbor["rule_statement"])):
if (action == "flag"
and halacha_quality.FLAG_NEAR_DUPLICATE not in flags):
flags.append(halacha_quality.FLAG_NEAR_DUPLICATE)
confidence = float(h.get("confidence", 0.0))