feat(halacha): #82.4 provenance-union on dedup-skip + #82.6 over-merge guard
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s
חילוץ החלטת-ה-dedup ל-helper טהור ובדיק `halacha_quality.dedup_action()` (skip/flag/keep), ושני שיפורים על מסלול ה-dedup-on-insert: #82.4 — merge-with-provenance, לא blind-drop: כשמדלגים על כפילות-סמנטית (cosine≥0.93), מאחדים את ה-`cites` של השורה הנכנסת אל השכן הקנוני ששורד (במקום לאבד אותם). זהו שדה-ה-provenance היחיד שקיים בהכנסה; בחירת-קנוני + מיזוג-corroboration מלא שייכים למסלול ה-reconciliation הלא-מקוון (#82.7 / #84.2, שם לשורות כבר יש provenance מצטבר) — מתועד בקוד. #82.6 — over-merge guard: ההחלטה PAIRWISE מול שכן יחיד הקרוב ביותר, ורק השורה הנכנסת מודלגת אי-פעם (אף שורה קיימת לא ממוזגת/נמחקת). אין connected-components closure בהכנסה, לכן שרשרת A~B~C לא קורסת לשורה אחת גם כש-A,C מובחנים. מתועד ב-dedup_action + נבדק. invariants: G1 (provenance נשמר במקור, לא אובד) · G2 (לוגיקת-החלטה ב-helper יחיד בדיק, refactor משמר-התנהגות) · INV-G10 (אין auto-merge של שורות קיימות; tail→flag→סקירת-יו"ר). tests: 6 חדשות (skip/flag/keep/over-merge/boundaries) + 59 בדיקות-הלכה קיימות עוברות. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -4299,22 +4299,39 @@ async def store_halachot_for_chunk(
|
|||||||
flags = list(h.get("quality_flags") or [])
|
flags = list(h.get("quality_flags") or [])
|
||||||
if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
|
if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
|
||||||
neighbor = await conn.fetchrow(
|
neighbor = await conn.fetchrow(
|
||||||
"SELECT rule_statement, (embedding <=> $2) AS dist "
|
"SELECT id, rule_statement, cites, (embedding <=> $2) AS dist "
|
||||||
"FROM halachot WHERE case_law_id = $1 "
|
"FROM halachot WHERE case_law_id = $1 "
|
||||||
"AND embedding IS NOT NULL "
|
"AND embedding IS NOT NULL "
|
||||||
"ORDER BY embedding <=> $2 LIMIT 1",
|
"ORDER BY embedding <=> $2 LIMIT 1",
|
||||||
case_law_id, emb,
|
case_law_id, emb,
|
||||||
)
|
)
|
||||||
if neighbor is not None:
|
if neighbor is not None:
|
||||||
dist = float(neighbor["dist"])
|
# PAIRWISE decision vs the single nearest neighbor — no
|
||||||
if dist <= dedup_distance:
|
# cluster closure, so a chain A~B~C can't over-merge to one
|
||||||
|
# row (#82.6 over-merge guard). See halacha_quality.dedup_action.
|
||||||
|
action = halacha_quality.dedup_action(
|
||||||
|
float(neighbor["dist"]), h["rule_statement"],
|
||||||
|
neighbor["rule_statement"], dedup_distance, band_distance,
|
||||||
|
)
|
||||||
|
if action == "skip":
|
||||||
|
# #82.4 — merge-with-provenance, not blind drop: fold the
|
||||||
|
# incoming row's cites into the surviving neighbor (the
|
||||||
|
# only provenance present at insert; full canonical-
|
||||||
|
# selection/merge lives in the offline reconciliation
|
||||||
|
# path, #82.7 / #84.2).
|
||||||
|
new_cites = [c for c in (h.get("cites") or []) if c]
|
||||||
|
if new_cites:
|
||||||
|
await conn.execute(
|
||||||
|
"UPDATE halachot SET cites = ARRAY(SELECT DISTINCT "
|
||||||
|
"unnest(COALESCE(cites, '{}') || $2::text[])), "
|
||||||
|
"updated_at = now() WHERE id = $1",
|
||||||
|
neighbor["id"], new_cites,
|
||||||
|
)
|
||||||
skipped += 1
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
# tail band: below auto-skip but lexically near → flag.
|
# tail band: below auto-skip but lexically near → flag.
|
||||||
if (dist <= band_distance
|
if (action == "flag"
|
||||||
and halacha_quality.FLAG_NEAR_DUPLICATE not in flags
|
and halacha_quality.FLAG_NEAR_DUPLICATE not in flags):
|
||||||
and halacha_quality.lexical_near_duplicate(
|
|
||||||
h["rule_statement"], neighbor["rule_statement"])):
|
|
||||||
flags.append(halacha_quality.FLAG_NEAR_DUPLICATE)
|
flags.append(halacha_quality.FLAG_NEAR_DUPLICATE)
|
||||||
|
|
||||||
confidence = float(h.get("confidence", 0.0))
|
confidence = float(h.get("confidence", 0.0))
|
||||||
|
|||||||
@@ -244,6 +244,35 @@ def lexical_near_duplicate(
|
|||||||
or normalized_levenshtein(a, b) >= levenshtein_min)
|
or normalized_levenshtein(a, b) >= levenshtein_min)
|
||||||
|
|
||||||
|
|
||||||
|
def dedup_action(
    dist: float, rule_new: str, rule_neighbor: str,
    dedup_distance: float, band_distance: float,
) -> str:
    """Classify an incoming rule against its single nearest neighbor (#82.4 / #82.6).

    The decision is strictly pairwise: only ``dist`` and the two rule texts
    are consulted, never a cluster of rows. Because nothing here looks past
    that one neighbor, a chain A~B~C is not transitively closed into one
    group by this function (over-merge guard, #82.6).

    Args:
        dist: Cosine distance (1 - cosine similarity) between the incoming
            rule and the neighbor.
        rule_new: Text of the incoming rule.
        rule_neighbor: Text of the neighbor it is compared with.
        dedup_distance: Inclusive upper bound of the auto-skip zone.
        band_distance: Inclusive upper bound of the review ("tail") band.

    Returns:
        ``"skip"`` when ``dist <= dedup_distance`` (semantic duplicate; the
        wording is not examined). ``"flag"`` when the distance is above the
        skip zone but ``<= band_distance`` and ``lexical_near_duplicate``
        reports the texts as near-identical. ``"keep"`` in every other case.
    """
    verdict = "keep"
    if dist <= dedup_distance:
        # Close enough semantically that the caller should drop the row.
        verdict = "skip"
    elif dist <= band_distance:
        # Tail band: the lexical check is only run here, so it is never
        # evaluated for rows that are skipped or clearly distinct.
        if lexical_near_duplicate(rule_new, rule_neighbor):
            verdict = "flag"
    return verdict
|
||||||
|
|
||||||
|
|
||||||
# ── Aggregate ──
|
# ── Aggregate ──
|
||||||
|
|
||||||
FLAG_NON_DECISION = "non_decision"
|
FLAG_NON_DECISION = "non_decision"
|
||||||
|
|||||||
53
mcp-server/tests/test_halacha_dedup_action.py
Normal file
53
mcp-server/tests/test_halacha_dedup_action.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""Tests for #82.4 / #82.6 — dedup-on-insert decision + over-merge guard.
|
||||||
|
|
||||||
|
``halacha_quality.dedup_action`` is the PAIRWISE decision a fresh halacha makes
|
||||||
|
against its single nearest same-precedent neighbor: skip (semantic dup), flag
|
||||||
|
(lexical tail), or keep. It compares to exactly ONE neighbor and only ever drops
|
||||||
|
the *incoming* row, so a chain A~B~C can never collapse to one row — the
|
||||||
|
over-merge guard (#82.6). Pure/offline.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from legal_mcp.services import halacha_quality as hq
|
||||||
|
|
||||||
|
# operating point: DEDUP_COSINE=0.93 → dedup_distance=0.07 ; BAND=0.83 → 0.17
|
||||||
|
DEDUP_D = 1.0 - 0.93
|
||||||
|
BAND_D = 1.0 - 0.83
|
||||||
|
|
||||||
|
SIMILAR_A = "מיצוי הליכים הוא תנאי סף להגשת ערר לוועדה"
|
||||||
|
SIMILAR_B = "מיצוי הליכים הוא תנאי סף להגשת הערר לוועדה"
|
||||||
|
DIFFERENT = "מתחם שיקול הדעת התכנוני של הוועדה המקומית רחב"
|
||||||
|
|
||||||
|
|
||||||
|
def test_skip_below_dedup_distance():
    """Inside the auto-skip zone (dist <= DEDUP_D) the wording is irrelevant."""
    cases = (
        (0.03, DIFFERENT, SIMILAR_A),
        (0.05, SIMILAR_A, SIMILAR_B),
    )
    for dist, incoming, neighbor in cases:
        assert hq.dedup_action(dist, incoming, neighbor, DEDUP_D, BAND_D) == "skip"
|
||||||
|
|
||||||
|
|
||||||
|
def test_flag_in_lexical_tail():
    """Between DEDUP_D and BAND_D, lexically-near wording yields 'flag'."""
    verdict = hq.dedup_action(0.12, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D)
    assert verdict == "flag"
|
||||||
|
|
||||||
|
|
||||||
|
def test_keep_in_tail_when_not_lexically_similar():
    """Inside the band but with different wording, the rule is kept unflagged."""
    verdict = hq.dedup_action(0.12, DIFFERENT, SIMILAR_A, DEDUP_D, BAND_D)
    assert verdict == "keep"
|
||||||
|
|
||||||
|
|
||||||
|
def test_over_merge_guard_distinct_rule_kept():
    """Past the band every rule is kept, even one with near-identical wording.

    Exercises the over-merge guard (#82.6): the verdict depends only on the
    distance to the one neighbor passed in, so a distant rule is never
    dropped or flagged.
    """
    cases = (
        (0.30, SIMILAR_A, SIMILAR_B),
        (0.50, DIFFERENT, SIMILAR_A),
    )
    for dist, incoming, neighbor in cases:
        assert hq.dedup_action(dist, incoming, neighbor, DEDUP_D, BAND_D) == "keep"
|
||||||
|
|
||||||
|
|
||||||
|
def test_boundary_exactly_at_band_edge():
    """The band's upper bound is inclusive; anything just beyond it is kept."""
    on_edge = hq.dedup_action(BAND_D, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D)
    just_past = hq.dedup_action(BAND_D + 0.001, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D)
    # dist == band_distance still counts as the tail and, being lexically near, flags.
    assert on_edge == "flag"
    # A hair past the band is treated as a distinct rule.
    assert just_past == "keep"
|
||||||
Reference in New Issue
Block a user