From 97271689eff75c00214523df128e002a54bc64a3 Mon Sep 17 00:00:00 2001 From: Chaim Date: Thu, 11 Jun 2026 16:34:47 +0000 Subject: [PATCH] feat(halacha): #82.4 provenance-union on dedup-skip + #82.6 over-merge guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit חילוץ החלטת-ה-dedup ל-helper טהור ובדיק `halacha_quality.dedup_action()` (skip/flag/keep), ושני שיפורים על מסלול ה-dedup-on-insert: #82.4 — merge-with-provenance, לא blind-drop: כשמדלגים על כפילות-סמנטית (cosine≥0.93), מאחדים את ה-`cites` של השורה הנכנסת אל השכן הקנוני ששורד (במקום לאבד אותם). זהו שדה-ה-provenance היחיד שקיים בהכנסה; בחירת-קנוני + מיזוג-corroboration מלא שייכים למסלול ה-reconciliation הלא-מקוון (#82.7 / #84.2, שם לשורות כבר יש provenance מצטבר) — מתועד בקוד. #82.6 — over-merge guard: ההחלטה PAIRWISE מול שכן יחיד הקרוב ביותר, ורק השורה הנכנסת מודלגת אי-פעם (אף שורה קיימת לא ממוזגת/נמחקת). אין connected-components closure בהכנסה, לכן שרשרת A~B~C לא קורסת לשורה אחת גם כש-A,C מובחנים. מתועד ב-dedup_action + נבדק. invariants: G1 (provenance נשמר במקור, לא אובד) · G2 (לוגיקת-החלטה ב-helper יחיד בדיק, refactor משמר-התנהגות) · INV-G10 (אין auto-merge של שורות קיימות; tail→flag→סקירת-יו"ר). tests: 5 חדשות (skip/flag/keep/over-merge/boundaries) + 59 בדיקות-הלכה קיימות עוברות. 
Co-Authored-By: Claude Opus 4.8 --- mcp-server/src/legal_mcp/services/db.py | 31 ++++++++--- .../src/legal_mcp/services/halacha_quality.py | 29 ++++++++++ mcp-server/tests/test_halacha_dedup_action.py | 53 +++++++++++++++++++ 3 files changed, 106 insertions(+), 7 deletions(-) create mode 100644 mcp-server/tests/test_halacha_dedup_action.py diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 87e6cd5..cf9263e 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -4299,22 +4299,39 @@ async def store_halachot_for_chunk( flags = list(h.get("quality_flags") or []) if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0: neighbor = await conn.fetchrow( - "SELECT rule_statement, (embedding <=> $2) AS dist " + "SELECT id, rule_statement, cites, (embedding <=> $2) AS dist " "FROM halachot WHERE case_law_id = $1 " "AND embedding IS NOT NULL " "ORDER BY embedding <=> $2 LIMIT 1", case_law_id, emb, ) if neighbor is not None: - dist = float(neighbor["dist"]) - if dist <= dedup_distance: + # PAIRWISE decision vs the single nearest neighbor — no + # cluster closure, so a chain A~B~C can't over-merge to one + # row (#82.6 over-merge guard). See halacha_quality.dedup_action. + action = halacha_quality.dedup_action( + float(neighbor["dist"]), h["rule_statement"], + neighbor["rule_statement"], dedup_distance, band_distance, + ) + if action == "skip": + # #82.4 — merge-with-provenance, not blind drop: fold the + # incoming row's cites into the surviving neighbor (the + # only provenance present at insert; full canonical- + # selection/merge lives in the offline reconciliation + # path, #82.7 / #84.2). 
+ new_cites = [c for c in (h.get("cites") or []) if c] + if new_cites: + await conn.execute( + "UPDATE halachot SET cites = ARRAY(SELECT DISTINCT " + "unnest(COALESCE(cites, '{}') || $2::text[])), " + "updated_at = now() WHERE id = $1", + neighbor["id"], new_cites, + ) skipped += 1 continue # tail band: below auto-skip but lexically near → flag. - if (dist <= band_distance - and halacha_quality.FLAG_NEAR_DUPLICATE not in flags - and halacha_quality.lexical_near_duplicate( - h["rule_statement"], neighbor["rule_statement"])): + if (action == "flag" + and halacha_quality.FLAG_NEAR_DUPLICATE not in flags): flags.append(halacha_quality.FLAG_NEAR_DUPLICATE) confidence = float(h.get("confidence", 0.0)) diff --git a/mcp-server/src/legal_mcp/services/halacha_quality.py b/mcp-server/src/legal_mcp/services/halacha_quality.py index 5d15c17..ee90d05 100644 --- a/mcp-server/src/legal_mcp/services/halacha_quality.py +++ b/mcp-server/src/legal_mcp/services/halacha_quality.py @@ -244,6 +244,35 @@ def lexical_near_duplicate( or normalized_levenshtein(a, b) >= levenshtein_min) +def dedup_action( + dist: float, rule_new: str, rule_neighbor: str, + dedup_distance: float, band_distance: float, +) -> str: + """Decide a fresh halacha's fate vs its nearest same-precedent neighbor (#82.4). + + PAIRWISE by construction — it compares the new rule to exactly ONE neighbor + (the nearest already-stored one), never to a cluster, so dedup-on-insert can + NEVER collapse a chain A~B~C into a single row even when A and C are + distinct: each insert is an independent pairwise decision and only the + *incoming* row is ever skipped (no existing row is merged or deleted). This + is the over-merge guard (#82.6) — connected-components closure, the central + over-merge risk in entity-resolution, is deliberately NOT performed here. + + ``dist`` is cosine distance (1 − cosine sim) to the neighbor. 
Returns: + * 'skip' — semantic duplicate (dist ≤ dedup_distance): drop the incoming + row; the caller unions its provenance (cites) into the surviving + neighbor rather than blind-dropping it. + * 'flag' — lexical tail (dedup_distance < dist ≤ band_distance AND high + lexical overlap): keep the row but mark near_duplicate → chair review. + * 'keep' — distinct enough: store normally. + """ + if dist <= dedup_distance: + return "skip" + if dist <= band_distance and lexical_near_duplicate(rule_new, rule_neighbor): + return "flag" + return "keep" + + # ── Aggregate ── FLAG_NON_DECISION = "non_decision" diff --git a/mcp-server/tests/test_halacha_dedup_action.py b/mcp-server/tests/test_halacha_dedup_action.py new file mode 100644 index 0000000..b20a984 --- /dev/null +++ b/mcp-server/tests/test_halacha_dedup_action.py @@ -0,0 +1,53 @@ +"""Tests for #82.4 / #82.6 — dedup-on-insert decision + over-merge guard. + +``halacha_quality.dedup_action`` is the PAIRWISE decision a fresh halacha makes +against its single nearest same-precedent neighbor: skip (semantic dup), flag +(lexical tail), or keep. It compares to exactly ONE neighbor and only ever drops +the *incoming* row, so a chain A~B~C can never collapse to one row — the +over-merge guard (#82.6). Pure/offline. 
+""" + +from __future__ import annotations + +import pytest + +from legal_mcp.services import halacha_quality as hq + +# operating point: DEDUP_COSINE=0.93 → dedup_distance=0.07 ; BAND=0.83 → 0.17 +DEDUP_D = 1.0 - 0.93 +BAND_D = 1.0 - 0.83 + +SIMILAR_A = "מיצוי הליכים הוא תנאי סף להגשת ערר לוועדה" +SIMILAR_B = "מיצוי הליכים הוא תנאי סף להגשת הערר לוועדה" +DIFFERENT = "מתחם שיקול הדעת התכנוני של הוועדה המקומית רחב" + + +def test_skip_below_dedup_distance(): + # cosine ≥ 0.93 (dist ≤ 0.07) → skip, regardless of wording + assert hq.dedup_action(0.03, DIFFERENT, SIMILAR_A, DEDUP_D, BAND_D) == "skip" + assert hq.dedup_action(0.05, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D) == "skip" + + +def test_flag_in_lexical_tail(): + # in the 0.07–0.17 band AND lexically near → flag (not skip, not keep) + assert hq.dedup_action(0.12, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D) == "flag" + + +def test_keep_in_tail_when_not_lexically_similar(): + # in the band but lexically distinct → keep (don't flag a different rule) + assert hq.dedup_action(0.12, DIFFERENT, SIMILAR_A, DEDUP_D, BAND_D) == "keep" + + +def test_over_merge_guard_distinct_rule_kept(): + """Beyond the band, even a lexically-similar rule is KEPT — and because the + decision is pairwise (one neighbor, incoming-only drop), a chain A~B~C with + A,C distinct never collapses to a single row (#82.6).""" + assert hq.dedup_action(0.30, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D) == "keep" + assert hq.dedup_action(0.50, DIFFERENT, SIMILAR_A, DEDUP_D, BAND_D) == "keep" + + +def test_boundary_exactly_at_band_edge(): + # dist == band_distance is still within the tail (≤), lexical → flag + assert hq.dedup_action(BAND_D, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D) == "flag" + # just past the band → keep + assert hq.dedup_action(BAND_D + 0.001, SIMILAR_A, SIMILAR_B, DEDUP_D, BAND_D) == "keep"