feat(graph): research-gap (ghost) nodes (corpus graph PR C)

Turns the graph into a gap-finder: the 247 unresolved internal citations
(a corpus precedent cites a ruling NOT in the corpus) collapse to 230 distinct
"gap" nodes — each sized by how many corpus precedents cite it, i.e. the
most-wanted missing precedent.

Backend (web/graph_api.py — read-only, G2):
- "gap" added to VALID_NODE_TYPES (NOT default → off unless requested).
- New _gap_nodes_and_edges(): gap:<normalized citation> nodes from
  precedent_internal_citations WHERE cited_case_law_id IS NULL, sized by global
  in-degree; cites edges only from precedents present in the view (dangling-edge
  invariant holds). Best-effort enrichment from missing_precedents via exact
  normalized-citation match → gap_status + missing_precedent_id. Validated:
  230 gaps, top ע"א 3213/97 (cited 5×), 230/230 matched to missing_precedents.
- GraphNode += gap_status, missing_precedent_id. Metrics correctly exclude gap
  edges (target not a precedent). No app.py change (gated via node_types).

Frontend:
- graph.ts: GraphNodeType += "gap"; node fields.
- graph-filter-panel: toggle "חוסרי מחקר (פסיקה חסרה)" (off by default).
- graph-canvas: gaps render as faint hollow dashed circles, never recoloured
  by color-by; sized by citation count.
- graph-node-panel: gap branch — "מצוטטת ע״י N פסיקות" + status badge + link
  to /missing-precedents.

web-ui build + lint pass. Invariants: G2 (SELECT-only), UI2 (model grows on
explicit Pydantic). api:types post-deploy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 21:21:53 +00:00
parent ecd9e46bb9
commit 9a126f7c36
6 changed files with 151 additions and 7 deletions

View File

@@ -38,7 +38,7 @@ from pydantic import BaseModel
from web import graph_metrics
# ── Node-type vocabulary ─────────────────────────────────────────────
# Diff context: this first assignment is the pre-change vocabulary; it is
# superseded by the assignment directly below, which adds "gap".
VALID_NODE_TYPES = {"precedent", "halacha", "topic", "practice_area"}
# Every node type that may be requested. "gap" (research-gap / ghost nodes) is
# valid but intentionally absent from DEFAULT_NODE_TYPES, so gap nodes are
# only produced when explicitly asked for.
VALID_NODE_TYPES = {"precedent", "halacha", "topic", "practice_area", "gap"}
# Node types used by default (no "halacha", no "gap").
DEFAULT_NODE_TYPES = ("precedent", "topic", "practice_area")
# NOTE(review): presumably the default and the upper bound for the number of
# nodes returned per graph request; confirm against the endpoint code.
NODE_CAP_DEFAULT = 400
NODE_CAP_MAX = 1500
@@ -69,6 +69,9 @@ class GraphNode(BaseModel):
pagerank: float | None = None # normalized 0-1 (global influence)
betweenness: float | None = None # normalized 0-1 (bridge-ness)
community: int | None = None # dense cluster id, 0 = largest
# Gap nodes only: research-gap status taken from missing_precedents on a
# best-effort basis; both fields stay None when no matching row is found.
gap_status: str | None = None # open | uploaded | closed | irrelevant
missing_precedent_id: str | None = None # id of the matched missing_precedents row, as a string
class GraphFacets(BaseModel):
@@ -233,6 +236,76 @@ async def _edges_and_hubs(
return hub_nodes, edges
# SQL expression that normalizes a cited case number: btrim() the raw value,
# then collapse every whitespace run to a single space ('g' = all matches).
# Interpolated into the citation queries so grouping and matching agree.
_NORM_NUM = r"regexp_replace(btrim(cited_case_number), '\s+', ' ', 'g')"
async def _gap_nodes_and_edges(
    conn: asyncpg.Connection,
    prec_ids: list,
) -> tuple[list[GraphNode], list[GraphEdge]]:
    """Build research-gap ("ghost") nodes and the ``cites`` edges pointing at them.

    A gap is a ruling that corpus precedents cite but that is not itself in the
    corpus (``precedent_internal_citations.cited_case_law_id IS NULL``). One
    ``gap:<normalized citation>`` node is emitted per distinct normalized cited
    number that is cited by at least one precedent in ``prec_ids``.

    Args:
        conn: Open database connection; only SELECT statements are issued.
        prec_ids: Ids of the precedents present in the current view. Edges are
            emitted only from these, so no edge references a missing source.

    Returns:
        ``(nodes, edges)``. Both lists are empty when ``prec_ids`` is empty or
        none of the given precedents has an unresolved citation. Each node's
        ``size`` is the number of distinct corpus precedents citing it
        (global, not limited to ``prec_ids``). ``gap_status`` and
        ``missing_precedent_id`` are filled from ``missing_precedents`` when a
        row with the same normalized citation exists, otherwise left ``None``.
    """
    nodes: list[GraphNode] = []
    edges: list[GraphEdge] = []
    if not prec_ids:
        return nodes, edges

    # One row per (citing precedent, normalized number). DISTINCT prevents
    # duplicate edges when a precedent cites the same ruling more than once
    # (or with whitespace variants that normalize to the same string).
    edge_rows = await conn.fetch(
        f"""
        SELECT DISTINCT {_NORM_NUM} AS num, source_case_law_id AS s
        FROM precedent_internal_citations
        WHERE cited_case_law_id IS NULL AND btrim(cited_case_number) <> ''
          AND source_case_law_id = ANY($1::uuid[])
        """,
        prec_ids,
    )
    if not edge_rows:
        return nodes, edges
    nums = {row["num"] for row in edge_rows}

    # Global in-degree per number (importance), independent of the view/cap.
    # Counts distinct citing precedents rather than citation rows, so repeated
    # citations inside one precedent do not inflate the node size.
    indeg_rows = await conn.fetch(
        f"""
        SELECT {_NORM_NUM} AS num, COUNT(DISTINCT source_case_law_id) AS n
        FROM precedent_internal_citations
        WHERE cited_case_law_id IS NULL AND btrim(cited_case_number) <> ''
        GROUP BY 1
        """
    )
    indeg = {row["num"]: int(row["n"]) for row in indeg_rows}

    # Best-effort enrichment from missing_precedents (exact normalized match).
    # A gap without a matching row still renders, just without status/id.
    mp_rows = await conn.fetch(
        "SELECT id, regexp_replace(btrim(citation), '\\s+', ' ', 'g') AS num, status "
        "FROM missing_precedents"
    )
    mp = {row["num"]: (str(row["id"]), row["status"]) for row in mp_rows if row["num"]}

    # Sorted for a stable node order across requests.
    for num in sorted(nums):
        match = mp.get(num)
        nodes.append(
            GraphNode(
                id=f"gap:{num}",
                type="gap",
                label=num,
                size=indeg.get(num, 1),
                gap_status=(match[1] if match else None),
                missing_precedent_id=(match[0] if match else None),
            )
        )

    edges.extend(
        GraphEdge(source=f"cl:{row['s']}", target=f"gap:{row['num']}", type="cites")
        for row in edge_rows
    )
    return nodes, edges
# ── Endpoints' core logic ────────────────────────────────────────────
async def build_corpus_graph(
pool: asyncpg.Pool,
@@ -308,6 +381,10 @@ async def build_corpus_graph(
nodes = [_precedent_node(r) for r in prec_rows]
hub_nodes, edges = await _edges_and_hubs(conn, prec_rows, types)
nodes.extend(hub_nodes)
if "gap" in types:
gap_nodes, gap_edges = await _gap_nodes_and_edges(conn, [r["id"] for r in prec_rows])
nodes.extend(gap_nodes)
edges.extend(gap_edges)
if metrics:
_stamp_metrics(nodes, edges)
@@ -441,6 +518,10 @@ async def build_node_neighborhood(
nodes = [_precedent_node(r) for r in prec_rows]
hub_nodes, edges = await _edges_and_hubs(conn, prec_rows, forced_types)
nodes.extend(hub_nodes)
if "gap" in forced_types:
gap_nodes, gap_edges = await _gap_nodes_and_edges(conn, [r["id"] for r in prec_rows])
nodes.extend(gap_nodes)
edges.extend(gap_edges)
return CorpusGraph(
nodes=nodes,