feat(graph): research-gap (ghost) nodes (corpus graph PR C)

Turns the graph into a gap-finder: the 247 unresolved internal citations (a corpus precedent cites a ruling NOT in the corpus) collapse to 230 distinct "gap" nodes — each sized by how many corpus precedents cite it, i.e. the most-wanted missing precedent. Backend (web/graph_api.py — read-only, G2): - "gap" added to VALID_NODE_TYPES (NOT default → off unless requested). - New _gap_nodes_and_edges(): gap:<normalized citation> nodes from precedent_internal_citations WHERE cited_case_law_id IS NULL, sized by global in-degree; cites edges only from precedents present in the view (dangling-edge invariant holds). Best-effort enrichment from missing_precedents via exact normalized-citation match → gap_status + missing_precedent_id. Validated: 230 gaps, top ע"א 3213/97 (cited 5×), 230/230 matched to missing_precedents. - GraphNode += gap_status, missing_precedent_id. Metrics correctly exclude gap edges (target not a precedent). No app.py change (gated via node_types). Frontend: - graph.ts: GraphNodeType += "gap"; node fields. - graph-filter-panel: toggle "חוסרי מחקר (פסיקה חסרה)" (off by default). - graph-canvas: gaps render as faint hollow dashed circles, never recoloured by color-by; sized by citation count. - graph-node-panel: gap branch — "מצוטטת ע״י N פסיקות" + status badge + link to /missing-precedents. web-ui build + lint pass. Invariants: G2 (SELECT-only), UI2 (model grows on explicit Pydantic). api:types post-deploy. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 21:21:53 +00:00
parent ecd9e46bb9
commit 9a126f7c36
6 changed files with 151 additions and 7 deletions
--- a/web/graph_api.py
+++ b/web/graph_api.py
@@ -38,7 +38,7 @@ from pydantic import BaseModel
 from web import graph_metrics

 # ── Node-type vocabulary ─────────────────────────────────────────────
-VALID_NODE_TYPES = {"precedent", "halacha", "topic", "practice_area"}
+VALID_NODE_TYPES = {"precedent", "halacha", "topic", "practice_area", "gap"}
 DEFAULT_NODE_TYPES = ("precedent", "topic", "practice_area")
 NODE_CAP_DEFAULT = 400
 NODE_CAP_MAX = 1500
@@ -69,6 +69,9 @@ class GraphNode(BaseModel):
    pagerank: float | None = None  # normalized 0–1 (global influence)
    betweenness: float | None = None  # normalized 0–1 (bridge-ness)
    community: int | None = None  # dense cluster id, 0 = largest
+    # Gap nodes only — research-gap status from missing_precedents (best-effort).
+    gap_status: str | None = None  # open | uploaded | closed | irrelevant
+    missing_precedent_id: str | None = None


 class GraphFacets(BaseModel):
@@ -233,6 +236,76 @@ async def _edges_and_hubs(
    return hub_nodes, edges


+_NORM_NUM = "regexp_replace(btrim(cited_case_number), '\\s+', ' ', 'g')"
+
+
+async def _gap_nodes_and_edges(
+    conn: asyncpg.Connection,
+    prec_ids: list,
+) -> tuple[list[GraphNode], list[GraphEdge]]:
+    """Research-gap ("ghost") nodes: precedents that are CITED but not in the
+    corpus (``precedent_internal_citations.cited_case_law_id IS NULL``).
+
+    One ``gap:<normalized citation>`` node per distinct cited number, sized by
+    how many corpus precedents cite it (global — the "most-wanted missing
+    precedent"). Edges only from citing precedents present in ``prec_ids`` so no
+    edge dangles. Best-effort enriched with ``missing_precedents`` status via an
+    exact normalized-citation match (an unmatched gap still renders)."""
+    nodes: list[GraphNode] = []
+    edges: list[GraphEdge] = []
+    if not prec_ids:
+        return nodes, edges
+
+    # Edges from the displayed precedents to the numbers they cite.
+    edge_rows = await conn.fetch(
+        f"""
+        SELECT {_NORM_NUM} AS num, source_case_law_id AS s
+        FROM precedent_internal_citations
+        WHERE cited_case_law_id IS NULL AND btrim(cited_case_number) <> ''
+          AND source_case_law_id = ANY($1::uuid[])
+        """,
+        prec_ids,
+    )
+    if not edge_rows:
+        return nodes, edges
+    nums = {r["num"] for r in edge_rows}
+
+    # Global in-degree per number (importance), independent of the cap.
+    indeg_rows = await conn.fetch(
+        f"""
+        SELECT {_NORM_NUM} AS num, COUNT(*) AS n
+        FROM precedent_internal_citations
+        WHERE cited_case_law_id IS NULL AND btrim(cited_case_number) <> ''
+        GROUP BY 1
+        """
+    )
+    indeg = {r["num"]: int(r["n"]) for r in indeg_rows}
+
+    # Best-effort enrichment from missing_precedents (exact normalized match).
+    mp_rows = await conn.fetch(
+        "SELECT id, regexp_replace(btrim(citation), '\\s+', ' ', 'g') AS num, status "
+        "FROM missing_precedents"
+    )
+    mp = {r["num"]: (str(r["id"]), r["status"]) for r in mp_rows if r["num"]}
+
+    for num in sorted(nums):
+        gid = f"gap:{num}"
+        match = mp.get(num)
+        nodes.append(
+            GraphNode(
+                id=gid,
+                type="gap",
+                label=num,
+                size=indeg.get(num, 1),
+                gap_status=(match[1] if match else None),
+                missing_precedent_id=(match[0] if match else None),
+            )
+        )
+    for r in edge_rows:
+        edges.append(GraphEdge(source=f"cl:{r['s']}", target=f"gap:{r['num']}", type="cites"))
+    return nodes, edges
+
+
 # ── Endpoints' core logic ────────────────────────────────────────────
 async def build_corpus_graph(
    pool: asyncpg.Pool,
@@ -308,6 +381,10 @@ async def build_corpus_graph(
        nodes = [_precedent_node(r) for r in prec_rows]
        hub_nodes, edges = await _edges_and_hubs(conn, prec_rows, types)
        nodes.extend(hub_nodes)
+        if "gap" in types:
+            gap_nodes, gap_edges = await _gap_nodes_and_edges(conn, [r["id"] for r in prec_rows])
+            nodes.extend(gap_nodes)
+            edges.extend(gap_edges)

    if metrics:
        _stamp_metrics(nodes, edges)
@@ -441,6 +518,10 @@ async def build_node_neighborhood(
        nodes = [_precedent_node(r) for r in prec_rows]
        hub_nodes, edges = await _edges_and_hubs(conn, prec_rows, forced_types)
        nodes.extend(hub_nodes)
+        if "gap" in forced_types:
+            gap_nodes, gap_edges = await _gap_nodes_and_edges(conn, [r["id"] for r in prec_rows])
+            nodes.extend(gap_nodes)
+            edges.extend(gap_edges)

    return CorpusGraph(
        nodes=nodes,