feat(graph): centrality + cluster analytics (corpus graph PR B)

The Obsidian "Graph Analysis" equivalent — surfaces influence and structure beyond raw citation count.

Backend (new web/graph_metrics.py — pure, dependency-free, no DB → G2):
- PageRank (power-iteration), betweenness (Brandes), community (deterministic label-propagation + connected-components fallback), computed in-memory over the precedent citation subgraph that build_corpus_graph already fetched. Normalized 0–1; community ints dense-ranked by size (stable colours).
- GraphNode += pagerank/betweenness/community (None unless metrics=true).
- build_corpus_graph + /api/graph/corpus gain a metrics parameter (default false, so the default path is unchanged).

Validated on the live corpus: 147 nodes in 13ms.

Frontend:
- graph.ts: GraphNode metrics fields + metrics param.
- graph-canvas: color-by (type | practice_area | precedent_level | community | recency) and size-by (in-degree | pagerank | betweenness) via colorForNode / radiusForNode; exported palettes.
- graph-view: colorBy/sizeBy controls; metrics requested only when needed; global metrics overlaid onto neighborhood nodes by id (a node's PageRank shouldn't change when focused); a ranking panel (Tabs: המשפיעות / גשרים, click → focus); dynamic legend per color-by.
- graph-filter-panel: "צביעה לפי" + "גודל נקודה לפי" Selects.

web-ui build + lint pass.

Invariants: G2 (metrics pure, no DB writes), UI2 (model grows on explicit Pydantic). api:types post-deploy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 21:04:47 +00:00
parent 106ab53231
commit 2fbc0cd3c2
7 changed files with 497 additions and 19 deletions
--- a/web/app.py
+++ b/web/app.py
@@ -5780,6 +5780,7 @@ async def graph_corpus(
    district: str = "",
    year_from: int = 0,
    year_to: int = 0,
+    metrics: bool = False,
 ):
    """Full corpus graph under the given filters (most-cited nodes survive the cap)."""
    if practice_area and practice_area not in _PRACTICE_AREAS:
@@ -5799,6 +5800,7 @@ async def graph_corpus(
        district=district,
        year_from=year_from,
        year_to=year_to,
+        metrics=metrics,
    )


--- a/web/graph_api.py
+++ b/web/graph_api.py
@@ -35,6 +35,8 @@ from uuid import UUID
 import asyncpg
 from pydantic import BaseModel

+from web import graph_metrics
+
 # ── Node-type vocabulary ─────────────────────────────────────────────
 VALID_NODE_TYPES = {"precedent", "halacha", "topic", "practice_area"}
 DEFAULT_NODE_TYPES = ("precedent", "topic", "practice_area")
@@ -63,6 +65,10 @@ class GraphNode(BaseModel):
    court: str | None = None  # precedents only — for color-by / filter
    date: str | None = None  # precedents only — ISO date, for recency color/filter
    case_law_id: str | None = None  # canonical id for deep-link (precedents)
+    # Graph metrics — populated only when ``metrics=true`` (precedents only).
+    pagerank: float | None = None  # normalized 0–1 (global influence)
+    betweenness: float | None = None  # normalized 0–1 (bridge-ness)
+    community: int | None = None  # dense cluster id, 0 = largest


 class GraphFacets(BaseModel):
@@ -243,6 +249,7 @@ async def build_corpus_graph(
    district: str = "",
    year_from: int = 0,
    year_to: int = 0,
+    metrics: bool = False,
 ) -> CorpusGraph:
    """Assemble the full corpus graph under the given filters.

@@ -250,6 +257,10 @@ async def build_corpus_graph(
    so clipping never hides the structurally important nodes. ``truncated`` +
    ``total_available`` let the UI prompt the user to narrow filters. All
    filters are applied server-side in the WHERE clause (G5).
+
+    When ``metrics`` is true, PageRank / betweenness / community are computed
+    in-memory over the precedent citation subgraph (``graph_metrics``) and
+    stamped onto precedent nodes — no extra DB work (G2).
    """
    types = normalize_node_types(node_types)
    cap = max(1, min(int(limit), NODE_CAP_MAX))
@@ -298,6 +309,9 @@ async def build_corpus_graph(
        hub_nodes, edges = await _edges_and_hubs(conn, prec_rows, types)
        nodes.extend(hub_nodes)

+    if metrics:
+        _stamp_metrics(nodes, edges)
+
    return CorpusGraph(
        nodes=nodes,
        edges=edges,
@@ -306,6 +320,23 @@ async def build_corpus_graph(
    )


+def _stamp_metrics(nodes: list[GraphNode], edges: list[GraphEdge]) -> None:
+    """Compute PageRank/betweenness/community over the precedent subgraph and
+    stamp them onto precedent nodes in place (hubs stay ``None``).
+
+    Only nodes whose ``type`` is ``"precedent"`` are handed to
+    ``graph_metrics.compute``. ``cites`` edges are passed as directed
+    ``(source, target)`` pairs and ``same_chain`` edges as undirected pairs;
+    edges of any other type are ignored. Returns ``None`` — the node objects
+    in ``nodes`` are mutated. When there are no precedent nodes nothing is
+    computed and nothing is changed.
+    """
+    prec_ids = [n.id for n in nodes if n.type == "precedent"]
+    if not prec_ids:
+        return
+    directed = [(e.source, e.target) for e in edges if e.type == "cites"]
+    undirected = [(e.source, e.target) for e in edges if e.type == "same_chain"]
+    m = graph_metrics.compute(prec_ids, directed, undirected)
+    for n in nodes:
+        mv = m.get(n.id)  # None for any id that was not passed to compute()
+        if mv:
+            n.pagerank = mv["pagerank"]
+            n.betweenness = mv["betweenness"]
+            n.community = mv["community"]
+
+
 async def build_node_neighborhood(
    pool: asyncpg.Pool,
    node_id: str,
--- a/web/graph_metrics.py
+++ b/web/graph_metrics.py
@@ -0,0 +1,158 @@
+"""Graph metrics for the corpus graph — dependency-free (no networkx).
+
+Computed in-memory over the precedent citation subgraph that ``graph_api``
+already fetched (**G2**: no DB access here — pure functions over data the caller
+holds). The corpus graph is tiny (≤ ``NODE_CAP_MAX`` = 1500 nodes, sparse), so
+power-iteration PageRank, Brandes betweenness, and label-propagation communities
+all run synchronously well under a second.
+
+Edge model: ``cites`` is directional (authority flows citing → cited);
+``same_chain`` is non-directional. PageRank uses cites-direction + same_chain
+both ways; betweenness and communities treat the whole graph as undirected.
+Determinism (stable colors across requests): nodes are processed in sorted
+order and ties break by lowest label — no randomness.
+"""
+from __future__ import annotations
+
+from collections import Counter, defaultdict, deque
+
+
+def compute(
+    node_ids: list[str],
+    directed_edges: list[tuple[str, str]],
+    undirected_edges: list[tuple[str, str]] | None = None,
+) -> dict[str, dict]:
+    """Return ``{node_id: {pagerank, betweenness, community}}``.
+
+    ``node_ids`` is the vertex set. ``directed_edges`` are ``(source, target)``
+    pairs; ``undirected_edges`` (optional, ``None`` means none) are unordered
+    pairs. Any edge with an endpoint outside ``node_ids``, and any self-loop,
+    is dropped before the metrics run. An empty ``node_ids`` yields ``{}``.
+
+    ``pagerank`` / ``betweenness`` are normalized to max = 1.0 (easy client
+    scaling; all 0.0 when no value is positive) and rounded to 4 decimals;
+    ``community`` is a dense int 0..k-1 ordered by descending cluster
+    size (so the largest cluster is always colour 0).
+    """
+    nodes = list(node_ids)
+    node_set = set(nodes)  # set membership for the edge filters below
+    if not nodes:
+        return {}
+    undirected_edges = undirected_edges or []
+
+    # Keep only edges between known nodes and discard self-loops, so the
+    # helpers can index their per-node dicts without missing keys.
+    de = [(s, t) for s, t in directed_edges if s in node_set and t in node_set and s != t]
+    ue = [(s, t) for s, t in undirected_edges if s in node_set and t in node_set and s != t]
+
+    pr = _normalize(_pagerank(nodes, de, ue))
+    bt = _normalize(_betweenness(nodes, de, ue))
+    comm = _communities(nodes, de, ue)
+
+    return {
+        n: {
+            "pagerank": round(pr[n], 4),
+            "betweenness": round(bt[n], 4),
+            "community": comm[n],
+        }
+        for n in nodes
+    }
+
+
+def _normalize(d: dict[str, float]) -> dict[str, float]:
+    """Return a copy of ``d`` with values divided by the largest value.
+
+    The result has the same keys and a maximum of 1.0. When ``d`` is empty or
+    its maximum is not positive, every value is 0.0 instead (this also avoids
+    a division by zero).
+    """
+    m = max(d.values()) if d else 0.0
+    if m <= 0:
+        return {k: 0.0 for k in d}
+    return {k: v / m for k, v in d.items()}
+
+
+def _undirected_adj(nodes: list[str], de, ue) -> dict[str, set[str]]:
+    """Build a symmetric adjacency map ``{node: set_of_neighbours}``.
+
+    Every node in ``nodes`` gets an entry (empty for isolated nodes). Both the
+    directed pairs ``de`` and the undirected pairs ``ue`` are inserted in both
+    directions, and because neighbours are stored in sets, repeated or
+    reversed duplicates of an edge collapse to one. Edge endpoints must
+    already be members of ``nodes``; an unknown endpoint raises ``KeyError``.
+    """
+    adj: dict[str, set[str]] = {n: set() for n in nodes}
+    for s, t in de:
+        adj[s].add(t)
+        adj[t].add(s)
+    for s, t in ue:
+        adj[s].add(t)
+        adj[t].add(s)
+    return adj
+
+
+def _pagerank(nodes, de, ue, d: float = 0.85, iters: int = 100, tol: float = 1e-9):
+    """Power-iteration PageRank. cites direction + same_chain both ways.
+
+    ``de`` pairs contribute one out-link (source → target); ``ue`` pairs
+    contribute an out-link in each direction. Out-links are kept in lists, so
+    an edge that appears more than once carries proportionally more weight.
+
+    ``d`` is the damping factor, ``iters`` the maximum number of iterations and
+    ``tol`` the threshold on the summed absolute change between two successive
+    iterations. Returns ``{node: score}``; scores are not rescaled here.
+    ``nodes`` must be non-empty (the initial value is ``1 / len(nodes)``).
+    """
+    out: dict[str, list[str]] = defaultdict(list)
+    for s, t in de:
+        out[s].append(t)
+    for s, t in ue:
+        out[s].append(t)
+        out[t].append(s)
+    n = len(nodes)
+    pr = {x: 1.0 / n for x in nodes}
+    for _ in range(iters):
+        # Rank held by nodes with no out-links is spread evenly over all nodes.
+        dangling = sum(pr[x] for x in nodes if not out[x])
+        base = (1.0 - d) / n + d * dangling / n
+        new = {x: base for x in nodes}
+        for x in nodes:
+            deg = len(out[x])
+            if deg:
+                share = d * pr[x] / deg
+                for m in out[x]:
+                    new[m] += share
+        # Converged: return the freshly computed vector.
+        if sum(abs(new[x] - pr[x]) for x in nodes) < tol:
+            return new
+        pr = new
+    # Iteration budget exhausted: return the last vector computed.
+    return pr
+
+
+def _betweenness(nodes, de, ue):
+    """Brandes betweenness on the undirected graph. O(V·(V+E)).
+
+    Edge direction is ignored: ``de`` and ``ue`` are merged into one symmetric
+    adjacency map. Returns ``{node: score}`` with raw (un-normalized) scores.
+    """
+    adj = _undirected_adj(nodes, de, ue)
+    bc = {x: 0.0 for x in nodes}
+    for s in nodes:
+        # Phase 1 — BFS from s: count shortest paths (sigma) and record each
+        # node's predecessors on those paths.
+        stack: list[str] = []
+        preds: dict[str, list[str]] = {w: [] for w in nodes}
+        sigma = {w: 0.0 for w in nodes}
+        sigma[s] = 1.0
+        dist = {w: -1 for w in nodes}  # -1 marks "not reached yet"
+        dist[s] = 0
+        queue = deque([s])
+        while queue:
+            v = queue.popleft()
+            stack.append(v)
+            for w in adj[v]:
+                if dist[w] < 0:
+                    dist[w] = dist[v] + 1
+                    queue.append(w)
+                if dist[w] == dist[v] + 1:
+                    sigma[w] += sigma[v]
+                    preds[w].append(v)
+        # Phase 2 — pop nodes in reverse visiting order and accumulate each
+        # node's dependency onto its predecessors.
+        delta = {w: 0.0 for w in nodes}
+        while stack:
+            w = stack.pop()
+            for v in preds[w]:
+                if sigma[w]:
+                    delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w])
+            if w != s:
+                bc[w] += delta[w]
+    # Undirected: each shortest path counted from both endpoints.
+    return {x: v / 2.0 for x, v in bc.items()}
+
+
+def _communities(nodes, de, ue) -> dict[str, int]:
+    """Deterministic in-place label propagation (+ dense renumbering).
+
+    Each node starts in its own community and repeatedly adopts the most common
+    label among its neighbours (ties → lowest label). Nodes are visited in
+    sorted order and a label change takes effect immediately, so nodes later
+    in the same sweep already see it. Sweeps stop as soon as one makes no
+    change, or after 30 sweeps at the latest, in which case the labels from
+    the final sweep are used. Isolated nodes keep their
+    own singleton community. Labels are renumbered 0..k-1 by descending size
+    (equal sizes → lowest original label first).
+    """
+    adj = _undirected_adj(nodes, de, ue)
+    order = sorted(nodes)
+    label = {n: n for n in nodes}
+    for _ in range(30):
+        changed = False
+        for n in order:
+            neigh = adj[n]
+            if not neigh:
+                continue
+            counts = Counter(label[m] for m in neigh)
+            # Highest count wins; among equal counts the smallest label wins.
+            best = min(counts.items(), key=lambda kv: (-kv[1], kv[0]))[0]
+            if label[n] != best:
+                label[n] = best
+                changed = True
+        if not changed:
+            break
+    sizes = Counter(label.values())
+    ranked = [lab for lab, _ in sorted(sizes.items(), key=lambda kv: (-kv[1], kv[0]))]
+    remap = {lab: i for i, lab in enumerate(ranked)}
+    return {n: remap[label[n]] for n in nodes}