"""Graph metrics for the corpus graph — dependency-free (no networkx). Computed in-memory over the precedent citation subgraph that ``graph_api`` already fetched (**G2**: no DB access here — pure functions over data the caller holds). The corpus graph is tiny (≤ ``NODE_CAP_MAX`` = 1500 nodes, sparse), so power-iteration PageRank, Brandes betweenness, and label-propagation communities all run synchronously well under a second. Edge model: ``cites`` is directional (authority flows citing → cited); ``same_chain`` is non-directional. PageRank uses cites-direction + same_chain both ways; betweenness and communities treat the whole graph as undirected. Determinism (stable colors across requests): nodes are processed in sorted order and ties break by lowest label — no randomness. """ from __future__ import annotations from collections import Counter, defaultdict, deque def compute( node_ids: list[str], directed_edges: list[tuple[str, str]], undirected_edges: list[tuple[str, str]] | None = None, ) -> dict[str, dict]: """Return ``{node_id: {pagerank, betweenness, community}}``. ``pagerank`` / ``betweenness`` are normalized to max = 1.0 (easy client scaling); ``community`` is a dense int 0..k-1 ordered by descending cluster size (so the largest cluster is always colour 0). """ nodes = list(node_ids) node_set = set(nodes) if not nodes: return {} undirected_edges = undirected_edges or [] de = [(s, t) for s, t in directed_edges if s in node_set and t in node_set and s != t] ue = [(s, t) for s, t in undirected_edges if s in node_set and t in node_set and s != t] pr = _normalize(_pagerank(nodes, de, ue)) bt = _normalize(_betweenness(nodes, de, ue)) comm = _communities(nodes, de, ue) return { n: { "pagerank": round(pr[n], 4), "betweenness": round(bt[n], 4), "community": comm[n], } for n in nodes } def _normalize(d: dict[str, float]) -> dict[str, float]: m = max(d.values()) if d else 0.0 if m <= 0: return {k: 0.0 for k in d} return {k: v / m for k, v in d.items()} def _undirected_adj(nodes: list[str], de, ue) -> dict[str, set[str]]: adj: dict[str, set[str]] = {n: set() for n in nodes} for s, t in de: adj[s].add(t) adj[t].add(s) for s, t in ue: adj[s].add(t) adj[t].add(s) return adj def _pagerank(nodes, de, ue, d: float = 0.85, iters: int = 100, tol: float = 1e-9): """Power-iteration PageRank. cites direction + same_chain both ways.""" out: dict[str, list[str]] = defaultdict(list) for s, t in de: out[s].append(t) for s, t in ue: out[s].append(t) out[t].append(s) n = len(nodes) pr = {x: 1.0 / n for x in nodes} for _ in range(iters): dangling = sum(pr[x] for x in nodes if not out[x]) base = (1.0 - d) / n + d * dangling / n new = {x: base for x in nodes} for x in nodes: deg = len(out[x]) if deg: share = d * pr[x] / deg for m in out[x]: new[m] += share if sum(abs(new[x] - pr[x]) for x in nodes) < tol: return new pr = new return pr def _betweenness(nodes, de, ue): """Brandes betweenness on the undirected graph. O(V·(V+E)).""" adj = _undirected_adj(nodes, de, ue) bc = {x: 0.0 for x in nodes} for s in nodes: stack: list[str] = [] preds: dict[str, list[str]] = {w: [] for w in nodes} sigma = {w: 0.0 for w in nodes} sigma[s] = 1.0 dist = {w: -1 for w in nodes} dist[s] = 0 queue = deque([s]) while queue: v = queue.popleft() stack.append(v) for w in adj[v]: if dist[w] < 0: dist[w] = dist[v] + 1 queue.append(w) if dist[w] == dist[v] + 1: sigma[w] += sigma[v] preds[w].append(v) delta = {w: 0.0 for w in nodes} while stack: w = stack.pop() for v in preds[w]: if sigma[w]: delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w]) if w != s: bc[w] += delta[w] # Undirected: each shortest path counted from both endpoints. return {x: v / 2.0 for x, v in bc.items()} def _communities(nodes, de, ue) -> dict[str, int]: """Deterministic synchronous label propagation (+ dense renumbering). Each node starts in its own community and repeatedly adopts the most common label among its neighbours (ties → lowest label). Isolated nodes keep their own singleton community. Labels are renumbered 0..k-1 by descending size. """ adj = _undirected_adj(nodes, de, ue) order = sorted(nodes) label = {n: n for n in nodes} for _ in range(30): changed = False for n in order: neigh = adj[n] if not neigh: continue counts = Counter(label[m] for m in neigh) best = min(counts.items(), key=lambda kv: (-kv[1], kv[0]))[0] if label[n] != best: label[n] = best changed = True if not changed: break sizes = Counter(label.values()) ranked = [lab for lab, _ in sorted(sizes.items(), key=lambda kv: (-kv[1], kv[0]))] remap = {lab: i for i, lab in enumerate(ranked)} return {n: remap[label[n]] for n in nodes}