From 36f21c815eff2a985097df99016b5213bff42873 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Mon, 4 May 2026 05:13:10 +0000
Subject: [PATCH] fix(precedents): distinguish silent extraction failure from
 "no halachot"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Observed 2026-05-03: a `precedent_process_pending(halacha)` run that
chained two precedents (1110/20 → 317/10) succeeded for the first
(9 halachot, 129 chunks) and produced status=`no_halachot` for the
second despite it being a 47KB Supreme Court ruling with rich legal
analysis. A manual single-precedent re-run on 317/10 immediately
extracted 53 halachot. Diagnosis: every chunk's claude_session call
in the back-to-back run silently failed (likely Anthropic rate-limit
storm after the 1110/20 token burn), and the empty list was reported
as "Claude looked and found nothing" — same code path as a real
0-halacha ruling. The user couldn't tell the difference.

Three changes:

1. Surface chunk-level failures (halacha_extractor.py)
   `_extract_chunk` now returns `(halachot, succeeded)` so the caller
   can count how many chunks crashed. `extract()` uses this to
   distinguish:
   - `no_halachot` — chunks ran cleanly, Claude found nothing
   - `extraction_failed` — ≥50% of chunks crashed AND zero halachot
     came back (rate limit, subprocess crash, etc.)
   When `extraction_failed`, DB status is left as 'processing' so the
   request stays in the queue for the caller to retry — instead of
   the old behaviour where it got marked 'completed' and silently
   dropped from the queue.

2. Inter-precedent cooldown (precedent_library.py)
   `process_pending_extractions` now sleeps 30s between precedents.
   Anthropic rate-limits per-org, and back-to-back large rulings
   (~4M tokens for 1110/20, immediately followed by another 2-3M)
   was the empirical trigger. 30s gives the per-minute counter time
   to drain.

3. Auto-retry on extraction_failed (precedent_library.py)
   When a precedent comes back as `extraction_failed`, retry once
   after a 60s cooldown before giving up. Rate-limit storms are
   transient — the manual re-run of 317/10 minutes later succeeded
   with 53 halachot and zero chunk failures, confirming a single
   retry is sufficient. Only retries `extraction_failed`; never
   `no_halachot` (Claude looked and there genuinely is no holding).

The DB status now ends up as 'failed' only after retries are
exhausted, matching the UI's terminal-failure chip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../legal_mcp/services/halacha_extractor.py   | 64 ++++++++++++++++---
 .../legal_mcp/services/precedent_library.py   | 64 +++++++++++++++++--
 2 files changed, 114 insertions(+), 14 deletions(-)

diff --git a/mcp-server/src/legal_mcp/services/halacha_extractor.py b/mcp-server/src/legal_mcp/services/halacha_extractor.py
index d3267f4..67c5523 100644
--- a/mcp-server/src/legal_mcp/services/halacha_extractor.py
+++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py
@@ -36,6 +36,14 @@ logger = logging.getLogger(__name__)
 CHUNK_CONCURRENCY = 3
 CHUNK_RETRY_ATTEMPTS = 1
 
+# If at least this fraction of chunks crash and the precedent yields zero
+# halachot, treat the run as `extraction_failed` rather than `no_halachot`.
+# Picked at 0.5 so a precedent that genuinely has no holdings (e.g. a remand
+# ruling that just sends the case back) isn't misflagged just because a few
+# chunks timed out, while a real rate-limit storm — which kills nearly every
+# call — is correctly distinguished and re-tried by the caller.
+EXTRACTION_FAILURE_THRESHOLD = 0.5
+
 # Sections from which to extract. facts/intro/appellant_claims/respondent_claims
 # never contain holdings, only positions, so we skip them.
 EXTRACTABLE_SECTIONS = ("legal_analysis", "ruling", "conclusion")
@@ -267,12 +275,18 @@ async def _extract_chunk(
     chunk_total: int,
     context: str,
     is_binding: bool,
-) -> list[dict]:
+) -> tuple[list[dict], bool]:
     """Run the halacha extractor on one chunk with retry.
 
-    The prompt branches on ``is_binding`` so that non-binding sources
-    (other appeals committees, district courts) yield application /
-    persuasive entries rather than a forced 0-result strict halacha pass.
+    Returns ``(halachot, succeeded)`` so the caller can distinguish "Claude
+    said there are no halachot here" (`(_, True)`) from "every attempt
+    crashed/timed out" (`(_, False)`). Without this distinction a precedent
+    that hit a rate-limit storm looks identical to one that genuinely has no
+    halachot — and gets silently marked `no_halachot`.
+
+    The prompt branches on ``is_binding`` so non-binding sources (other
+    appeals committees, district courts) yield application/persuasive
+    entries rather than a forced 0-result strict halacha pass.
     """
     base_prompt = (
         HALACHA_EXTRACTION_PROMPT_BINDING if is_binding
@@ -299,7 +313,7 @@ async def _extract_chunk(
             )
             continue
         if isinstance(result, list):
-            return result
+            return result, True
         logger.warning(
             "halacha_extractor chunk %d/%d attempt %d returned non-list (%s)",
             chunk_index + 1, chunk_total, attempt + 1, type(result).__name__,
@@ -308,7 +322,7 @@ async def _extract_chunk(
         "halacha_extractor chunk %d/%d failed after %d attempts: %s",
         chunk_index + 1, chunk_total, CHUNK_RETRY_ATTEMPTS + 1, last_err,
     )
-    return []
+    return [], False
 
 
 async def extract(case_law_id: UUID | str) -> dict:
@@ -359,7 +373,7 @@ async def extract(case_law_id: UUID | str) -> dict:
 
     sem = asyncio.Semaphore(CHUNK_CONCURRENCY)
 
-    async def _bounded(idx: int, chunk_row: dict) -> list[dict]:
+    async def _bounded(idx: int, chunk_row: dict) -> tuple[list[dict], bool]:
         async with sem:
             return await _extract_chunk(
                 chunk_row["content"], chunk_row["section_type"],
@@ -370,12 +384,44 @@ async def extract(case_law_id: UUID | str) -> dict:
         *[_bounded(i, c) for i, c in enumerate(chunks)]
     )
     raw_halachot: list[dict] = []
-    for items in chunk_results:
+    failed_chunks = 0
+    for items, ok in chunk_results:
         raw_halachot.extend(items)
+        if not ok:
+            failed_chunks += 1
+
+    # If most chunks failed (rate limit storm, claude_session crash, etc.)
+    # do NOT touch the DB status — leave it 'processing' so the caller can
+    # retry without the request falling out of the queue. The caller
+    # (`process_pending_extractions`) is responsible for either retrying or
+    # finalising the status as 'failed' after retries are exhausted. This
+    # is the bug that produced 317/10's silent `no_halachot` after a
+    # 129-chunk neighbour saturated the API.
+    failure_rate = failed_chunks / len(chunks) if chunks else 0
+    if failure_rate >= EXTRACTION_FAILURE_THRESHOLD and not raw_halachot:
+        logger.error(
+            "halacha_extractor: case_law=%s extraction_failed — "
+            "%d/%d chunks failed (rate=%.0f%%), no halachot retrieved. "
+            "DB status left as 'processing' for caller-level retry.",
+            case_law_id, failed_chunks, len(chunks), failure_rate * 100,
+        )
+        return {
+            "status": "extraction_failed",
+            "extracted": 0,
+            "stored": 0,
+            "failed_chunks": failed_chunks,
+            "total_chunks": len(chunks),
+        }
 
     if not raw_halachot:
         await db.set_case_law_halacha_status(case_law_id, "completed")
-        return {"status": "no_halachot", "extracted": 0, "stored": 0}
+        return {
+            "status": "no_halachot",
+            "extracted": 0,
+            "stored": 0,
+            "failed_chunks": failed_chunks,
+            "total_chunks": len(chunks),
+        }
 
     # Validate against the full text of the precedent for the quote check.
     full_text = record.get("full_text") or ""
diff --git a/mcp-server/src/legal_mcp/services/precedent_library.py b/mcp-server/src/legal_mcp/services/precedent_library.py
index 81c6bae..de4a1c2 100644
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -270,6 +270,19 @@ async def reextract_halachot(
     return result
 
 
+# Wait this many seconds between precedents in a multi-precedent run.
+# Anthropic rate-limits across the org, so back-to-back extractions of large
+# rulings (e.g. 129 chunks for one, then 79 for another) can spill the second
+# precedent into a 429 storm. Observed 2026-05-03: 1110/20 succeeded with 9
+# halachot, 317/10 immediately after returned silent no_halachot.
+INTER_PRECEDENT_COOLDOWN_SEC = 30
+
+# How many times to retry a precedent that came back as 'extraction_failed'
+# (i.e. >50% chunks crashed). Each retry uses a longer cooldown.
+PRECEDENT_RETRY_ATTEMPTS = 1
+PRECEDENT_RETRY_COOLDOWN_SEC = 60
+
+
 async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -> dict:
     """Drain the extraction queue (UI-button-stamped requests).
 
@@ -279,6 +292,14 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
     tool — picks each stamped row up, runs the extractor, and clears the
     timestamp.
 
+    Sequencing: precedents are processed serially (never in parallel) and
+    each is followed by a short cooldown so the Anthropic rate-limit
+    counter has time to drain before the next big precedent starts. If
+    halacha extraction comes back as ``extraction_failed`` we retry the
+    same precedent once with a longer cooldown — matching the empirical
+    pattern where the second precedent in a back-to-back run gets
+    rate-limited but recovers after a brief pause.
+
     Args:
         kind: 'metadata' or 'halacha'.
         limit: max rows to process this run.
@@ -292,15 +313,46 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
     if not pending:
         return {"status": "no_pending", "kind": kind, "processed": 0, "results": []}
 
+    async def _run_once(cid: UUID) -> dict:
+        if kind == "metadata":
+            return await precedent_metadata_extractor.extract_and_apply(cid)
+        return await halacha_extractor.extract(cid)
+
     results: list[dict] = []
     processed = 0
-    for row in pending:
+    for idx, row in enumerate(pending):
+        if idx > 0:
+            await asyncio.sleep(INTER_PRECEDENT_COOLDOWN_SEC)
         cid = UUID(str(row["id"]))
+        attempts = 0
+        result: dict = {}
         try:
-            if kind == "metadata":
-                result = await precedent_metadata_extractor.extract_and_apply(cid)
-            else:
-                result = await halacha_extractor.extract(cid)
+            result = await _run_once(cid)
+            # Retry only on systematic extraction failure (rate-limit storm).
+            # Don't retry on 'no_halachot' — that means Claude looked and
+            # genuinely found nothing.
+            while (
+                result.get("status") == "extraction_failed"
+                and attempts < PRECEDENT_RETRY_ATTEMPTS
+            ):
+                attempts += 1
+                logger.warning(
+                    "process_pending_extractions: %s returned extraction_failed "
+                    "(%d/%d chunks crashed), retry %d/%d after %ds cooldown",
+                    cid,
+                    result.get("failed_chunks", 0),
+                    result.get("total_chunks", 0),
+                    attempts, PRECEDENT_RETRY_ATTEMPTS,
+                    PRECEDENT_RETRY_COOLDOWN_SEC,
+                )
+                await asyncio.sleep(PRECEDENT_RETRY_COOLDOWN_SEC)
+                result = await _run_once(cid)
+
+            # Finalise: success or terminal failure both clear the request
+            # so the queue moves on. (Use 'failed' DB state for terminal
+            # extraction_failed so the UI shows the warning chip.)
+            if kind == "halacha" and result.get("status") == "extraction_failed":
+                await db.set_case_law_halacha_status(cid, "failed")
             await db.clear_extraction_request(cid, kind=kind)
             processed += 1
             results.append({
@@ -309,6 +361,7 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
                 "status": result.get("status", "unknown"),
                 "fields": result.get("fields", []),
                 "stored": result.get("stored", 0),
+                "retry_attempts": attempts,
             })
         except Exception as e:
             logger.exception("process_pending_extractions failed for %s: %s", cid, e)
@@ -317,6 +370,7 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
                 "case_number": row.get("case_number", ""),
                 "status": "failed",
                 "error": str(e),
+                "retry_attempts": attempts,
             })
             # Don't clear the request — it stays for the next run.