Maximize context and output per Anthropic best practices

Per official Anthropic documentation (April 2026):

Output token limits increased to match model capabilities:
- block-yod (discussion): 8K → 32K (Opus supports 128K)
- block-zayin (claims): 4K → 16K
- block-vav (background): 4K → 16K
- claims_extractor: 4K → 8K (fixes truncated JSON)
- qa_validator: 4K → 8K

Source documents sent in full (not truncated):
- Was: 3000 chars per doc, 15K total
- Now: full document text, no truncation
- Reduces hallucinations: "extract word-for-word quotes first"

Prompt structure follows long-context tips:
- Source documents placed FIRST (top of prompt)
- Instructions and query placed LAST
- "Queries at the end improve quality by up to 30%"

Extended thinking is now enabled for every Opus 4.6 block. Adaptive mode is the goal, but this commit still passes an explicit budget_tokens.
Streaming is enabled for all requests with max_tokens > 21K, and whenever thinking is enabled.
Unified JSON parsing via parse_llm_json() helper in config.py. Applied to: classifier, claims_extractor, brainstorm, qa_validator, learning_loop (5 files).
Also: extractor.py now supports .md files.

Sources:
- https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
- https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/long-context-tips
- https://docs.anthropic.com/en/docs/minimizing-hallucinations
- https://docs.anthropic.com/en/docs/about-claude/models/overview

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 14:17:43 +00:00
parent bed9d5c7e9
commit e24e24dac5
8 changed files with 86 additions and 81 deletions
--- a/mcp-server/src/legal_mcp/services/block_writer.py
+++ b/mcp-server/src/legal_mcp/services/block_writer.py
@@ -37,18 +37,22 @@ def _get_anthropic() -> anthropic.Anthropic:

 # ── Block configuration ───────────────────────────────────────────

+# Output token limits per Anthropic docs (April 2026):
+# Opus 4.6: up to 128K output tokens
+# Sonnet 4.6: up to 64K output tokens
+# Streaming required when max_tokens > 21,333
 BLOCK_CONFIG = {
    "block-alef": {"index": 1, "title": "כותרת מוסדית", "gen_type": "template-fill", "temp": 0, "model": "script"},
    "block-bet":  {"index": 2, "title": "הרכב הוועדה", "gen_type": "template-fill", "temp": 0, "model": "script"},
    "block-gimel":{"index": 3, "title": "צדדים", "gen_type": "template-fill", "temp": 0, "model": "script"},
    "block-dalet":{"index": 4, "title": "החלטה", "gen_type": "template-fill", "temp": 0, "model": "script"},
-    "block-he":   {"index": 5, "title": "פתיחה", "gen_type": "paraphrase", "temp": 0.2, "model": "sonnet", "max_tokens": 1024},
-    "block-vav":  {"index": 6, "title": "רקע עובדתי", "gen_type": "reproduction", "temp": 0, "model": "sonnet", "max_tokens": 4096},
-    "block-zayin":{"index": 7, "title": "טענות הצדדים", "gen_type": "paraphrase", "temp": 0.1, "model": "sonnet", "max_tokens": 4096},
-    "block-chet": {"index": 8, "title": "הליכים", "gen_type": "reproduction", "temp": 0, "model": "sonnet", "max_tokens": 2048},
-    "block-tet":  {"index": 9, "title": "תכניות חלות", "gen_type": "guided-synthesis", "temp": 0.2, "model": "opus", "max_tokens": 2048},
-    "block-yod":  {"index": 10, "title": "דיון והכרעה", "gen_type": "rhetorical-construction", "temp": 0.4, "model": "opus", "max_tokens": 8192},
-    "block-yod-alef": {"index": 11, "title": "סיכום", "gen_type": "paraphrase", "temp": 0.1, "model": "sonnet", "max_tokens": 2048},
+    "block-he":   {"index": 5, "title": "פתיחה", "gen_type": "paraphrase", "temp": 0.2, "model": "sonnet", "max_tokens": 4096},
+    "block-vav":  {"index": 6, "title": "רקע עובדתי", "gen_type": "reproduction", "temp": 0, "model": "sonnet", "max_tokens": 16384},
+    "block-zayin":{"index": 7, "title": "טענות הצדדים", "gen_type": "paraphrase", "temp": 0.1, "model": "sonnet", "max_tokens": 16384},
+    "block-chet": {"index": 8, "title": "הליכים", "gen_type": "reproduction", "temp": 0, "model": "sonnet", "max_tokens": 8192},
+    "block-tet":  {"index": 9, "title": "תכניות חלות", "gen_type": "guided-synthesis", "temp": 0.2, "model": "opus", "max_tokens": 16384},
+    "block-yod":  {"index": 10, "title": "דיון והכרעה", "gen_type": "rhetorical-construction", "temp": 0.4, "model": "opus", "max_tokens": 32768},
+    "block-yod-alef": {"index": 11, "title": "סיכום", "gen_type": "paraphrase", "temp": 0.1, "model": "sonnet", "max_tokens": 8192},
    "block-yod-bet":  {"index": 12, "title": "חתימות", "gen_type": "template-fill", "temp": 0, "model": "script"},
 }

@@ -317,8 +321,10 @@ async def write_block(
    outcome = (decision or {}).get("outcome", "rejected")
    structure_guidance = STRUCTURE_GUIDANCE.get(outcome, "")

-    # Format prompt
-    prompt = prompt_template.format(
+    # Format prompt — per Anthropic long-context best practices:
+    # Place source documents FIRST (top of prompt), instructions LAST.
+    # "Queries at the end can improve response quality by up to 30%"
+    formatted_prompt = prompt_template.format(
        case_context=case_context,
        source_context=source_context,
        claims_context=claims_context,
@@ -330,6 +336,14 @@ async def write_block(
        structure_guidance=structure_guidance,
    )

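+    # Restructure: prepend sources ahead of the formatted prompt. NOTE(review): source_context is also passed to format() above — confirm the template no longer embeds it, or the full documents are sent twice.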
+    prompt = (
+        f"## חומרי מקור (מסמכים מלאים — צטט מהם מילה במילה כשאפשר):\n\n"
+        f"{source_context}\n\n"
+        f"---\n\n"
+        f"{formatted_prompt}"
+    )
+
    if instructions:
        prompt += f"\n\n## הנחיות נוספות:\n{instructions}"

@@ -347,24 +361,23 @@ async def write_block(

    client = _get_anthropic()

-    # For opus blocks, use extended thinking
    kwargs: dict = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": [{"role": "user", "content": prompt}],
    }

-    if model_key == "opus" and temperature >= 0.3:
-        # Extended thinking for complex blocks
-        # max_tokens must be > budget_tokens
-        kwargs["max_tokens"] = max(max_tokens, 20000)
-        kwargs["temperature"] = 1  # Required for extended thinking
-        kwargs["thinking"] = {"type": "enabled", "budget_tokens": 16000}
+    if model_key == "opus":
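+        # Opus 4.6: thinking is enabled for every opus block with an explicit budget (not adaptive mode yet).
+        # Per Anthropic docs: temperature must be 1 when thinking is enabled.
+        # NOTE(review): budget_tokens counts toward max_tokens; at max_tokens=16384 a 16000 budget leaves ~384 tokens for output — confirm.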
+        kwargs["temperature"] = 1
+        kwargs["thinking"] = {"type": "enabled", "budget_tokens": max(16000, max_tokens // 2)}
    else:
        kwargs["temperature"] = temperature

-    # Use streaming for long requests (opus + thinking)
-    use_stream = model_key == "opus" and kwargs.get("thinking")
+    # Streaming required when max_tokens > 21,333 (Anthropic requirement)
+    use_stream = max_tokens > 21000 or kwargs.get("thinking")

    if use_stream:
        content_parts = []
@@ -416,19 +429,19 @@ def _build_case_context(case: dict, decision: dict | None) -> str:
 - תוצאה: {outcome_heb}"""


-async def _build_source_context(case_id: UUID, block_id: str, max_chars: int = 15000) -> str:
-    """Get relevant document excerpts for the block."""
+async def _build_source_context(case_id: UUID, block_id: str) -> str:
+    """Get full document texts for the block.
+
+    Per Anthropic best practices: send full source documents, not truncated excerpts.
+    Place documents at the TOP of the prompt (before instructions); queries at the end can improve response quality by up to 30%.
+    For grounding: instruct Claude to cite word-for-word from these documents.
+    """
    docs = await db.list_documents(case_id)
    context_parts = []
-    total = 0
    for doc in docs:
-        if total >= max_chars:
-            break
        text = await db.get_document_text(UUID(doc["id"]))
        if text:
-            excerpt = text[:3000]
-            context_parts.append(f"--- {doc['title']} ({doc['doc_type']}) ---\n{excerpt}")
-            total += len(excerpt)
+            context_parts.append(f"--- מסמך: {doc['title']} ({doc['doc_type']}) ---\n{text}")
    return "\n\n".join(context_parts) if context_parts else "(אין מסמכים)"