Per official Anthropic documentation (April 2026): Output tokens increased to match model capabilities: - block-yod (discussion): 8K → 32K (Opus supports 128K) - block-zayin (claims): 4K → 16K - block-vav (background): 4K → 16K - claims_extractor: 4K → 8K (fixes truncated JSON) - qa_validator: 4K → 8K Source documents sent in full (not truncated): - Was: 3000 chars per doc, 15K total - Now: full document text, no truncation - Reduces hallucinations: "extract word-for-word quotes first" Prompt structure follows long-context tips: - Source documents placed FIRST (top of prompt) - Instructions and query placed LAST - "Queries at the end improve quality by up to 30%" Extended thinking uses adaptive mode for Opus 4.6. Streaming enabled for all requests > 21K tokens. Unified JSON parsing via parse_llm_json() helper in config.py. Applied to: classifier, claims_extractor, brainstorm, qa_validator, learning_loop (5 files). Also: extractor.py now supports .md files. Sources: - https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking - https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/long-context-tips - https://docs.anthropic.com/en/docs/minimizing-hallucinations - https://docs.anthropic.com/en/docs/about-claude/models/overview Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
127 lines
4.0 KiB
Python
127 lines
4.0 KiB
Python
"""Text extraction from PDF, DOCX, and RTF files.
|
|
|
|
Primary PDF extraction: Claude Vision API (for scanned documents).
|
|
Fallback: PyMuPDF direct text extraction (for born-digital PDFs).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import anthropic
|
|
import fitz # PyMuPDF
|
|
from docx import Document as DocxDocument
|
|
from striprtf.striprtf import rtf_to_text
|
|
|
|
from legal_mcp import config
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared Anthropic client. Starts as None and is created on the first call to
# _get_anthropic(), which then reuses it for every later call.
_anthropic_client: anthropic.Anthropic | None = None
|
|
|
|
|
|
def _get_anthropic() -> anthropic.Anthropic:
    """Return the shared Anthropic client, building it on first use.

    The client is constructed with ``config.ANTHROPIC_API_KEY`` and stored in
    the module-level ``_anthropic_client`` so later calls reuse it.
    """
    global _anthropic_client

    # Fast path: a client was already created by an earlier call.
    if _anthropic_client is not None:
        return _anthropic_client

    _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    return _anthropic_client
|
|
|
|
|
|
async def extract_text(file_path: str) -> tuple[str, int]:
    """Extract the text of a document, choosing the reader by file suffix.

    The suffix is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.rtf`` are handed to their dedicated extractors; ``.txt`` and ``.md``
    are read directly as UTF-8.

    Args:
        file_path: Path to the document on disk.

    Returns:
        Tuple of (extracted_text, page_count).
        page_count is 0 for non-PDF files.

    Raises:
        ValueError: If the suffix is not one of the supported types.
    """
    source = Path(file_path)
    ext = source.suffix.lower()

    # PDFs are the only format that reports a real page count.
    if ext == ".pdf":
        return await _extract_pdf(source)

    if ext == ".docx":
        return _extract_docx(source), 0

    if ext == ".rtf":
        return _extract_rtf(source), 0

    if ext in (".txt", ".md"):
        return source.read_text(encoding="utf-8"), 0

    raise ValueError(f"Unsupported file type: {ext}")
|
|
|
|
|
|
async def _extract_pdf(path: Path) -> tuple[str, int]:
    """Extract text from a PDF, page by page.

    Each page is first read with PyMuPDF's direct text extraction. A page
    whose stripped text is 50 characters or shorter is treated as likely
    scanned: it is rendered to a PNG at 200 DPI and passed to
    ``_ocr_with_claude``.

    Args:
        path: Location of the PDF file.

    Returns:
        Tuple of (text, page_count), where text is the per-page text joined
        with blank lines, in page order.
    """
    doc = fitz.open(str(path))
    try:
        page_count = len(doc)
        pages_text: list[str] = []

        for page_num, page in enumerate(doc, start=1):
            # Try direct text extraction first
            text = page.get_text().strip()

            if len(text) > 50:
                # Sufficient text found - born-digital page
                pages_text.append(text)
                logger.debug(
                    "Page %d: direct text extraction (%d chars)", page_num, len(text)
                )
                continue

            # Likely scanned - use Claude Vision
            logger.info("Page %d: using Claude Vision OCR", page_num)
            pix = page.get_pixmap(dpi=200)
            img_bytes = pix.tobytes("png")
            ocr_text = await _ocr_with_claude(img_bytes, page_num)
            pages_text.append(ocr_text)
    finally:
        # Close the document even when rendering or OCR raises, so the
        # underlying file handle is not leaked.
        doc.close()

    return "\n\n".join(pages_text), page_count
|
|
|
|
|
|
async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
    """OCR a single page image using Claude Vision API.

    Args:
        image_bytes: Encoded page image; it is sent with media type
            ``image/png``.
        page_num: Page number, used only in log messages.

    Returns:
        The concatenated text blocks of Claude's response, or an empty string
        when the response contains no text block.
    """
    # Local import keeps this block self-contained; asyncio is standard library.
    import asyncio

    client = _get_anthropic()
    b64_image = base64.b64encode(image_bytes).decode("utf-8")

    user_content = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": b64_image,
            },
        },
        {
            "type": "text",
            "text": (
                "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
                "שמור על מבנה הפסקאות המקורי. "
                "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
            ),
        },
    ]

    # The client is synchronous. Running the request in a worker thread keeps
    # the event loop free while waiting on the network.
    message = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        messages=[{"role": "user", "content": user_content}],
    )

    if getattr(message, "stop_reason", None) == "max_tokens":
        logger.warning(
            "Page %d: OCR output reached max_tokens and may be truncated", page_num
        )

    # Collect every text block instead of assuming the first block is text.
    text_parts = [
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    ]
    if not text_parts:
        logger.warning("Page %d: Claude Vision returned no text content", page_num)

    return "".join(text_parts)
|
|
|
|
|
|
def _extract_docx(path: Path) -> str:
    """Return the text of a DOCX file's paragraphs.

    Paragraphs whose text is empty or whitespace-only are skipped; the rest
    are joined with blank lines.
    """
    document = DocxDocument(str(path))

    kept: list[str] = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)

    return "\n\n".join(kept)
|
|
|
|
|
|
def _extract_rtf(path: Path) -> str:
    """Return the plain text of an RTF file.

    The file is decoded as UTF-8, with undecodable bytes replaced rather than
    raising, and the markup is then converted by ``rtf_to_text``.
    """
    with path.open(encoding="utf-8", errors="replace") as handle:
        raw_rtf = handle.read()
    return rtf_to_text(raw_rtf)
|