feat(reindex): reindex_case_law from stored text + mark_indexed on ingest (GAP-09, FU-3)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -182,6 +182,7 @@ async def ingest_document(
|
||||
|
||||
try:
|
||||
stored_chunks = await _chunk_embed_store(case_law_id, raw_text, page_offsets, page_count, progress)
|
||||
await db.mark_indexed(case_law_id)
|
||||
|
||||
# Step 9: multimodal — uniform: flag + PDF + page_count, NOT intake type.
|
||||
if (config.MULTIMODAL_ENABLED and page_count > 0
|
||||
@@ -256,3 +257,27 @@ async def _chunk_embed_store(case_law_id, text, page_offsets, page_count, progre
|
||||
for c, v in zip(chunks, chunk_vectors)
|
||||
]
|
||||
return await db.store_precedent_chunks(case_law_id, chunk_dicts)
|
||||
|
||||
|
||||
async def reindex_case_law(
    case_law_id: "UUID | str",
    progress: ProgressCb | None = None,
) -> dict:
    """Rebuild chunks and embeddings for an existing case_law row (GAP-09).

    The row's STORED ``full_text`` is handed straight to
    ``_chunk_embed_store``; this function does no re-extraction and no
    re-OCR of its own (see feedback_no_reocr_retrofit). Per the original
    author's notes, the pipeline involves no LLM/CLI step (only the chunker
    and voyage embeddings), so it is safe to run anywhere, and it is
    idempotent because store_precedent_chunks(_hierarchical) is
    DELETE-then-INSERT.

    Args:
        case_law_id: Identifier of the row, either a ``UUID`` or anything
            whose ``str()`` form parses as one.
        progress: Optional awaitable progress callback; when omitted (or
            falsy) ``_noop_progress`` is used instead.

    Returns:
        A dict with the keys ``status``, ``case_law_id`` (as a string),
        ``chunks`` (whatever ``_chunk_embed_store`` returned) and
        ``reindexed``.

    Raises:
        ValueError: If the id cannot be parsed as a UUID, if no row exists
            for it, or if the row has no non-blank ``full_text``.
    """
    # Normalise the identifier first so every later call sees a real UUID.
    if isinstance(case_law_id, UUID):
        cid = case_law_id
    else:
        cid = UUID(str(case_law_id))

    report = progress or _noop_progress

    record = await db.get_case_law(cid)
    if not record:
        raise ValueError(f"case_law not found: {cid}")

    # A missing, None or whitespace-only full_text all count as "nothing to index".
    raw_text = record.get("full_text")
    body = raw_text.strip() if raw_text else ""
    if not body:
        raise ValueError("case_law has no stored full_text to re-index")

    # No page information is available for stored text: pass no offsets and
    # a page count of zero.
    stored_chunks = await _chunk_embed_store(cid, body, None, 0, report)
    await db.mark_indexed(cid)

    # User-facing completion message (Hebrew), emitted only after the row
    # has been marked as indexed.
    await report("completed", 100, f"הוטמע מחדש: {stored_chunks} chunks")

    return {
        "status": "completed",
        "case_law_id": str(cid),
        "chunks": stored_chunks,
        "reindexed": True,
    }
|
||||
|
||||
Reference in New Issue
Block a user