FU-3: re-index on content change (GAP-09) #14

Merged
chaim merged 11 commits from fix/fu3-reindex-on-change into main 2026-05-30 22:13:54 +00:00
Showing only changes of commit c7c7a1e119 - Show all commits

View File

@@ -182,6 +182,7 @@ async def ingest_document(
try:
stored_chunks = await _chunk_embed_store(case_law_id, raw_text, page_offsets, page_count, progress)
await db.mark_indexed(case_law_id)
# Step 9: multimodal — uniform: flag + PDF + page_count, NOT intake type.
if (config.MULTIMODAL_ENABLED and page_count > 0
@@ -256,3 +257,27 @@ async def _chunk_embed_store(case_law_id, text, page_offsets, page_count, progre
for c, v in zip(chunks, chunk_vectors)
]
return await db.store_precedent_chunks(case_law_id, chunk_dicts)
async def reindex_case_law(
    case_law_id: "UUID | str",
    progress: ProgressCb | None = None,
) -> dict:
    """Rebuild chunks and embeddings for an existing case_law row (GAP-09).

    The only input is the ``full_text`` already stored on the row: the
    document is not re-extracted or re-OCRed (see
    feedback_no_reocr_retrofit), and no LLM/CLI is involved -- just the
    chunker and the voyage embeddings -- so this can be run anywhere.
    Repeated calls are idempotent because
    store_precedent_chunks(_hierarchical) is DELETE-then-INSERT.

    Args:
        case_law_id: Row identifier, either a ``UUID`` or a value whose
            ``str()`` form parses as one.
        progress: Optional awaitable progress callback, called as
            ``progress(stage, percent, message)``; a no-op is used when
            omitted.

    Returns:
        A dict with ``status``, ``case_law_id`` (as a string), ``chunks``
        (the result of the chunk/embed/store step) and ``reindexed``.

    Raises:
        ValueError: If the id is not a valid UUID, the row does not exist,
            or the row has no non-blank stored ``full_text``.
    """
    notify = progress if progress else _noop_progress

    # Accept a ready-made UUID as-is; coerce anything else through str().
    if isinstance(case_law_id, UUID):
        cid = case_law_id
    else:
        cid = UUID(str(case_law_id))

    record = await db.get_case_law(cid)
    if not record:
        raise ValueError(f"case_law not found: {cid}")

    # A missing/None full_text and a whitespace-only one are treated alike.
    stored_text = record.get("full_text") or ""
    body = stored_text.strip()
    if not body:
        raise ValueError("case_law has no stored full_text to re-index")

    # This path passes no page data: page_offsets=None and page_count=0.
    stored = await _chunk_embed_store(cid, body, None, 0, notify)
    await db.mark_indexed(cid)
    await notify("completed", 100, f"הוטמע מחדש: {stored} chunks")

    return {
        "status": "completed",
        "case_law_id": str(cid),
        "chunks": stored,
        "reindexed": True,
    }