From f256eddbb1c66dd24965c1ad9a87ac87294063ac Mon Sep 17 00:00:00 2001 From: Chaim Date: Thu, 30 Apr 2026 18:27:36 +0000 Subject: [PATCH] git_sync: full case-dir backup to Gitea (sweep + explicit commits) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The case repo is the user's backup, so anything in the dir must end up on Gitea. Two layers: 1. Periodic sweep (every 30s) — git_sync.sweep_loop runs as a FastAPI background task. It scans every case dir, runs git status --porcelain on each, and commit_and_push's any dirty changes with an auto-built Hebrew message ("אוטו: טיוטות (2) · מסמכים"). Catches files written outside the API path: agent research artefacts, manual edits, etc. 2. Explicit commits at known write paths — DOCX export, interim draft, apply_user_edit, revise_draft, mark-final, analysis DOCX export. These give immediate feedback with descriptive messages instead of waiting up to 30s for the sweep. safe.directory injection added to _git_env so sweep + explicit commits work even when the running uid differs from the case-dir owner (host runs vs. uniform-root container). Co-Authored-By: Claude Opus 4.7 (1M context) --- mcp-server/src/legal_mcp/services/git_sync.py | 122 +++++++++++++++++- mcp-server/src/legal_mcp/tools/drafting.py | 17 ++- web/app.py | 23 +++- 3 files changed, 154 insertions(+), 8 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/git_sync.py b/mcp-server/src/legal_mcp/services/git_sync.py index 4afaeef..5040bd0 100644 --- a/mcp-server/src/legal_mcp/services/git_sync.py +++ b/mcp-server/src/legal_mcp/services/git_sync.py @@ -6,15 +6,23 @@ rotated in Infisical, repos created with the old token will fail to push silently — only logged at WARNING level. ``commit_and_push`` re-injects the *current* token into the existing origin URL on every call, so push survives token rotation. + +This module also runs a periodic ``sweep_loop`` that catches files +written outside the API path (most importantly: agents writing research +artefacts directly to the case dir). The full case repo is the user's +backup, so anything in the dir must end up on Gitea. """ from __future__ import annotations +import asyncio import logging import os import subprocess from pathlib import Path +from legal_mcp import config + logger = logging.getLogger(__name__) @@ -22,8 +30,8 @@ def _gitea_token() -> str: return os.environ.get("GITEA_ACCESS_TOKEN") or os.environ.get("GITEA_TOKEN", "") -def _git_env() -> dict: - return { +def _git_env(case_dir: str | Path | None = None) -> dict: + env = { "GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local", "GIT_COMMITTER_NAME": "Ezer Mishpati", @@ -31,6 +39,13 @@ def _git_env() -> dict: "PATH": os.environ.get("PATH", "/usr/bin:/bin"), "GIT_TERMINAL_PROMPT": "0", } + if case_dir is not None: + # Trust the case dir even when the running uid differs from the + # owner (prod container is uniform-root, but host runs may not be). + env["GIT_CONFIG_COUNT"] = "1" + env["GIT_CONFIG_KEY_0"] = "safe.directory" + env["GIT_CONFIG_VALUE_0"] = str(case_dir) + return env def _refresh_remote_url(case_dir: Path, env: dict) -> bool: @@ -68,7 +83,7 @@ def commit_and_push(case_dir: str | Path, message: str) -> bool: if not (case_dir / ".git").exists(): return False - env = _git_env() + env = _git_env(case_dir) subprocess.run(["git", "add", "."], cwd=case_dir, capture_output=True, env=env) commit = subprocess.run( @@ -90,3 +105,104 @@ def commit_and_push(case_dir: str | Path, message: str) -> bool: logger.warning("Git push failed in %s: %s", case_dir, push.stderr) return False return True + + +# ── Periodic sweep ──────────────────────────────────────────────── +# +# The user's expectation is that "anything I or an agent puts into a case +# dir ends up on Gitea". Explicit commit_and_push calls cover the API +# write paths, but agents write research/draft files directly to disk. +# A short periodic sweep is the safety net. + +_SWEEP_INTERVAL_SEC = 30 + + +def _porcelain_changes(case_dir: Path, env: dict) -> list[str]: + """Return list of `git status --porcelain` lines, or [] if clean/error.""" + res = subprocess.run( + ["git", "status", "--porcelain"], + cwd=case_dir, capture_output=True, text=True, env=env, + ) + if res.returncode != 0: + return [] + return [ln for ln in res.stdout.splitlines() if ln.strip()] + + +def _auto_message(changes: list[str]) -> str: + """Build a Hebrew commit message from porcelain output. + + Groups by top-level subdir under the case dir so a sweep that picks up + one DOCX export plus one research file produces a useful summary + instead of "auto-sync". + """ + groups: dict[str, int] = {} + sample: dict[str, str] = {} + for line in changes: + path = line[3:].strip().strip('"') + if "->" in path: # rename + path = path.split("->", 1)[1].strip().strip('"') + first = path.split("/", 1)[0] + groups[first] = groups.get(first, 0) + 1 + sample.setdefault(first, path) + + label_map = { + "documents": "מסמכים", + "drafts": "טיוטות", + "exports": "גרסאות", + "case.json": "מטא", + "notes.md": "הערות", + } + parts: list[str] = [] + for top, count in groups.items(): + label = label_map.get(top, top) + parts.append(f"{label} ({count})" if count > 1 else label) + summary = " · ".join(parts) or "שינויים" + return f"אוטו: {summary}" + + +def sweep_once() -> dict: + """Walk every case dir and commit+push any dirty changes. + + Synchronous (subprocess-based) but cheap — `git status --porcelain` on + a clean dir is a sub-millisecond operation. Returns a small report + suitable for logging. + """ + base: Path = config.CASES_DIR + if not base.exists(): + return {"checked": 0, "synced": 0, "errors": 0} + + checked = synced = errors = 0 + for case_dir in base.iterdir(): + if not case_dir.is_dir() or not (case_dir / ".git").exists(): + continue + checked += 1 + changes = _porcelain_changes(case_dir, _git_env(case_dir)) + if not changes: + continue + msg = _auto_message(changes) + ok = commit_and_push(case_dir, msg) + if ok: + synced += 1 + logger.info("auto-sync committed %d change(s) in %s", len(changes), case_dir.name) + else: + errors += 1 + return {"checked": checked, "synced": synced, "errors": errors} + + +async def sweep_loop(interval_sec: int = _SWEEP_INTERVAL_SEC) -> None: + """Background task: run sweep_once forever every interval_sec. + + Cancellation-safe; logs and continues on transient errors. + """ + logger.info("git_sync.sweep_loop started (interval=%ds)", interval_sec) + while True: + try: + await asyncio.sleep(interval_sec) + # Run the sync subprocess work in a thread to avoid blocking + # the FastAPI event loop. + await asyncio.to_thread(sweep_once) + except asyncio.CancelledError: + logger.info("git_sync.sweep_loop cancelled") + raise + except Exception as exc: + logger.warning("git_sync sweep iteration failed: %s", exc) diff --git a/mcp-server/src/legal_mcp/tools/drafting.py b/mcp-server/src/legal_mcp/tools/drafting.py index eb955f2..114f716 100644 --- a/mcp-server/src/legal_mcp/tools/drafting.py +++ b/mcp-server/src/legal_mcp/tools/drafting.py @@ -7,7 +7,7 @@ from pathlib import Path from uuid import UUID from legal_mcp import config -from legal_mcp.services import db, embeddings, research_md +from legal_mcp.services import db, embeddings, git_sync, research_md from legal_mcp.services.lessons import ( CITATION_GUIDANCE, DECISION_TEMPLATES, @@ -403,6 +403,9 @@ async def export_docx(case_number: str, output_path: str = "") -> str: path = await docx_exporter.export_decision(case_id, output_path or None) # Register this export as the new source of truth await db.set_active_draft_path(case_id, path) + case_dir = config.find_case_dir(case_number) + if case_dir.exists(): + git_sync.commit_and_push(case_dir, f"ייצוא DOCX: {Path(path).name}") return json.dumps({ "status": "completed", "path": path, @@ -528,6 +531,9 @@ async def export_interim_draft(case_number: str, output_path: str = "") -> str: case_id, output_path or None, mode="interim", ) await db.set_active_draft_path(case_id, path) + case_dir = config.find_case_dir(case_number) + if case_dir.exists(): + git_sync.commit_and_push(case_dir, f"טיוטת ביניים: {Path(path).name}") return json.dumps({ "status": "completed", "mode": "interim", @@ -571,6 +577,9 @@ async def apply_user_edit(case_number: str, edit_filename: str) -> str: try: retrofit_result = docx_retrofit.retrofit_bookmarks(edit_path) await db.set_active_draft_path(case_id, str(edit_path)) + case_dir = config.find_case_dir(case_number) + if case_dir.exists(): + git_sync.commit_and_push(case_dir, f"גרסת עריכה: {edit_path.name}") return json.dumps({ "status": "completed", "active_draft_path": str(edit_path), @@ -681,6 +690,12 @@ async def revise_draft(case_number: str, revisions_json: str, active_path, output_path, revisions, author=author, ) await db.set_active_draft_path(case_id, str(output_path)) + case_dir = config.find_case_dir(case_number) + if case_dir.exists(): + git_sync.commit_and_push( + case_dir, + f"revise: טיוטה-v{next_ver} ({result.applied} שינויים, {result.failed} נכשלו)", + ) return json.dumps({ "status": "completed", "output_path": str(output_path), diff --git a/web/app.py b/web/app.py index 2cc83bf..3ef78b0 100644 --- a/web/app.py +++ b/web/app.py @@ -28,7 +28,7 @@ from pydantic import BaseModel import asyncpg from legal_mcp import config -from legal_mcp.services import chunker, db, embeddings, extractor, processor, proofreader, research_md +from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, processor, proofreader, research_md from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools, precedents as precedents_tools # Import integration clients (same directory) @@ -69,9 +69,17 @@ _progress = ProgressStore(config.REDIS_URL, ttl_seconds=PROGRESS_TTL_SECONDS) async def lifespan(app: FastAPI): UPLOAD_DIR.mkdir(parents=True, exist_ok=True) await db.init_schema() - yield - await db.close_pool() - await _progress.close() + sync_task = asyncio.create_task(git_sync.sweep_loop()) + try: + yield + finally: + sync_task.cancel() + try: + await sync_task + except asyncio.CancelledError: + pass + await db.close_pool() + await _progress.close() app = FastAPI(title="העלאת מסמכים משפטיים", lifespan=lifespan) @@ -1832,6 +1840,9 @@ async def api_research_analysis_export_docx(case_number: str): except Exception as e: logger.exception("Failed to export analysis DOCX for %s", case_number) raise HTTPException(500, f"שגיאה בייצוא: {e}") + case_dir = config.find_case_dir(case_number) + if case_dir.exists(): + commit_and_push(case_dir, f"ניתוח משפטי: {path.name}") return FileResponse( path, media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -2260,6 +2271,10 @@ async def api_mark_final(case_number: str, filename: str): UUID(case["id"]), ) + case_dir = config.find_case_dir(case_number) + if case_dir.exists(): + commit_and_push(case_dir, f"גרסה סופית: {final_name}") + return { "final_filename": final_name, "training_copy": str(training_dest),