# legal-ai/scripts/convert_decision_template.py

"""Convert דפנה's decision .dotx template to a loadable .docx file.

python-docx cannot open .dotx files directly (content type is
`...template.main+xml` rather than `...document.main+xml`). This script
produces a sibling .docx by rewriting [Content_Types].xml and dropping
the `word/glossary/` part (which is template-specific and can interfere
with plain Document() loading).

The output preserves every style definition, numbering, fonts, and
section properties — the only things we want from the template.

Run once (or whenever the source .dotx changes):

    python scripts/convert_decision_template.py

Input:  data/training/טיוטת החלטה.dotx
Output: skills/docx/decision_template.docx
"""

from __future__ import annotations

import re
import sys
import zipfile
from pathlib import Path

# Directory containing this script, and the repository root one level above it.
_SCRIPTS_DIR = Path(__file__).resolve().parent
REPO_ROOT = _SCRIPTS_DIR.parent

# Input template (.dotx) and the converted output (.docx), both inside the repo.
SRC = REPO_ROOT.joinpath("data", "training", "טיוטת החלטה.dotx")
DST = REPO_ROOT.joinpath("skills", "docx", "decision_template.docx")

# The template and document content types differ only in one word, so both
# are derived from a single pattern.
_MAIN_PART_CONTENT_TYPE = (
    "application/vnd.openxmlformats-officedocument."
    "wordprocessingml.{kind}.main+xml"
)
TEMPLATE_CONTENT_TYPE = _MAIN_PART_CONTENT_TYPE.format(kind="template")
DOCUMENT_CONTENT_TYPE = _MAIN_PART_CONTENT_TYPE.format(kind="document")


def _retarget_content_types(xml: str) -> str:
    """Return ``[Content_Types].xml`` text declaring a document, without glossary overrides."""
    xml = xml.replace(TEMPLATE_CONTENT_TYPE, DOCUMENT_CONTENT_TYPE)
    # Look for PartName anywhere inside the tag: XML attribute order is not
    # significant, so PartName is not guaranteed to be the first attribute.
    return re.sub(
        r'<Override\b[^>]*?\sPartName="/word/glossary/[^"]*"[^>]*?/>',
        "",
        xml,
    )


def _drop_glossary_relationship(xml: str) -> str:
    """Return the document relationships XML without the glossaryDocument entry."""
    return re.sub(
        r'<Relationship\s+[^>]*?glossaryDocument[^>]*?/>',
        "",
        xml,
    )


def convert(src: Path, dst: Path) -> None:
    """Write a .docx copy of the .dotx template at *src* to *dst*.

    Every archive member is copied except those under ``word/glossary/``.
    ``[Content_Types].xml`` is rewritten so the main part is declared as a
    document rather than a template and no longer lists glossary parts, and
    ``word/_rels/document.xml.rels`` loses its glossaryDocument relationship
    (its target is removed, so the reference would dangle).

    The archive is built under a temporary sibling name and renamed onto
    *dst* only once complete, so a failure part-way through never leaves a
    truncated file at *dst* or overwrites an earlier good output.

    Args:
        src: Path of the source .dotx template.
        dst: Path of the .docx to write; parent directories are created.

    Raises:
        FileNotFoundError: If *src* does not exist.
    """
    if not src.exists():
        raise FileNotFoundError(f"Template not found: {src}")
    dst.parent.mkdir(parents=True, exist_ok=True)

    tmp = dst.with_name(dst.name + ".tmp")
    try:
        with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
            tmp, "w", zipfile.ZIP_DEFLATED
        ) as zout:
            for name in zin.namelist():
                # Drop glossary part — template-only, confuses Document()
                if name.startswith("word/glossary/"):
                    continue
                data = zin.read(name)
                if name == "[Content_Types].xml":
                    text = _retarget_content_types(data.decode("utf-8"))
                    data = text.encode("utf-8")
                elif name == "word/_rels/document.xml.rels":
                    text = _drop_glossary_relationship(data.decode("utf-8"))
                    data = text.encode("utf-8")
                zout.writestr(name, data)
        # Both archives are closed here, so the rename is safe on every OS.
        tmp.replace(dst)
    finally:
        # After a successful rename the temporary name is gone; it only
        # still exists when something above failed.
        if tmp.exists():
            tmp.unlink()


def verify(dst: Path) -> None:
    """Open *dst* with python-docx and report whether the key styles exist.

    Prints an OK line listing the styles to stdout when all of them are
    present; otherwise prints a WARN line naming the missing ones to stderr.
    """
    from docx import Document

    document = Document(str(dst))
    key_styles = {"Normal", "Heading 2", "Quote", "List Paragraph", "Title"}
    present = key_styles.intersection(style.name for style in document.styles)
    absent = key_styles - present
    if not absent:
        print(f"OK — all key styles present: {sorted(present)}")
        return
    print(f"WARN: missing styles: {absent}", file=sys.stderr)


def main() -> None:
    """Convert SRC to DST, report the output size, then verify the result."""
    print(f"Source: {SRC}")
    print(f"Dest:   {DST}")
    convert(SRC, DST)
    written = DST.stat().st_size
    print(f"Wrote {written:,} bytes")
    verify(DST)


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()