Updates accumulated from prior sessions: - HEARTBEAT: company-based filtering (CMP/CMPA) rules - legal-qa, legal-researcher: routine updates - analysis_docx_exporter: new service for analysis DOCX export - compose page: "הורד כ-DOCX" button for analysis - decision_template.docx: template for exporter Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
103 lines
3.5 KiB
Python
103 lines
3.5 KiB
Python
"""Convert דפנה's decision .dotx template to a loadable .docx file.
|
|
|
|
python-docx cannot open .dotx files directly (content type is
|
|
`...template.main+xml` rather than `...document.main+xml`). This script
|
|
produces a sibling .docx by rewriting [Content_Types].xml and dropping
|
|
the `word/glossary/` part (which is template-specific and can interfere
|
|
with plain Document() loading).
|
|
|
|
The output preserves every style definition, numbering, fonts, and
|
|
section properties — the only things we want from the template.
|
|
|
|
Run once (or whenever the source .dotx changes):
|
|
|
|
python scripts/convert_decision_template.py
|
|
|
|
Input: data/training/טיוטת החלטה.dotx
|
|
Output: skills/docx/decision_template.docx
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
# Two directory levels above this file; SRC and DST are resolved against it.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Source Word template (.dotx) and the converted document (.docx) it yields.
SRC = REPO_ROOT.joinpath("data", "training", "טיוטת החלטה.dotx")
DST = REPO_ROOT.joinpath("skills", "docx", "decision_template.docx")

# Main-part content type found in the template's [Content_Types].xml ...
TEMPLATE_CONTENT_TYPE = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml"
    ".template.main+xml"
)
# ... and the content type it is rewritten to by convert().
DOCUMENT_CONTENT_TYPE = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml"
    ".document.main+xml"
)
|
|
|
|
|
|
def convert(src: Path, dst: Path) -> None:
    """Write a .docx copy of the .dotx template *src* to *dst*.

    Every archive member is copied except those under ``word/glossary/``.
    ``[Content_Types].xml`` and ``word/_rels/document.xml.rels`` are
    rewritten on the way through so they no longer declare the template
    content type or refer to the dropped glossary part.

    The archive is assembled in a temporary file next to *dst* and moved
    into place only after it has been written completely, so a failed run
    never replaces an existing good output with a partial archive.

    Args:
        src: Path of the source .dotx archive.
        dst: Path of the .docx to create; parent directories are created.

    Raises:
        FileNotFoundError: If *src* does not exist.

    Any error raised while reading or rewriting the archive (for example
    ``zipfile.BadZipFile`` or ``UnicodeDecodeError``) propagates unchanged.
    """
    if not src.exists():
        raise FileNotFoundError(f"Template not found: {src}")
    dst.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as dst, so the final replace() is a plain rename.
    tmp = dst.with_name(dst.name + ".tmp")
    try:
        with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
            tmp, "w", zipfile.ZIP_DEFLATED
        ) as zout:
            for name in zin.namelist():
                # Drop glossary part — template-only, confuses Document()
                if name.startswith("word/glossary/"):
                    continue
                data = zin.read(name)
                if name == "[Content_Types].xml":
                    data = _retarget_content_types(data)
                elif name == "word/_rels/document.xml.rels":
                    data = _drop_glossary_relationship(data)
                zout.writestr(name, data)
        tmp.replace(dst)
    except BaseException:
        # Leave no half-written temp file behind; dst itself was not touched.
        if tmp.exists():
            tmp.unlink()
        raise


def _retarget_content_types(xml: bytes) -> bytes:
    """Return [Content_Types].xml with the document type and no glossary overrides."""
    text = xml.decode("utf-8")
    text = text.replace(TEMPLATE_CONTENT_TYPE, DOCUMENT_CONTENT_TYPE)
    # Drop every <Override> that points at /word/glossary/...
    text = re.sub(
        r'<Override\s+PartName="/word/glossary/[^"]*"[^>]*?/>',
        "",
        text,
    )
    return text.encode("utf-8")


def _drop_glossary_relationship(xml: bytes) -> bytes:
    """Return document.xml.rels without the glossaryDocument relationship."""
    # The target part is being removed, so the reference would dangle.
    text = xml.decode("utf-8")
    text = re.sub(
        r'<Relationship\s+[^>]*?glossaryDocument[^>]*?/>',
        "",
        text,
    )
    return text.encode("utf-8")
|
|
|
|
|
|
def verify(dst: Path) -> None:
    """Open *dst* with python-docx and report whether the key styles exist.

    Prints an ``OK`` line listing the styles to stdout when all of them are
    present; otherwise prints a ``WARN`` line naming the missing ones to
    stderr. Nothing is returned or raised based on the outcome.
    """
    # Imported here rather than at module level, so python-docx is only
    # required when verify() actually runs.
    from docx import Document

    key_styles = {"Normal", "Heading 2", "Quote", "List Paragraph", "Title"}
    document = Document(str(dst))
    found = key_styles.intersection(style.name for style in document.styles)
    missing = key_styles - found
    if not missing:
        print(f"OK — all key styles present: {sorted(found)}")
        return
    print(f"WARN: missing styles: {missing}", file=sys.stderr)
|
|
|
|
|
|
def main() -> None:
    """Convert SRC to DST, report the output size, then verify the result."""
    for line in (f"Source: {SRC}", f"Dest: {DST}"):
        print(line)
    convert(SRC, DST)
    written = DST.stat().st_size
    print(f"Wrote {written:,} bytes")
    verify(DST)
|
|
|
|
|
|
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|