From e96e3bc77b26b81aa80d4018b0bc121403cd50ec Mon Sep 17 00:00:00 2001
From: Eetu Purontaus <eetu.purontaus@gmail.com>
Date: Tue, 2 Sep 2025 15:34:46 +0300
Subject: [PATCH] Escape user input going to XMP metadata (#80)

* Autodetect minimum and basic-wl profiles when adding xml to pdf

* Test attaching to and reading back from pdf

Demonstrates the issue with XMP metadata when the Factur-X payload
contains characters (such as ampersands) that should be escaped

* Escape user metadata before constructing XMP
---
 drafthorse/pdf.py       | 19 +++++++++++++------
 tests/test_roundtrip.py | 16 ++++++++++++++++
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/drafthorse/pdf.py b/drafthorse/pdf.py
index 421b9cf..a5bc124 100644
--- a/drafthorse/pdf.py
+++ b/drafthorse/pdf.py
@@ -37,6 +37,7 @@ from pypdf.generic import (
     NumberObject,
     create_string_object,
 )
+from xml.sax.saxutils import escape as xml_escape
 
 from drafthorse.xmp_schema import XMP_SCHEMA
 
@@ -155,12 +156,18 @@ def _prepare_xmp_metadata(profile, pdf_metadata):
     :param pdf_metadata: PDF metadata
     :return: metadata XML
     """
+    # Input metadata is embedded as text inside XML nodes of the XMP metadata, so it must be XML-escaped.
+    # All values _should_ be strings, but nothing asserts that, so convert with str() before escaping.
+    escaped_metadata = {
+        key: xml_escape(str(value)) for key, value in pdf_metadata.items()
+    }
+
     xml_str = XMP_SCHEMA.format(
-        title=pdf_metadata.get("title", ""),
-        author=pdf_metadata.get("author", ""),
-        subject=pdf_metadata.get("subject", ""),
-        producer=pdf_metadata.get("producer", "pypdf"),
-        creator_tool=pdf_metadata.get("creator", "python-drafthorse"),
+        title=escaped_metadata.get("title", ""),
+        author=escaped_metadata.get("author", ""),
+        subject=escaped_metadata.get("subject", ""),
+        producer=escaped_metadata.get("producer", "pypdf"),
+        creator_tool=escaped_metadata.get("creator", "python-drafthorse"),
         timestamp=datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00"),
         urn="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#",
         documenttype="INVOICE",
@@ -340,7 +347,7 @@ def _extract_xml_info(xml_data, level=None, metadata=None):
     if level is None:
         # autodetection of Factur-X profile
         profile = doc_id.split(":")[-1]
-        if doc_id.split(":")[-1] in ["basic", "extended"]:
+        if doc_id.split(":")[-1] in ["minimum", "basic", "basicwl", "extended"]:
             profile = doc_id.split(":")[-1]
         elif doc_id.split(":")[-1].startswith("xrechnung"):
             profile = "xrechnung"
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
index 2b98e13..d745cca 100644
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@@ -2,9 +2,12 @@ import lxml.etree
 import os
 import pytest
 from difflib import unified_diff
+from io import BytesIO
+from pypdf import PdfReader
 from xml.dom import minidom
 
 from drafthorse.models.document import Document
+from drafthorse.pdf import attach_xml
 from drafthorse.utils import validate_xml
 
 samples = [
@@ -44,6 +47,19 @@ def test_sample_roundtrip(filename):
     # Validate that the sample file is valid, otherwise the test is moot
     validate_xml(xmlout=origxml, schema=schema)
 
+    # Attach the XML to an empty PDF doc
+    with open(
+        os.path.join(os.path.dirname(__file__), "samples", "Empty.pdf"), "rb"
+    ) as f:
+        original_pdf_bytes = f.read()
+
+    created_pdf_bytes = attach_xml(original_pdf_bytes, origxml)
+
+    # Read back the PDF. We don't support extensive parsing, but this way we can assert that metadata is at least present
+    # and syntactically valid.
+    pdf_reader = PdfReader(BytesIO(created_pdf_bytes))
+    assert pdf_reader.xmp_metadata
+
     # Parse the sample file into our internal python structure
     doc = Document.parse(origxml)
 
-- 
2.47.3