]> git.ipfire.org Git - thirdparty/python-drafthorse.git/commitdiff
Escape user input going to xmp metadata (#80)
author Eetu Purontaus <eetu.purontaus@gmail.com>
Tue, 2 Sep 2025 12:34:46 +0000 (15:34 +0300)
committer GitHub <noreply@github.com>
Tue, 2 Sep 2025 12:34:46 +0000 (14:34 +0200)
* Autodetect minimum and basic-wl profiles when adding xml to pdf

* Test attaching to and reading back from pdf

Demonstrates an issue with the XMP metadata: characters in the
Factur-X payload (e.g. ampersands) need to be escaped

* Escape user metadata before constructing XMP

drafthorse/pdf.py
tests/test_roundtrip.py

index 421b9cf9c6aa8ec0ce26a2eddd3a2e2a33cbd89f..a5bc12413693c4aa1f8758451121d948cd0ecd36 100644 (file)
@@ -37,6 +37,7 @@ from pypdf.generic import (
     NumberObject,
     create_string_object,
 )
+from xml.sax.saxutils import escape as xml_escape
 
 from drafthorse.xmp_schema import XMP_SCHEMA
 
@@ -155,12 +156,18 @@ def _prepare_xmp_metadata(profile, pdf_metadata):
     :param pdf_metadata: PDF metadata
     :return: metadata XML
     """
+    # Input metadata gets embedded in the XMP metadata inside XML nodes.
+    # All values _should_ be strings, but that's not asserted anywhere so convert just in case
+    escaped_metadata = {
+        key: xml_escape(str(value)) for key, value in pdf_metadata.items()
+    }
+
     xml_str = XMP_SCHEMA.format(
-        title=pdf_metadata.get("title", ""),
-        author=pdf_metadata.get("author", ""),
-        subject=pdf_metadata.get("subject", ""),
-        producer=pdf_metadata.get("producer", "pypdf"),
-        creator_tool=pdf_metadata.get("creator", "python-drafthorse"),
+        title=escaped_metadata.get("title", ""),
+        author=escaped_metadata.get("author", ""),
+        subject=escaped_metadata.get("subject", ""),
+        producer=escaped_metadata.get("producer", "pypdf"),
+        creator_tool=escaped_metadata.get("creator", "python-drafthorse"),
         timestamp=datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00"),
         urn="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#",
         documenttype="INVOICE",
@@ -340,7 +347,7 @@ def _extract_xml_info(xml_data, level=None, metadata=None):
     if level is None:
         # autodetection of Factur-X profile
         profile = doc_id.split(":")[-1]
-        if doc_id.split(":")[-1] in ["basic", "extended"]:
+        if doc_id.split(":")[-1] in ["minimum", "basic", "basicwl", "extended"]:
             profile = doc_id.split(":")[-1]
         elif doc_id.split(":")[-1].startswith("xrechnung"):
             profile = "xrechnung"
index 2b98e138e255514e80123be57126925520e5c208..d745cca2132f55948162ead39781d23075a16713 100644 (file)
@@ -2,9 +2,12 @@ import lxml.etree
 import os
 import pytest
 from difflib import unified_diff
+from io import BytesIO
+from pypdf import PdfReader
 from xml.dom import minidom
 
 from drafthorse.models.document import Document
+from drafthorse.pdf import attach_xml
 from drafthorse.utils import validate_xml
 
 samples = [
@@ -44,6 +47,19 @@ def test_sample_roundtrip(filename):
     # Validate that the sample file is valid, otherwise the test is moot
     validate_xml(xmlout=origxml, schema=schema)
 
+    # Attach the XML to an empty PDF doc
+    with open(
+        os.path.join(os.path.dirname(__file__), "samples", "Empty.pdf"), "rb"
+    ) as f:
+        original_pdf_bytes = f.read()
+
+    created_pdf_bytes = attach_xml(original_pdf_bytes, origxml)
+
+    # Read back the PDF. We don't support extensive parsing, but this way we can assert that metadata is at least present
+    # and syntactically valid.
+    pdf_reader = PdfReader(BytesIO(created_pdf_bytes))
+    assert pdf_reader.xmp_metadata
+
     # Parse the sample file into our internal python structure
     doc = Document.parse(origxml)