Escape user input going to xmp metadata (#80)

author Eetu Purontaus <eetu.purontaus@gmail.com>

Tue, 2 Sep 2025 12:34:46 +0000 (15:34 +0300)

committer GitHub <noreply@github.com>

Tue, 2 Sep 2025 12:34:46 +0000 (14:34 +0200)
author Eetu Purontaus <eetu.purontaus@gmail.com>
Tue, 2 Sep 2025 12:34:46 +0000 (15:34 +0300)
committer GitHub <noreply@github.com>
Tue, 2 Sep 2025 12:34:46 +0000 (14:34 +0200)
diff --git a/drafthorse/pdf.py b/drafthorse/pdf.py

index 421b9cf9c6aa8ec0ce26a2eddd3a2e2a33cbd89f..a5bc12413693c4aa1f8758451121d948cd0ecd36 100644 (file)
--- a/drafthorse/pdf.py
+++ b/drafthorse/pdf.py
@@ -37,6 +37,7 @@ from pypdf.generic import (
      NumberObject,
      create_string_object,
  )
+from xml.sax.saxutils import escape as xml_escape
  
  from drafthorse.xmp_schema import XMP_SCHEMA
  
@@ -155,12 +156,18 @@ def _prepare_xmp_metadata(profile, pdf_metadata):
      :param pdf_metadata: PDF metadata
      :return: metadata XML
      """
+    # Input metadata gets embedded in the XMP metadata inside XML nodes.
+    # All values _should_ be strings, but that's not asserted anywhere so convert just in case
+    escaped_metadata = {
+        key: xml_escape(str(value)) for key, value in pdf_metadata.items()
+    }
+
      xml_str = XMP_SCHEMA.format(
-        title=pdf_metadata.get("title", ""),
-        author=pdf_metadata.get("author", ""),
-        subject=pdf_metadata.get("subject", ""),
-        producer=pdf_metadata.get("producer", "pypdf"),
-        creator_tool=pdf_metadata.get("creator", "python-drafthorse"),
+        title=escaped_metadata.get("title", ""),
+        author=escaped_metadata.get("author", ""),
+        subject=escaped_metadata.get("subject", ""),
+        producer=escaped_metadata.get("producer", "pypdf"),
+        creator_tool=escaped_metadata.get("creator", "python-drafthorse"),
          timestamp=datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00"),
          urn="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#",
          documenttype="INVOICE",
@@ -340,7 +347,7 @@ def _extract_xml_info(xml_data, level=None, metadata=None):
      if level is None:
          # autodetection of Factur-X profile
          profile = doc_id.split(":")[-1]
-        if doc_id.split(":")[-1] in ["basic", "extended"]:
+        if doc_id.split(":")[-1] in ["minimum", "basic", "basicwl", "extended"]:
              profile = doc_id.split(":")[-1]
          elif doc_id.split(":")[-1].startswith("xrechnung"):
              profile = "xrechnung"
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py

index 2b98e138e255514e80123be57126925520e5c208..d745cca2132f55948162ead39781d23075a16713 100644 (file)
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@@ -2,9 +2,12 @@ import lxml.etree
  import os
  import pytest
  from difflib import unified_diff
+from io import BytesIO
+from pypdf import PdfReader
  from xml.dom import minidom
  
  from drafthorse.models.document import Document
+from drafthorse.pdf import attach_xml
  from drafthorse.utils import validate_xml
  
  samples = [
@@ -44,6 +47,19 @@ def test_sample_roundtrip(filename):
      # Validate that the sample file is valid, otherwise the test is moot
      validate_xml(xmlout=origxml, schema=schema)
  
+    # Attach the XML to an empty PDF doc
+    with open(
+        os.path.join(os.path.dirname(__file__), "samples", "Empty.pdf"), "rb"
+    ) as f:
+        original_pdf_bytes = f.read()
+
+    created_pdf_bytes = attach_xml(original_pdf_bytes, origxml)
+
+    # Read back the PDF. We don't support extensive parsing, but this way we can assert that metadata is at least present
+    # and syntactically valid.
+    pdf_reader = PdfReader(BytesIO(created_pdf_bytes))
+    assert pdf_reader.xmp_metadata
+
      # Parse the sample file into our internal python structure
      doc = Document.parse(origxml)
author	Eetu Purontaus <eetu.purontaus@gmail.com>
	Tue, 2 Sep 2025 12:34:46 +0000 (15:34 +0300)
committer	GitHub <noreply@github.com>
	Tue, 2 Sep 2025 12:34:46 +0000 (14:34 +0200)
drafthorse/pdf.py		patch \| blob \| blame \| history
tests/test_roundtrip.py		patch \| blob \| blame \| history