From e96e3bc77b26b81aa80d4018b0bc121403cd50ec Mon Sep 17 00:00:00 2001 From: Eetu Purontaus Date: Tue, 2 Sep 2025 15:34:46 +0300 Subject: [PATCH] Escape user input going to xmp metadata (#80) * Autodetect minimum and basic-wl profiles when adding xml to pdf * Test attaching to and reading back from pdf Demonstrates issue with xmp metadata and characters in facturx payload (ampersands) that should be escaped * Escape user metadata before constructing XMP --- drafthorse/pdf.py | 19 +++++++++++++------ tests/test_roundtrip.py | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/drafthorse/pdf.py b/drafthorse/pdf.py index 421b9cf..a5bc124 100644 --- a/drafthorse/pdf.py +++ b/drafthorse/pdf.py @@ -37,6 +37,7 @@ from pypdf.generic import ( NumberObject, create_string_object, ) +from xml.sax.saxutils import escape as xml_escape from drafthorse.xmp_schema import XMP_SCHEMA @@ -155,12 +156,18 @@ def _prepare_xmp_metadata(profile, pdf_metadata): :param pdf_metadata: PDF metadata :return: metadata XML """ + # Input metadata gets embedded in the XMP metadata inside XML nodes. + # All values _should_ be strings, but that's not asserted anywhere so convert just in case + escaped_metadata = { + key: xml_escape(str(value)) for key, value in pdf_metadata.items() + } + xml_str = XMP_SCHEMA.format( - title=pdf_metadata.get("title", ""), - author=pdf_metadata.get("author", ""), - subject=pdf_metadata.get("subject", ""), - producer=pdf_metadata.get("producer", "pypdf"), - creator_tool=pdf_metadata.get("creator", "python-drafthorse"), + title=escaped_metadata.get("title", ""), + author=escaped_metadata.get("author", ""), + subject=escaped_metadata.get("subject", ""), + producer=escaped_metadata.get("producer", "pypdf"), + creator_tool=escaped_metadata.get("creator", "python-drafthorse"), timestamp=datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00"), urn="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#", documenttype="INVOICE", @@ -340,7 +347,7 @@ def _extract_xml_info(xml_data, level=None, metadata=None): if level is None: # autodetection of Factur-X profile profile = doc_id.split(":")[-1] - if doc_id.split(":")[-1] in ["basic", "extended"]: + if doc_id.split(":")[-1] in ["minimum", "basic", "basicwl", "extended"]: profile = doc_id.split(":")[-1] elif doc_id.split(":")[-1].startswith("xrechnung"): profile = "xrechnung" diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py index 2b98e13..d745cca 100644 --- a/tests/test_roundtrip.py +++ b/tests/test_roundtrip.py @@ -2,9 +2,12 @@ import lxml.etree import os import pytest from difflib import unified_diff +from io import BytesIO +from pypdf import PdfReader from xml.dom import minidom from drafthorse.models.document import Document +from drafthorse.pdf import attach_xml from drafthorse.utils import validate_xml samples = [ @@ -44,6 +47,19 @@ def test_sample_roundtrip(filename): # Validate that the sample file is valid, otherwise the test is moot validate_xml(xmlout=origxml, schema=schema) + # Attach the XML to an empty PDF doc + with open( + os.path.join(os.path.dirname(__file__), "samples", "Empty.pdf"), "rb" + ) as f: + original_pdf_bytes = f.read() + + created_pdf_bytes = attach_xml(original_pdf_bytes, origxml) + + # Read back the PDF. We don't support extensive parsing, but this way we can assert that metadata is at least present + # and syntactically valid. + pdf_reader = PdfReader(BytesIO(created_pdf_bytes)) + assert pdf_reader.xmp_metadata + # Parse the sample file into our internal python structure doc = Document.parse(origxml) -- 2.47.3