NumberObject,
create_string_object,
)
+from xml.sax.saxutils import escape as xml_escape
from drafthorse.xmp_schema import XMP_SCHEMA
:param pdf_metadata: PDF metadata
:return: metadata XML
"""
+ # Metadata values are interpolated into XML nodes of the XMP packet, so they must be XML-escaped.
+ # All values _should_ be strings, but nothing enforces that, so coerce with str() before escaping.
+ escaped_metadata = {
+ key: xml_escape(str(value)) for key, value in pdf_metadata.items()
+ }
+
xml_str = XMP_SCHEMA.format(
- title=pdf_metadata.get("title", ""),
- author=pdf_metadata.get("author", ""),
- subject=pdf_metadata.get("subject", ""),
- producer=pdf_metadata.get("producer", "pypdf"),
- creator_tool=pdf_metadata.get("creator", "python-drafthorse"),
+ title=escaped_metadata.get("title", ""),
+ author=escaped_metadata.get("author", ""),
+ subject=escaped_metadata.get("subject", ""),
+ producer=escaped_metadata.get("producer", "pypdf"),
+ creator_tool=escaped_metadata.get("creator", "python-drafthorse"),
timestamp=datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00"),
urn="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#",
documenttype="INVOICE",
if level is None:
# autodetection of Factur-X profile
profile = doc_id.split(":")[-1]
- if doc_id.split(":")[-1] in ["basic", "extended"]:
+ if doc_id.split(":")[-1] in ["minimum", "basic", "basicwl", "extended"]:
profile = doc_id.split(":")[-1]
elif doc_id.split(":")[-1].startswith("xrechnung"):
profile = "xrechnung"
import os
import pytest
from difflib import unified_diff
+from io import BytesIO
+from pypdf import PdfReader
from xml.dom import minidom
from drafthorse.models.document import Document
+from drafthorse.pdf import attach_xml
from drafthorse.utils import validate_xml
samples = [
# Validate that the sample file is valid, otherwise the test is moot
validate_xml(xmlout=origxml, schema=schema)
+ # Attach the XML to an empty PDF doc
+ with open(
+ os.path.join(os.path.dirname(__file__), "samples", "Empty.pdf"), "rb"
+ ) as f:
+ original_pdf_bytes = f.read()
+
+ created_pdf_bytes = attach_xml(original_pdf_bytes, origxml)
+
+ # Read back the PDF. We don't support extensive parsing, but this way we can assert that metadata is at least present
+ # and syntactically valid.
+ pdf_reader = PdfReader(BytesIO(created_pdf_bytes))
+ assert pdf_reader.xmp_metadata
+
# Parse the sample file into our internal python structure
doc = Document.parse(origxml)