Add PDF generation

author Raphael Michel <mail@raphaelmichel.de>

Sat, 27 Oct 2018 21:23:56 +0000 (23:23 +0200)

committer Raphael Michel <mail@raphaelmichel.de>

Sat, 27 Oct 2018 21:23:56 +0000 (23:23 +0200)
author Raphael Michel <mail@raphaelmichel.de>
Sat, 27 Oct 2018 21:23:56 +0000 (23:23 +0200)
committer Raphael Michel <mail@raphaelmichel.de>
Sat, 27 Oct 2018 21:23:56 +0000 (23:23 +0200)
diff --git a/README.rst b/README.rst

index 55ce1e6873fdf15c1899d3423e35f21821310b61..48984ad641924912effff10c0443a29c17e3da71 100644 (file)
--- a/README.rst
+++ b/README.rst
@@ -12,7 +12,8 @@ drafthorse -- Basic ZUGFeRD implementation in Python
  
  This is a low-level python implementation of the ZUGFeRD XML format. ZUGFeRD is a German
  format for sending digital invoices. ZUGFeRD XML files are to be attached to a PDF
-file, which can be done using the ``factur-x`` Python library. This library can be used to generate or parse the contents of this XML file.
+file. This library can be used to generate or parse the contents of this XML file as well as
+attach it to a PDF. We do not support parsing PDF files (for now).
  
  By low-level, we mean that this library models the ZUGFeRD data model 1:1 without any further
  abstractions or simplifications. You can set and parse all parameters defined in ZUGFeRD 1.0.
@@ -48,9 +49,14 @@ Generating::
      >>> doc.header.notes.add(n)
      >>> doc.trade.agreement.seller.name = "Lieferant GmbH"
  
-    >>> doc.serialize()
+    >>> xml = doc.serialize()
+    >>> xml
      b'<?xml version="1.0" encoding="UTF-8"?><rsm:CrossIndustryDocument …'
  
+    >>> new_pdf_bytes = attach_xml(original_pdf_bytes, xml, 'BASIC')
+
+
+
  
  Development
  -----------
diff --git a/drafthorse/models/elements.py b/drafthorse/models/elements.py

index 8ffd4abee4c781c8a2fd50b5eb3b6b2524673843..439af9cda742cad8518dc8615a25118a24096fe3 100644 (file)
--- a/drafthorse/models/elements.py
+++ b/drafthorse/models/elements.py
@@ -33,7 +33,7 @@ class Element(metaclass=BaseElementMeta):
      def __init__(self, **kwargs):
          self.required = kwargs.get('required', False)
          self._data = OrderedDict([
-            (f.name, f.initialize() if f.default else None) for f in self._fields
+            (f.name, f.initialize() if f.default or f.required else None) for f in self._fields
          ])
          for k, v in kwargs.items():
              setattr(self, k, v)
diff --git a/drafthorse/pdf.py b/drafthorse/pdf.py

new file mode 100644 (file)

index 0000000..fa1cc3f
--- /dev/null
+++ b/drafthorse/pdf.py
@@ -0,0 +1,259 @@
+# Based on code of the factur-x Python package:
+# Copyright 2016-2018, Alexis de Lattre <alexis.delattre@akretion.com>
+# Modified for ZUGFeRD and Python 3 by Raphael Michel, 2018
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    * The name of the authors may not be used to endorse or promote products
+#      derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import datetime
+import hashlib
+from PyPDF2 import PdfFileReader, PdfFileWriter
+from PyPDF2.generic import createStringObject, DictionaryObject, NameObject, DecodedStreamObject, ArrayObject
+from io import BytesIO
+from lxml import etree
+
+
+def attach_xml(original_pdf, xml_data, level='BASIC'):
+    if not isinstance(original_pdf, bytes):
+        raise TypeError("Please supply original PDF as bytes.")
+    if not isinstance(xml_data, bytes):
+        raise TypeError("Please supply XML data as bytes.")
+
+    reader = PdfFileReader(BytesIO(original_pdf))
+    output = PdfFileWriter()
+    # for page in reader.pages:
+    #    output.addPage(page)
+
+    output._header = "%PDF-1.6".encode()
+    output.appendPagesFromReader(reader)
+
+    original_pdf_id = reader.trailer.get('/ID')
+    if original_pdf_id:
+        output._ID = original_pdf_id
+        # else : generate some ?
+
+    _facturx_update_metadata_add_attachment(
+        output, xml_data, {}, level,
+        output_intents=_get_original_output_intents(reader),
+    )
+
+    outbuffer = BytesIO()
+    output.write(outbuffer)
+    outbuffer.seek(0)
+    return outbuffer
+
+
+def _get_original_output_intents(original_pdf):
+    output_intents = []
+    try:
+        pdf_root = original_pdf.trailer['/Root']
+        ori_output_intents = pdf_root['/OutputIntents']
+        for ori_output_intent in ori_output_intents:
+            ori_output_intent_dict = ori_output_intent.getObject()
+            dest_output_profile_dict = \
+                ori_output_intent_dict['/DestOutputProfile'].getObject()
+            output_intents.append(
+                (ori_output_intent_dict, dest_output_profile_dict))
+    except:
+        pass
+    return output_intents
+
+
+def _prepare_pdf_metadata_txt(pdf_metadata):
+    pdf_date = datetime.datetime.now().isoformat()
+    info_dict = {
+        '/Author': pdf_metadata.get('author', ''),
+        '/CreationDate': pdf_date,
+        '/Creator': 'pretix',
+        '/Keywords': pdf_metadata.get('keywords', ''),
+        '/ModDate': pdf_date,
+        '/Subject': pdf_metadata.get('subject', ''),
+        '/Title': pdf_metadata.get('title', ''),
+    }
+    return info_dict
+
+
+def _prepare_pdf_metadata_xml(level, pdf_metadata):
+    nsmap_x = {'x': 'adobe:ns:meta/'}
+    nsmap_rdf = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}
+    nsmap_dc = {'dc': 'http://purl.org/dc/elements/1.1/'}
+    nsmap_pdf = {'pdf': 'http://ns.adobe.com/pdf/1.3/'}
+    nsmap_xmp = {'xmp': 'http://ns.adobe.com/xap/1.0/'}
+    nsmap_pdfaid = {'pdfaid': 'http://www.aiim.org/pdfa/ns/id/'}
+    nsmap_zf = {'zf': 'urn:ferd:pdfa:CrossIndustryDocument:invoice:1p0#'}
+    ns_x = '{%s}' % nsmap_x['x']
+    ns_dc = '{%s}' % nsmap_dc['dc']
+    ns_rdf = '{%s}' % nsmap_rdf['rdf']
+    ns_pdf = '{%s}' % nsmap_pdf['pdf']
+    ns_xmp = '{%s}' % nsmap_xmp['xmp']
+    ns_pdfaid = '{%s}' % nsmap_pdfaid['pdfaid']
+    ns_zf = '{%s}' % nsmap_zf['zf']
+    ns_xml = '{http://www.w3.org/XML/1998/namespace}'
+
+    root = etree.Element(ns_x + 'xmpmeta', nsmap=nsmap_x)
+    rdf = etree.SubElement(root, ns_rdf + 'RDF', nsmap=nsmap_rdf)
+    desc_pdfaid = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_pdfaid)
+    desc_pdfaid.set(ns_rdf + 'about', '')
+    etree.SubElement(desc_pdfaid, ns_pdfaid + 'part').text = '3'
+    etree.SubElement(desc_pdfaid, ns_pdfaid + 'conformance').text = 'B'
+    desc_dc = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_dc)
+    desc_dc.set(ns_rdf + 'about', '')
+    dc_title = etree.SubElement(desc_dc, ns_dc + 'title')
+    dc_title_alt = etree.SubElement(dc_title, ns_rdf + 'Alt')
+    dc_title_alt_li = etree.SubElement(dc_title_alt, ns_rdf + 'li')
+    dc_title_alt_li.text = pdf_metadata.get('title', '')
+    dc_title_alt_li.set(ns_xml + 'lang', 'x-default')
+    dc_creator = etree.SubElement(desc_dc, ns_dc + 'creator')
+    dc_creator_seq = etree.SubElement(dc_creator, ns_rdf + 'Seq')
+    etree.SubElement(dc_creator_seq, ns_rdf + 'li').text = pdf_metadata.get('author', '')
+    dc_desc = etree.SubElement(desc_dc, ns_dc + 'description')
+    dc_desc_alt = etree.SubElement(dc_desc, ns_rdf + 'Alt')
+    dc_desc_alt_li = etree.SubElement(dc_desc_alt, ns_rdf + 'li')
+    dc_desc_alt_li.text = pdf_metadata.get('subject', '')
+    dc_desc_alt_li.set(ns_xml + 'lang', 'x-default')
+    desc_adobe = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_pdf)
+    desc_adobe.set(ns_rdf + 'about', '')
+    producer = etree.SubElement(desc_adobe, ns_pdf + 'Producer')
+    producer.text = 'PyPDF2'
+    desc_xmp = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_xmp)
+    desc_xmp.set(ns_rdf + 'about', '')
+    creator = etree.SubElement(desc_xmp, ns_xmp + 'CreatorTool')
+    creator.text = 'pretix'
+    timestamp = datetime.datetime.now().isoformat()
+    etree.SubElement(desc_xmp, ns_xmp + 'CreateDate').text = timestamp
+    etree.SubElement(desc_xmp, ns_xmp + 'ModifyDate').text = timestamp
+
+    """
+    xmp_file = resource_filename(__name__, 'xmp/Factur-X_extension_schema.xmp')
+    # Reason for defining a parser below:
+    # http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output
+    parser = etree.XMLParser(remove_blank_text=True)
+    facturx_ext_schema_root = etree.parse(open(xmp_file), parser)
+    # The Factur-X extension schema must be embedded into each PDF document
+    facturx_ext_schema_desc_xpath = facturx_ext_schema_root.xpath(
+        '//rdf:Description', namespaces=nsmap_rdf)
+    rdf.append(facturx_ext_schema_desc_xpath[1])
+    """
+    # Now is the ZUGFeRD description tag
+    zugferd_desc = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_zf)
+    zugferd_desc.set(ns_rdf + 'about', '')
+    fx_doc_type = etree.SubElement(zugferd_desc, ns_zf + 'DocumentType', nsmap=nsmap_zf)
+    fx_doc_type.text = 'INVOICE'
+    fx_doc_filename = etree.SubElement(zugferd_desc, ns_zf + 'DocumentFileName', nsmap=nsmap_zf)
+    fx_doc_filename.text = 'ZUGFeRD-invoice.xml'
+    fx_doc_version = etree.SubElement(zugferd_desc, ns_zf + 'Version', nsmap=nsmap_zf)
+    fx_doc_version.text = '1.0'
+    fx_conformance_level = etree.SubElement(zugferd_desc, ns_zf + 'ConformanceLevel', nsmap=nsmap_zf)
+    fx_conformance_level.text = level
+
+    # TODO: should be UTF-16be ??
+    xml_str = etree.tostring(root, pretty_print=True, encoding="UTF-8", xml_declaration=False)
+    head = u'<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>'.encode('utf-8')
+    tail = u'<?xpacket end="w"?>'.encode('utf-8')
+    xml_final_str = head + xml_str + tail
+    return xml_final_str
+
+
+def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
+                                            output_intents):
+    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
+    md5sum_obj = createStringObject(md5sum)
+    params_dict = DictionaryObject({
+        NameObject('/CheckSum'): md5sum_obj,
+        NameObject('/ModDate'): createStringObject(datetime.datetime.now().isoformat()),
+        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
+    })
+    file_entry = DecodedStreamObject()
+    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
+    file_entry.update({
+        NameObject("/Type"): NameObject("/EmbeddedFile"),
+        NameObject("/Params"): params_dict,
+        # 2F is '/' in hexadecimal
+        NameObject("/Subtype"): NameObject("/text#2Fxml"),
+    })
+    file_entry_obj = pdf_filestream._addObject(file_entry)
+    # The Filespec entry
+    ef_dict = DictionaryObject({
+        NameObject("/F"): file_entry_obj,
+        NameObject('/UF'): file_entry_obj,
+    })
+
+    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
+    filespec_dict = DictionaryObject({
+        NameObject("/AFRelationship"): NameObject("/Data"),
+        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
+        NameObject("/Type"): NameObject("/Filespec"),
+        NameObject("/F"): fname_obj,
+        NameObject("/EF"): ef_dict,
+        NameObject("/UF"): fname_obj,
+    })
+    filespec_obj = pdf_filestream._addObject(filespec_dict)
+    name_arrayobj_cdict = {fname_obj: filespec_obj}
+    name_arrayobj_content_sort = list(
+        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
+    name_arrayobj_content_final = []
+    af_list = []
+    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
+        name_arrayobj_content_final += [fname_obj, filespec_obj]
+        af_list.append(filespec_obj)
+    embedded_files_names_dict = DictionaryObject({
+        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
+    })
+    # Then create the entry for the root, as it needs a
+    # reference to the Filespec
+    embedded_files_dict = DictionaryObject({
+        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
+    })
+    res_output_intents = []
+    for output_intent_dict, dest_output_profile_dict in output_intents:
+        dest_output_profile_obj = pdf_filestream._addObject(
+            dest_output_profile_dict)
+        # TODO detect if there are no other objects in output_intent_dest_obj
+        # than /DestOutputProfile
+        output_intent_dict.update({
+            NameObject("/DestOutputProfile"): dest_output_profile_obj,
+        })
+        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
+        res_output_intents.append(output_intent_obj)
+    # Update the root
+    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
+    metadata_file_entry = DecodedStreamObject()
+    metadata_file_entry.setData(metadata_xml_str)
+    metadata_file_entry.update({
+        NameObject('/Subtype'): NameObject('/XML'),
+        NameObject('/Type'): NameObject('/Metadata'),
+    })
+    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
+    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
+    pdf_filestream._root_object.update({
+        NameObject("/AF"): af_value_obj,
+        NameObject("/Metadata"): metadata_obj,
+        NameObject("/Names"): embedded_files_dict,
+        # show attachments when opening PDF
+        NameObject("/PageMode"): NameObject("/UseAttachments"),
+    })
+    if res_output_intents:
+        pdf_filestream._root_object.update({
+            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
+        })
+    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
+    pdf_filestream.addMetadata(metadata_txt_dict)
diff --git a/requirements_dev.txt b/requirements_dev.txt

index 8e3354abae318be5cbe6bbe38165f92cdb470884..330297acdc593b636666d3378450317e59316bea 100644 (file)
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,4 +1,5 @@
  lxml
+PyPDF2
  pytest
  flake8
  isort
diff --git a/setup.py b/setup.py

index 00d342118db1c5383c6aa1dedc631cb372d2bc5c..97074b027ab7deb493eeb9610ee2d31ab86777e6 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -35,8 +35,9 @@ setup(
  
      keywords='xml banking sepa',
      install_requires=[
-                         'lxml'
-                     ] + (['isoweek'] if sys.version_info < (3, 6) else []),
+        'lxml',
+        'PyPDF2'
+    ] + (['isoweek'] if sys.version_info < (3, 6) else []),
  
      packages=find_packages(include=['drafthorse', 'drafthorse.*']),
      include_package_data=True,
author	Raphael Michel <mail@raphaelmichel.de>
	Sat, 27 Oct 2018 21:23:56 +0000 (23:23 +0200)
committer	Raphael Michel <mail@raphaelmichel.de>
	Sat, 27 Oct 2018 21:23:56 +0000 (23:23 +0200)
README.rst		patch \| blob \| blame \| history
drafthorse/models/elements.py		patch \| blob \| blame \| history
drafthorse/pdf.py	[new file with mode: 0644]	patch \| blob
requirements_dev.txt		patch \| blob \| blame \| history
setup.py		patch \| blob \| blame \| history