From 9ffd33add67a6a4f1d16ddb096d5357509defb71 Mon Sep 17 00:00:00 2001 From: Raphael Michel Date: Sat, 27 Oct 2018 23:23:56 +0200 Subject: [PATCH] Add PDF generation --- README.rst | 10 +- drafthorse/models/elements.py | 2 +- drafthorse/pdf.py | 259 ++++++++++++++++++++++++++++++++++ requirements_dev.txt | 1 + setup.py | 5 +- 5 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 drafthorse/pdf.py diff --git a/README.rst b/README.rst index 55ce1e6..48984ad 100644 --- a/README.rst +++ b/README.rst @@ -12,7 +12,8 @@ drafthorse -- Basic ZUGFeRD implementation in Python This is a low-level python implementation of the ZUGFeRD XML format. ZUGFeRD is a German format for sending digital invoices. ZUGFeRD XML files are to be attached to a PDF -file, which can be done using the ``factur-x`` Python library. This library can be used to generate or parse the contents of this XML file. +file. This library can be used to generate or parse the contents of this XML file as well as +attach it to a PDF. We do not support parsing PDF files (for now). By low-level, we mean that this library models the ZUGFeRD data model 1:1 without any further abstractions or simplifications. You can set and parse all parameters defined in ZUGFeRD 1.0. @@ -48,9 +49,14 @@ Generating:: >>> doc.header.notes.add(n) >>> doc.trade.agreement.seller.name = "Lieferant GmbH" - >>> doc.serialize() + >>> xml = doc.serialize() + >>> xml b'>> new_pdf_bytes = attach_xml(original_pdf_bytes, xml, 'BASIC') + + + Development ----------- diff --git a/drafthorse/models/elements.py b/drafthorse/models/elements.py index 8ffd4ab..439af9c 100644 --- a/drafthorse/models/elements.py +++ b/drafthorse/models/elements.py @@ -33,7 +33,7 @@ class Element(metaclass=BaseElementMeta): def __init__(self, **kwargs): self.required = kwargs.get('required', False) self._data = OrderedDict([ - (f.name, f.initialize() if f.default else None) for f in self._fields + (f.name, f.initialize() if f.default or f.required else None) for f in self._fields ]) for k, v in kwargs.items(): setattr(self, k, v) diff --git a/drafthorse/pdf.py b/drafthorse/pdf.py new file mode 100644 index 0000000..fa1cc3f --- /dev/null +++ b/drafthorse/pdf.py @@ -0,0 +1,259 @@ +# Based on code of the factur-x Python package: +# Copyright 2016-2018, Alexis de Lattre +# Modified for ZUGFeRD and Python 3 by Raphael Michel, 2018 +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * The name of the authors may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import datetime +import hashlib +from PyPDF2 import PdfFileReader, PdfFileWriter +from PyPDF2.generic import createStringObject, DictionaryObject, NameObject, DecodedStreamObject, ArrayObject +from io import BytesIO +from lxml import etree + + +def attach_xml(original_pdf, xml_data, level='BASIC'): + if not isinstance(original_pdf, bytes): + raise TypeError("Please supply original PDF as bytes.") + if not isinstance(xml_data, bytes): + raise TypeError("Please supply XML data as bytes.") + + reader = PdfFileReader(BytesIO(original_pdf)) + output = PdfFileWriter() + # for page in reader.pages: + # output.addPage(page) + + output._header = "%PDF-1.6".encode() + output.appendPagesFromReader(reader) + + original_pdf_id = reader.trailer.get('/ID') + if original_pdf_id: + output._ID = original_pdf_id + # else : generate some ? + + _facturx_update_metadata_add_attachment( + output, xml_data, {}, level, + output_intents=_get_original_output_intents(reader), + ) + + outbuffer = BytesIO() + output.write(outbuffer) + outbuffer.seek(0) + return outbuffer + + +def _get_original_output_intents(original_pdf): + output_intents = [] + try: + pdf_root = original_pdf.trailer['/Root'] + ori_output_intents = pdf_root['/OutputIntents'] + for ori_output_intent in ori_output_intents: + ori_output_intent_dict = ori_output_intent.getObject() + dest_output_profile_dict = \ + ori_output_intent_dict['/DestOutputProfile'].getObject() + output_intents.append( + (ori_output_intent_dict, dest_output_profile_dict)) + except: + pass + return output_intents + + +def _prepare_pdf_metadata_txt(pdf_metadata): + pdf_date = datetime.datetime.now().isoformat() + info_dict = { + '/Author': pdf_metadata.get('author', ''), + '/CreationDate': pdf_date, + '/Creator': 'pretix', + '/Keywords': pdf_metadata.get('keywords', ''), + '/ModDate': pdf_date, + '/Subject': pdf_metadata.get('subject', ''), + '/Title': pdf_metadata.get('title', ''), + } + return info_dict + + +def _prepare_pdf_metadata_xml(level, pdf_metadata): + nsmap_x = {'x': 'adobe:ns:meta/'} + nsmap_rdf = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'} + nsmap_dc = {'dc': 'http://purl.org/dc/elements/1.1/'} + nsmap_pdf = {'pdf': 'http://ns.adobe.com/pdf/1.3/'} + nsmap_xmp = {'xmp': 'http://ns.adobe.com/xap/1.0/'} + nsmap_pdfaid = {'pdfaid': 'http://www.aiim.org/pdfa/ns/id/'} + nsmap_zf = {'zf': 'urn:ferd:pdfa:CrossIndustryDocument:invoice:1p0#'} + ns_x = '{%s}' % nsmap_x['x'] + ns_dc = '{%s}' % nsmap_dc['dc'] + ns_rdf = '{%s}' % nsmap_rdf['rdf'] + ns_pdf = '{%s}' % nsmap_pdf['pdf'] + ns_xmp = '{%s}' % nsmap_xmp['xmp'] + ns_pdfaid = '{%s}' % nsmap_pdfaid['pdfaid'] + ns_zf = '{%s}' % nsmap_zf['zf'] + ns_xml = '{http://www.w3.org/XML/1998/namespace}' + + root = etree.Element(ns_x + 'xmpmeta', nsmap=nsmap_x) + rdf = etree.SubElement(root, ns_rdf + 'RDF', nsmap=nsmap_rdf) + desc_pdfaid = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_pdfaid) + desc_pdfaid.set(ns_rdf + 'about', '') + etree.SubElement(desc_pdfaid, ns_pdfaid + 'part').text = '3' + etree.SubElement(desc_pdfaid, ns_pdfaid + 'conformance').text = 'B' + desc_dc = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_dc) + desc_dc.set(ns_rdf + 'about', '') + dc_title = etree.SubElement(desc_dc, ns_dc + 'title') + dc_title_alt = etree.SubElement(dc_title, ns_rdf + 'Alt') + dc_title_alt_li = etree.SubElement(dc_title_alt, ns_rdf + 'li') + dc_title_alt_li.text = pdf_metadata.get('title', '') + dc_title_alt_li.set(ns_xml + 'lang', 'x-default') + dc_creator = etree.SubElement(desc_dc, ns_dc + 'creator') + dc_creator_seq = etree.SubElement(dc_creator, ns_rdf + 'Seq') + etree.SubElement(dc_creator_seq, ns_rdf + 'li').text = pdf_metadata.get('author', '') + dc_desc = etree.SubElement(desc_dc, ns_dc + 'description') + dc_desc_alt = etree.SubElement(dc_desc, ns_rdf + 'Alt') + dc_desc_alt_li = etree.SubElement(dc_desc_alt, ns_rdf + 'li') + dc_desc_alt_li.text = pdf_metadata.get('subject', '') + dc_desc_alt_li.set(ns_xml + 'lang', 'x-default') + desc_adobe = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_pdf) + desc_adobe.set(ns_rdf + 'about', '') + producer = etree.SubElement(desc_adobe, ns_pdf + 'Producer') + producer.text = 'PyPDF2' + desc_xmp = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_xmp) + desc_xmp.set(ns_rdf + 'about', '') + creator = etree.SubElement(desc_xmp, ns_xmp + 'CreatorTool') + creator.text = 'pretix' + timestamp = datetime.datetime.now().isoformat() + etree.SubElement(desc_xmp, ns_xmp + 'CreateDate').text = timestamp + etree.SubElement(desc_xmp, ns_xmp + 'ModifyDate').text = timestamp + + """ + xmp_file = resource_filename(__name__, 'xmp/Factur-X_extension_schema.xmp') + # Reason for defining a parser below: + # http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output + parser = etree.XMLParser(remove_blank_text=True) + facturx_ext_schema_root = etree.parse(open(xmp_file), parser) + # The Factur-X extension schema must be embedded into each PDF document + facturx_ext_schema_desc_xpath = facturx_ext_schema_root.xpath( + '//rdf:Description', namespaces=nsmap_rdf) + rdf.append(facturx_ext_schema_desc_xpath[1]) + """ + # Now is the ZUGFeRD description tag + zugferd_desc = etree.SubElement(rdf, ns_rdf + 'Description', nsmap=nsmap_zf) + zugferd_desc.set(ns_rdf + 'about', '') + fx_doc_type = etree.SubElement(zugferd_desc, ns_zf + 'DocumentType', nsmap=nsmap_zf) + fx_doc_type.text = 'INVOICE' + fx_doc_filename = etree.SubElement(zugferd_desc, ns_zf + 'DocumentFileName', nsmap=nsmap_zf) + fx_doc_filename.text = 'ZUGFeRD-invoice.xml' + fx_doc_version = etree.SubElement(zugferd_desc, ns_zf + 'Version', nsmap=nsmap_zf) + fx_doc_version.text = '1.0' + fx_conformance_level = etree.SubElement(zugferd_desc, ns_zf + 'ConformanceLevel', nsmap=nsmap_zf) + fx_conformance_level.text = level + + # TODO: should be UTF-16be ?? + xml_str = etree.tostring(root, pretty_print=True, encoding="UTF-8", xml_declaration=False) + head = u''.encode('utf-8') + tail = u''.encode('utf-8') + xml_final_str = head + xml_str + tail + return xml_final_str + + +def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level, + output_intents): + md5sum = hashlib.md5(facturx_xml_str).hexdigest() + md5sum_obj = createStringObject(md5sum) + params_dict = DictionaryObject({ + NameObject('/CheckSum'): md5sum_obj, + NameObject('/ModDate'): createStringObject(datetime.datetime.now().isoformat()), + NameObject('/Size'): NameObject(str(len(facturx_xml_str))), + }) + file_entry = DecodedStreamObject() + file_entry.setData(facturx_xml_str) # here we integrate the file itself + file_entry.update({ + NameObject("/Type"): NameObject("/EmbeddedFile"), + NameObject("/Params"): params_dict, + # 2F is '/' in hexadecimal + NameObject("/Subtype"): NameObject("/text#2Fxml"), + }) + file_entry_obj = pdf_filestream._addObject(file_entry) + # The Filespec entry + ef_dict = DictionaryObject({ + NameObject("/F"): file_entry_obj, + NameObject('/UF'): file_entry_obj, + }) + + fname_obj = createStringObject("ZUGFeRD-invoice.xml") + filespec_dict = DictionaryObject({ + NameObject("/AFRelationship"): NameObject("/Data"), + NameObject("/Desc"): createStringObject("Factur-X Invoice"), + NameObject("/Type"): NameObject("/Filespec"), + NameObject("/F"): fname_obj, + NameObject("/EF"): ef_dict, + NameObject("/UF"): fname_obj, + }) + filespec_obj = pdf_filestream._addObject(filespec_dict) + name_arrayobj_cdict = {fname_obj: filespec_obj} + name_arrayobj_content_sort = list( + sorted(name_arrayobj_cdict.items(), key=lambda x: x[0])) + name_arrayobj_content_final = [] + af_list = [] + for (fname_obj, filespec_obj) in name_arrayobj_content_sort: + name_arrayobj_content_final += [fname_obj, filespec_obj] + af_list.append(filespec_obj) + embedded_files_names_dict = DictionaryObject({ + NameObject("/Names"): ArrayObject(name_arrayobj_content_final), + }) + # Then create the entry for the root, as it needs a + # reference to the Filespec + embedded_files_dict = DictionaryObject({ + NameObject("/EmbeddedFiles"): embedded_files_names_dict, + }) + res_output_intents = [] + for output_intent_dict, dest_output_profile_dict in output_intents: + dest_output_profile_obj = pdf_filestream._addObject( + dest_output_profile_dict) + # TODO detect if there are no other objects in output_intent_dest_obj + # than /DestOutputProfile + output_intent_dict.update({ + NameObject("/DestOutputProfile"): dest_output_profile_obj, + }) + output_intent_obj = pdf_filestream._addObject(output_intent_dict) + res_output_intents.append(output_intent_obj) + # Update the root + metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata) + metadata_file_entry = DecodedStreamObject() + metadata_file_entry.setData(metadata_xml_str) + metadata_file_entry.update({ + NameObject('/Subtype'): NameObject('/XML'), + NameObject('/Type'): NameObject('/Metadata'), + }) + metadata_obj = pdf_filestream._addObject(metadata_file_entry) + af_value_obj = pdf_filestream._addObject(ArrayObject(af_list)) + pdf_filestream._root_object.update({ + NameObject("/AF"): af_value_obj, + NameObject("/Metadata"): metadata_obj, + NameObject("/Names"): embedded_files_dict, + # show attachments when opening PDF + NameObject("/PageMode"): NameObject("/UseAttachments"), + }) + if res_output_intents: + pdf_filestream._root_object.update({ + NameObject("/OutputIntents"): ArrayObject(res_output_intents), + }) + metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata) + pdf_filestream.addMetadata(metadata_txt_dict) diff --git a/requirements_dev.txt b/requirements_dev.txt index 8e3354a..330297a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,4 +1,5 @@ lxml +PyPDF2 pytest flake8 isort diff --git a/setup.py b/setup.py index 00d3421..97074b0 100644 --- a/setup.py +++ b/setup.py @@ -35,8 +35,9 @@ setup( keywords='xml banking sepa', install_requires=[ - 'lxml' - ] + (['isoweek'] if sys.version_info < (3, 6) else []), + 'lxml', + 'PyPDF2' + ] + (['isoweek'] if sys.version_info < (3, 6) else []), packages=find_packages(include=['drafthorse', 'drafthorse.*']), include_package_data=True, -- 2.47.2