separated the adjacency list version into the "basic" and the "optimized" version

author Mike Bayer <mike_mp@zzzcomputing.com>

Sun, 1 Jul 2007 17:43:23 +0000 (17:43 +0000)

committer Mike Bayer <mike_mp@zzzcomputing.com>

Sun, 1 Jul 2007 17:43:23 +0000 (17:43 +0000)
author Mike Bayer <mike_mp@zzzcomputing.com>
Sun, 1 Jul 2007 17:43:23 +0000 (17:43 +0000)
committer Mike Bayer <mike_mp@zzzcomputing.com>
Sun, 1 Jul 2007 17:43:23 +0000 (17:43 +0000)
diff --git a/examples/elementtree/adjacency_list.py b/examples/elementtree/adjacency_list.py

new file mode 100644 (file)

index 0000000..204662f
--- /dev/null
+++ b/examples/elementtree/adjacency_list.py
@@ -0,0 +1,215 @@
+"""illustrates an explicit way to persist an XML document expressed using ElementTree.
+
+This example explicitly marshals/unmarshals the ElementTree document into 
+mapped entities which have their own tables.  Compare to pickle.py which 
+uses pickle to accomplish the same task.  Note that the usage of both
+styles of persistence are identical, as is the structure of the main Document class.
+"""
+
+################################# PART I - Imports/Coniguration ###########################################
+from sqlalchemy import *
+from sqlalchemy.orm import *
+
+import sys, os, StringIO, re
+
+import logging
+logging.basicConfig()
+
+# uncomment to show SQL statements
+#logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+
+# uncomment to show SQL statements and result sets
+#logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
+
+
+from elementtree import ElementTree
+from elementtree.ElementTree import Element, SubElement
+
+meta = MetaData()
+meta.engine = 'sqlite://'
+
+################################# PART II - Table Metadata ###########################################
+    
+# stores a top level record of an XML document.  
+documents = Table('documents', meta,
+    Column('document_id', Integer, primary_key=True),
+    Column('filename', String(30), unique=True),
+    Column('element_id', Integer, ForeignKey('elements.element_id'))
+)
+
+# stores XML nodes in an adjacency list model.  This corresponds to 
+# Element and SubElement objects.
+elements = Table('elements', meta,
+    Column('element_id', Integer, primary_key=True),
+    Column('parent_id', Integer, ForeignKey('elements.element_id')),
+    Column('tag', Unicode(30), nullable=False),
+    Column('text', Unicode),
+    Column('tail', Unicode)
+    )
+
+# stores attributes.  This corresponds to the dictionary of attributes
+# stored by an Element or SubElement.
+attributes = Table('attributes', meta,
+    Column('element_id', Integer, ForeignKey('elements.element_id'), primary_key=True),
+    Column('name', Unicode(100), nullable=False, primary_key=True),
+    Column('value', Unicode(255)))
+
+meta.create_all()
+
+#################################### PART III - Model #############################################
+
+# our document class.  contains a string name,
+# and the ElementTree root element.  
+class Document(object):
+    def __init__(self, name, element):
+        self.filename = name
+        self.element = element
+        
+    def __str__(self):
+        buf = StringIO.StringIO()
+        self.element.write(buf)
+        return buf.getvalue()
+
+#################################### PART IV - Persistence Mapping ###################################
+
+# Node class.  a non-public class which will represent 
+# the DB-persisted Element/SubElement object.  We cannot create mappers for
+# ElementTree elements directly because they are at the very least not new-style 
+# classes, and also may be backed by native implementations.
+# so here we construct an adapter.
+class _Node(object):
+    pass
+
+# Attribute class.  also internal, this will represent the key/value attributes stored for 
+# a particular Node.
+class _Attribute(object):
+    def __init__(self, name, value):
+        self.name = name
+        self.value = value
+
+# setup mappers.  Document will eagerly load a list of _Node objects.
+mapper(Document, documents, properties={
+    '_root':relation(_Node, lazy=False, cascade="all")
+})
+
+mapper(_Node, elements, properties={
+    'children':relation(_Node, cascade="all"),  
+    'attributes':relation(_Attribute, lazy=False, cascade="all, delete-orphan"), # eagerly load attributes
+})
+
+mapper(_Attribute, attributes)
+
+# define marshalling functions that convert from _Node/_Attribute to/from ElementTree objects.
+# this will set the ElementTree element as "document._element", and append the root _Node
+# object to the "_root" mapped collection.
+class ElementTreeMarshal(object):
+    def __get__(self, document, owner):
+        if document is None:
+            return self
+            
+        if hasattr(document, '_element'):
+            return document._element
+        
+        def traverse(node, parent=None):
+            if parent is not None:
+                elem = ElementTree.SubElement(parent, node.tag)
+            else:
+                elem = ElementTree.Element(node.tag)
+            elem.text = node.text
+            elem.tail = node.tail
+            for attr in node.attributes:
+                elem.attrib[attr.name] = attr.value
+            for child in node.children:
+                traverse(child, parent=elem)
+            return elem
+
+        document._element = ElementTree.ElementTree(traverse(document._root))
+        return document._element
+    
+    def __set__(self, document, element):
+        def traverse(node):
+            n = _Node()
+            n.tag = node.tag
+            n.text = node.text
+            n.tail = node.tail
+            n.children = [traverse(n2) for n2 in node]
+            n.attributes = [_Attribute(k, v) for k, v in node.attrib.iteritems()]
+            return n
+
+        document._root = traverse(element.getroot())
+        document._element = element
+    
+    def __delete__(self, document):
+        del document._element
+        document._root = []
+
+# override Document's "element" attribute with the marshaller.
+Document.element = ElementTreeMarshal()
+
+########################################### PART V - Basic Persistence Example ############################
+
+line = "\n--------------------------------------------------------"
+
+# save to DB
+session = create_session()
+
+# get ElementTree documents
+for file in ('test.xml', 'test2.xml', 'test3.xml'):
+    filename = os.path.join(os.path.dirname(sys.argv[0]), file)
+    doc = ElementTree.parse(filename)
+    session.save(Document(file, doc))
+
+print "\nSaving three documents...", line
+session.flush()
+print "Done."
+
+# clear session (to illustrate a full load), restore
+session.clear()
+
+print "\nFull text of document 'text.xml':", line
+document = session.query(Document).filter_by(filename="test.xml").first()
+
+print document
+
+############################################ PART VI - Searching for Paths #######################################
+
+# manually search for a document which contains "/somefile/header/field1:hi"
+print "\nManual search for /somefile/header/field1=='hi':", line
+n1 = elements.alias('n1')
+n2 = elements.alias('n2')
+n3 = elements.alias('n3')
+j = documents.join(n1).join(n2, n1.c.element_id==n2.c.parent_id).join(n3, n2.c.element_id==n3.c.parent_id)
+d = session.query(Document).select_from(j).filter(n1.c.tag=='somefile').filter(n2.c.tag=='header').filter(and_(n3.c.tag=='field1', n3.c.text=='hi')).one()
+print d
+
+# generalize the above approach into an extremely impoverished xpath function:
+def find_document(path, compareto):
+    j = documents
+    prev_elements = None
+    query = session.query(Document)
+    for i, match in enumerate(re.finditer(r'/([\w_]+)(?:\[@([\w_]+)(?:=(.*))?\])?', path)):
+        (token, attrname, attrvalue) = match.group(1, 2, 3)
+        a = elements.alias("n%d" % i)
+        query = query.filter(a.c.tag==token)
+        if attrname:
+            attr_alias = attributes.alias('a%d' % i)
+            if attrvalue:
+                query = query.filter(and_(a.c.element_id==attr_alias.c.element_id, attr_alias.c.name==attrname, attr_alias.c.value==attrvalue))
+            else:
+                query = query.filter(and_(a.c.element_id==attr_alias.c.element_id, attr_alias.c.name==attrname))
+        if prev_elements is not None:
+            j = j.join(a, prev_elements.c.element_id==a.c.parent_id)
+        else:
+            j = j.join(a)
+        prev_elements = a
+    return query.options(lazyload('_root')).select_from(j).filter(prev_elements.c.text==compareto).all()
+
+for path, compareto in (
+        ('/somefile/header/field1', 'hi'),
+        ('/somefile/field1', 'hi'),
+        ('/somefile/header/field2', 'there'),
+        ('/somefile/header/field2[@attr=foo]', 'there')
+    ):
+    print "\nDocuments containing '%s=%s':" % (path, compareto), line
+    print [d.filename for d in find_document(path, compareto)]
+
diff --git a/examples/elementtree/explicit.py b/examples/elementtree/optimized_al.py

similarity index 95%

rename from examples/elementtree/explicit.py

rename to examples/elementtree/optimized_al.py

index ee941f73806f308b960fb04859896e6c7bd40f9a..915680353a3551bca8559ccc5311e53fd03a7271 100644 (file)
--- a/examples/elementtree/explicit.py
+++ b/examples/elementtree/optimized_al.py
@@ -1,9 +1,8 @@
-"""illustrates an explicit way to persist an XML document expressed using ElementTree.
+"""This script duplicates adjacency_list.py, but optimizes the loading
+of XML nodes to be based on a "flattened" datamodel. Any number of XML documents, 
+each of arbitrary complexity, can be loaded in their entirety via a single query 
+which joins on only three tables.
  
-This example explicitly marshals/unmarshals the ElementTree document into 
-mapped entities which have their own tables.  Compare to pickle.py which 
-uses pickle to accomplish the same task.  Note that the usage of both
-styles of persistence are identical, as is the structure of the main Document class.
  """
  
  ################################# PART I - Imports/Coniguration ###########################################
author	Mike Bayer <mike_mp@zzzcomputing.com>
	Sun, 1 Jul 2007 17:43:23 +0000 (17:43 +0000)
committer	Mike Bayer <mike_mp@zzzcomputing.com>
	Sun, 1 Jul 2007 17:43:23 +0000 (17:43 +0000)
examples/elementtree/adjacency_list.py	[new file with mode: 0644]	patch \| blob
examples/elementtree/optimized_al.py	[moved from examples/elementtree/explicit.py with 95% similarity]	patch \| blob \| blame \| history