]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
Script by Marc-Andre Lemburg to generate htmlentitydefs.py.
author Guido van Rossum <guido@python.org>
Thu, 19 Aug 1999 16:00:41 +0000 (16:00 +0000)
committer Guido van Rossum <guido@python.org>
Thu, 19 Aug 1999 16:00:41 +0000 (16:00 +0000)
Tools/scripts/parseentities.py [new file with mode: 0755]

diff --git a/Tools/scripts/parseentities.py b/Tools/scripts/parseentities.py
new file mode 100755 (executable)
index 0000000..b708116
--- /dev/null
@@ -0,0 +1,65 @@
+#!/usr/local/bin/python
+""" Utility for parsing HTML entity definitions available from:
+
+      http://www.w3.org/ as e.g.
+      http://www.w3.org/TR/REC-html40/HTMLlat1.ent
+
+    Input is read from stdin, output is written to stdout in form of a
+    Python snippet defining a dictionary "entitydefs" mapping literal
+    entity name to character or numeric entity.
+
+    Marc-Andre Lemburg, mal@lemburg.com, 1999. 
+    Use as you like. NO WARRANTIES.
+
+"""
+import re,sys
+import TextTools
+
+entityRE = re.compile('<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')  # groups: name, quoted value, comment text
+
+def parse(text,pos=0,endpos=None):
+
+    pos = 0
+    if endpos is None:
+       endpos = len(text)
+    d = {}
+    while 1:
+       m = entityRE.search(text,pos,endpos)
+       if not m:
+           break
+       name,charcode,comment = m.groups()
+       d[name] = charcode,comment
+       pos = m.end()
+    return d
+
+def writefile(f,defs):
+    # Write defs ({name: (charcode, comment)}) to f as Python source defining a dict named "entitydefs".
+    f.write("entitydefs = {\n")
+    items = defs.items()
+    items.sort()        # NOTE(review): needs items() to return a list; output is ordered by name
+    for name,(charcode,comment) in items:
+       if charcode[:2] == '&#':                # value is a numeric character reference
+           code = int(charcode[2:-1])          # decimal digits between '&#' and the last character
+           if code < 256:
+               charcode = "'\%o'" % code       # quoted backslash-octal escape, e.g. '\240'
+           else:
+               charcode = repr(charcode)       # keep the reference text itself, quoted
+       else:
+           charcode = repr(charcode)
+       comment = TextTools.collapse(comment)   # TextTools is external; presumably collapses whitespace -- TODO confirm
+       f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
+    f.write('\n}\n')
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:       # first argument, if given: path of the input file
+       infile = open(sys.argv[1])
+    else:
+       infile = sys.stdin
+    if len(sys.argv) > 2:       # second argument, if given: path of the output file
+       outfile = open(sys.argv[2],'w')
+    else:
+       outfile = sys.stdout
+    text = infile.read()        # the whole input is read before parsing
+    defs = parse(text)
+    writefile(outfile,defs)
+