Add basic MO file reading in preparation for #54.

author Christopher Lenz <cmlenz@gmail.com>

Tue, 10 Jun 2008 17:05:52 +0000 (17:05 +0000)

committer Christopher Lenz <cmlenz@gmail.com>

Tue, 10 Jun 2008 17:05:52 +0000 (17:05 +0000)
author Christopher Lenz <cmlenz@gmail.com>
Tue, 10 Jun 2008 17:05:52 +0000 (17:05 +0000)
committer Christopher Lenz <cmlenz@gmail.com>
Tue, 10 Jun 2008 17:05:52 +0000 (17:05 +0000)
diff --git a/babel/messages/mofile.py b/babel/messages/mofile.py

index bc0f3a8c955568549e084e231db14a95e47108c5..47f04973773cb3dd508ac0ff969f60f0cd5a1b97 100644 (file)
--- a/babel/messages/mofile.py
+++ b/babel/messages/mofile.py
@@ -21,9 +21,109 @@
  import array
  import struct
  
-__all__ = ['write_mo']
+from babel.messages.catalog import Catalog, Message
+
+__all__ = ['read_mo', 'write_mo']
  __docformat__ = 'restructuredtext en'
  
+
+LE_MAGIC = 0x950412deL  # magic word value that marks a little-endian MO file
+BE_MAGIC = 0xde120495L  # magic word value that marks a big-endian MO file
+
+def read_mo(fileobj):
+    """Read a binary MO file from the given file-like object and return a
+    corresponding `Catalog` object.
+
+    :param fileobj: the file-like object to read the MO file from
+    :return: a catalog object representing the parsed MO file
+    :rtype: `Catalog`
+    :raise IOError: if the magic number is not recognized or an entry's
+                    offsets do not lie within the file
+
+    :note: The implementation of this function is heavily based on the
+           ``GNUTranslations._parse`` method of the ``gettext`` module in the
+           standard library.
+    """
+    catalog = Catalog()
+    headers = {}
+
+    unpack = struct.unpack
+    filename = getattr(fileobj, 'name', '')
+    charset = None
+
+    buf = fileobj.read()
+    buflen = len(buf)
+
+    # Parse the .mo header: a magic number followed by four 32-bit words.
+    magic = unpack('<I', buf[:4])[0] # Are we big endian or little endian?
+    if magic == LE_MAGIC:
+        version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
+        ii = '<II'
+    elif magic == BE_MAGIC:
+        version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
+        ii = '>II'
+    else:
+        raise IOError(0, 'Bad magic number', filename)
+
+    # Now add all messages from the .mo file buffer to the catalog
+    for i in xrange(0, msgcount):
+        mlen, moff = unpack(ii, buf[masteridx:masteridx + 8])
+        mend = moff + mlen
+        tlen, toff = unpack(ii, buf[transidx:transidx + 8])
+        tend = toff + tlen
+        if mend < buflen and tend < buflen:
+            msg = buf[moff:mend]
+            tmsg = buf[toff:tend]
+        else:
+            raise IOError(0, 'File is corrupt', filename)
+
+        # See if we're looking at GNU .mo conventions for metadata
+        if mlen == 0:
+            # Catalog description
+            lastkey = key = None
+            for item in tmsg.splitlines():
+                item = item.strip()
+                if not item:
+                    continue
+                if ':' in item:
+                    key, value = item.split(':', 1)
+                    lastkey = key = key.strip().lower()
+                    value = value.strip()
+                    headers[key] = value
+                    if key == 'content-type' and 'charset=' in value:
+                        charset = value.split('charset=')[1]
+                elif lastkey:
+                    headers[lastkey] += '\n' + item  # continuation line
+
+        # Note: we unconditionally convert both msgids and msgstrs to
+        # Unicode using the character encoding specified in the charset
+        # parameter of the Content-Type header.  The gettext documentation
+        # strongly encourages msgids to be us-ascii, but some applications
+        # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
+        # traditional gettext applications, the msgid conversion will
+        # cause no problems since us-ascii should always be a subset of
+        # the charset encoding.  We may want to fall back to 8-bit msgids
+        # if the Unicode conversion fails.
+        if '\x00' in msg:
+            # Plural forms
+            msg = msg.split('\x00')
+            tmsg = tmsg.split('\x00')
+            if charset:
+                msg = [unicode(x, charset) for x in msg]
+                tmsg = [unicode(x, charset) for x in tmsg]
+        else:
+            if charset:
+                msg = unicode(msg, charset)
+                tmsg = unicode(tmsg, charset)
+        catalog[msg] = Message(msg, tmsg)
+
+        # advance to next entry in the seek tables
+        masteridx += 8
+        transidx += 8
+
+    catalog.mime_headers = headers.items()
+    return catalog
+
  def write_mo(fileobj, catalog, use_fuzzy=False):
      """Write a catalog to the specified file-like object using the GNU MO file
      format.
@@ -112,7 +212,7 @@ def write_mo(fileobj, catalog, use_fuzzy=False):
      offsets = koffsets + voffsets
  
      fileobj.write(struct.pack('Iiiiiii',
-        0x950412deL,                # magic
+        LE_MAGIC,                   # magic
          0,                          # version
          len(messages),              # number of entries
          7 * 4,                      # start of key index
diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py

index 2b041a503f6749c547e3d0645dff39f935237e3a..3e73e1342653867256aa567d054bd57f2b94ff86 100644 (file)
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -122,8 +122,8 @@ def read_po(fileobj, locale=None, domain=None, ignore_obsolete=False):
                     means it's a template)
      :param domain: the message domain
      :param ignore_obsolete: whether to ignore obsolete messages in the input
-    :return: an iterator over ``(message, translation, location)`` tuples
-    :rtype: ``iterator``
+    :return: a catalog object representing the parsed PO file
+    :rtype: `Catalog`
      """
      catalog = Catalog(locale=locale, domain=domain)
  
diff --git a/babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.mo b/babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.mo

new file mode 100644 (file)

index 0000000..21b1727

Binary files /dev/null and b/babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.mo differ
diff --git a/babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.po b/babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.po

new file mode 100644 (file)

index 0000000..c5c9748
--- /dev/null
+++ b/babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.po
@@ -0,0 +1,32 @@
+# German (Germany) translations for TestProject.
+# Copyright (C) 2007 FooBar, Inc.
+# This file is distributed under the same license as the TestProject
+# project.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: TestProject 0.1\n"
+"Report-Msgid-Bugs-To: bugs.address@email.tld\n"
+"POT-Creation-Date: 2007-04-01 15:30+0200\n"
+"PO-Revision-Date: 2007-07-30 22:18+0200\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: de_DE <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1)\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 0.9dev-r245\n"
+
+#. This will be a translator coment,
+#. that will include several lines
+#: project/file1.py:8
+msgid "bar"
+msgstr "Stange"
+
+#: project/file2.py:9
+msgid "foobar"
+msgid_plural "foobars"
+msgstr[0] "Fuhstange"
+msgstr[1] "Fuhstangen"
+
diff --git a/babel/messages/tests/mofile.py b/babel/messages/tests/mofile.py

index 3199c75f1fb2681ea5adc087993ee00d8f743ade..5a32b4cdecf919016637824fb835effbbb1a7b16 100644 (file)
--- a/babel/messages/tests/mofile.py
+++ b/babel/messages/tests/mofile.py
@@ -13,11 +13,33 @@
  
  import doctest
  import gettext
+import os
  import unittest
  from StringIO import StringIO
  
  from babel.messages import mofile, Catalog
  
+
+class ReadMoTestCase(unittest.TestCase):
+
+    def setUp(self):
+        self.datadir = os.path.join(os.path.dirname(__file__), 'data')
+
+    def test_basics(self):
+        mo_file = open(os.path.join(self.datadir, 'project', 'i18n', 'de',
+                                    'LC_MESSAGES', 'messages.mo'), 'rb')
+        try:
+            catalog = mofile.read_mo(mo_file)
+            self.assertEqual(2, len(catalog))
+            self.assertEqual('TestProject', catalog.project)
+            self.assertEqual('0.1', catalog.version)
+            self.assertEqual('Stange', catalog['bar'].string)
+            self.assertEqual(['Fuhstange', 'Fuhstangen'],
+                             catalog['foobar'].string)
+        finally:
+            mo_file.close()
+
+
  class WriteMoTestCase(unittest.TestCase):
  
      def test_sorting(self):
@@ -57,6 +79,7 @@ class WriteMoTestCase(unittest.TestCase):
  def suite():
      suite = unittest.TestSuite()
      suite.addTest(doctest.DocTestSuite(mofile))
+    suite.addTest(unittest.makeSuite(ReadMoTestCase))
      suite.addTest(unittest.makeSuite(WriteMoTestCase))
      return suite
author	Christopher Lenz <cmlenz@gmail.com>
	Tue, 10 Jun 2008 17:05:52 +0000 (17:05 +0000)
committer	Christopher Lenz <cmlenz@gmail.com>
	Tue, 10 Jun 2008 17:05:52 +0000 (17:05 +0000)
babel/messages/mofile.py		patch \| blob \| blame \| history
babel/messages/pofile.py		patch \| blob \| blame \| history
babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.mo	[new file with mode: 0644]	patch \| blob
babel/messages/tests/data/project/i18n/de/LC_MESSAGES/messages.po	[new file with mode: 0644]	patch \| blob
babel/messages/tests/mofile.py		patch \| blob \| blame \| history