From: Eoin Nugent <eoin@yelp.com>
Date: Mon, 11 Jan 2016 22:43:58 +0000 (-0800)
Subject: extraction: Babel now supports extraction by filename as well as by dir
X-Git-Tag: 2.3.1~27^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=19957e21470615d42fb7b6e2c1a580cd679d33c8;p=thirdparty%2Fbabel.git

extraction: Babel now supports extraction by filename as well as by dir

Either a filename or a directory can now be supplied as an extraction
input. For large codebases, this lets the consumer optimize their
string extraction process by, for instance, supplying only the
files that have actually changed on a given developer's branch
compared to master.

Relates to https://github.com/python-babel/babel/issues/253 -- I
don't want to say "fixes", but this change makes further optimization
unnecessary for most use cases.
---

diff --git a/babel/messages/extract.py b/babel/messages/extract.py
index 8fe3f606..8183d527 100644
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -142,28 +142,72 @@ def extract_from_dir(dirname=None, method_map=DEFAULT_MAPPING,
         dirnames.sort()
         filenames.sort()
         for filename in filenames:
-            filename = relpath(
-                os.path.join(root, filename).replace(os.sep, '/'),
-                dirname
-            )
-            for pattern, method in method_map:
-                if pathmatch(pattern, filename):
-                    filepath = os.path.join(absname, filename)
-                    options = {}
-                    for opattern, odict in options_map.items():
-                        if pathmatch(opattern, filename):
-                            options = odict
-                    if callback:
-                        callback(filename, method, options)
-                    for lineno, message, comments, context in \
-                          extract_from_file(method, filepath,
-                                            keywords=keywords,
-                                            comment_tags=comment_tags,
-                                            options=options,
-                                            strip_comment_tags=
-                                                strip_comment_tags):
-                        yield filename, lineno, message, comments, context
-                    break
+            filepath = os.path.join(root, filename).replace(os.sep, '/')
+
+            for message_tuple in check_and_call_extract_file(
+                filepath,
+                method_map,
+                options_map,
+                callback,
+                keywords,
+                comment_tags,
+                strip_comment_tags,
+                dirpath=absname,
+            ):
+                yield message_tuple
+
+
+def check_and_call_extract_file(filepath, method_map, options_map,
+                                callback, keywords, comment_tags,
+                                strip_comment_tags, dirpath=None):
+    """Checks if the given file matches an extraction method mapping, and if so, calls extract_from_file.
+
+    Note that the extraction method mappings are based relative to dirpath.
+    So, given an absolute path to a file `filepath`, we want to check using
+    just the relative path from `dirpath` to `filepath`.
+
+    :param filepath: An absolute path to a file that exists.
+    :param method_map: a list of ``(pattern, method)`` tuples that map
+                       extended glob patterns to extraction method names
+    :param options_map: a dictionary of additional options (optional)
+    :param callback: a function that is called for every file that messages
+                     are extracted from, just before the extraction itself is
+                     performed; the function is passed the filename, the name
+                     of the extraction method and the options dictionary as
+                     positional arguments, in that order
+    :param keywords: a dictionary mapping keywords (i.e. names of functions
+                     that should be recognized as translation functions) to
+                     tuples that specify which of their arguments contain
+                     localizable strings
+    :param comment_tags: a list of tags of translator comments to search for
+                         and include in the results
+    :param strip_comment_tags: a flag that if set to `True` causes all comment
+                               tags to be removed from the collected comments.
+    :param dirpath: the path to the directory to extract messages from.
+    """
+    # filename is the relative path from dirpath to the actual file
+    filename = relpath(filepath, dirpath)
+
+    for pattern, method in method_map:
+        if not pathmatch(pattern, filename):
+            continue
+
+        options = {}
+        for opattern, odict in options_map.items():
+            if pathmatch(opattern, filename):
+                options = odict
+        if callback:
+            callback(filename, method, options)
+        for message_tuple in extract_from_file(
+            method, filepath,
+            keywords=keywords,
+            comment_tags=comment_tags,
+            options=options,
+            strip_comment_tags=strip_comment_tags
+        ):
+            yield (filename, ) + message_tuple
+
+        break
 
 
 def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
diff --git a/babel/messages/frontend.py b/babel/messages/frontend.py
old mode 100755
new mode 100644
index d9919f63..8c6fd825
--- a/babel/messages/frontend.py
+++ b/babel/messages/frontend.py
@@ -25,7 +25,7 @@ from babel import Locale, localedata
 from babel._compat import StringIO, string_types
 from babel.core import UnknownLocaleError
 from babel.messages.catalog import Catalog
-from babel.messages.extract import DEFAULT_KEYWORDS, DEFAULT_MAPPING, extract_from_dir
+from babel.messages.extract import DEFAULT_KEYWORDS, DEFAULT_MAPPING, check_and_call_extract_file, extract_from_dir
 from babel.messages.mofile import write_mo
 from babel.messages.pofile import read_po, write_po
 from babel.util import LOCALTZ, odict
@@ -245,15 +245,15 @@ class extract_messages(Command):
          'output file. Separate multiple TAGs with commas(,)'),
         ('strip-comments', None,
          'strip the comment TAGs from the comments.'),
-        ('input-dirs=', None,
-         'directories that should be scanned for messages. Separate multiple '
-         'directories with commas(,)'),
+        ('input-paths=', None,
+         'files or directories that should be scanned for messages. Separate multiple '
+         'files or directories with commas(,)'),
     ]
     boolean_options = [
         'no-default-keywords', 'no-location', 'omit-header', 'no-wrap',
         'sort-output', 'sort-by-file', 'strip-comments'
     ]
-    as_args = 'input-dirs'
+    as_args = 'input-paths'
     multiple_value_options = ('add-comments',)
 
     def initialize_options(self):
@@ -265,7 +265,7 @@ class extract_messages(Command):
         self.no_location = False
         self.omit_header = False
         self.output_file = None
-        self.input_dirs = None
+        self.input_paths = None
         self.width = None
         self.no_wrap = False
         self.sort_output = False
@@ -300,17 +300,21 @@ class extract_messages(Command):
             raise DistutilsOptionError("'--sort-output' and '--sort-by-file' "
                                        "are mutually exclusive")
 
-        if self.input_dirs:
-            if isinstance(self.input_dirs, string_types):
-                self.input_dirs = re.split(',\s*', self.input_dirs)
+        if self.input_paths:
+            if isinstance(self.input_paths, string_types):
+                self.input_paths = re.split(',\s*', self.input_paths)
         else:
-            self.input_dirs = dict.fromkeys([
+            self.input_paths = dict.fromkeys([
                 k.split('.', 1)[0]
                 for k in (self.distribution.packages or ())
             ]).keys()
 
-        if not self.input_dirs:
-            raise DistutilsOptionError("no input directories specified")
+        if not self.input_paths:
+            raise DistutilsOptionError("no input files or directories specified")
+
+        for path in self.input_paths:
+            if not os.path.exists(path):
+                raise DistutilsOptionError("Input path: %s does not exist" % path)
 
         if self.add_comments:
             if isinstance(self.add_comments, string_types):
@@ -333,29 +337,51 @@ class extract_messages(Command):
                               copyright_holder=self.copyright_holder,
                               charset=self.charset)
 
-            for dirname, (method_map, options_map) in mappings.items():
+            for path, (method_map, options_map) in mappings.items():
                 def callback(filename, method, options):
                     if method == 'ignore':
                         return
-                    filepath = os.path.normpath(os.path.join(dirname, filename))
+
+                    # If we explicitly provide a full filepath, just use that.
+                    # Otherwise, path will be the directory path and filename
+                    # is the relative path from that dir to the file.
+                    # So we can join those to get the full filepath.
+                    if os.path.isfile(path):
+                        filepath = path
+                    else:
+                        filepath = os.path.normpath(os.path.join(path, filename))
+
                     optstr = ''
                     if options:
                         optstr = ' (%s)' % ', '.join(['%s="%s"' % (k, v) for
                                                       k, v in options.items()])
                     self.log.info('extracting messages from %s%s', filepath, optstr)
 
-                extracted = extract_from_dir(
-                    dirname, method_map, options_map,
-                    keywords=self._keywords,
-                    comment_tags=self.add_comments,
-                    callback=callback,
-                    strip_comment_tags=self.strip_comments
-                )
+                if os.path.isfile(path):
+                    current_dir = os.getcwd()
+                    extracted = check_and_call_extract_file(
+                        path, method_map, options_map,
+                        callback, self._keywords, self.add_comments,
+                        self.strip_comments, current_dir
+                    )
+                else:
+                    extracted = extract_from_dir(
+                        path, method_map, options_map,
+                        keywords=self._keywords,
+                        comment_tags=self.add_comments,
+                        callback=callback,
+                        strip_comment_tags=self.strip_comments
+                    )
                 for filename, lineno, message, comments, context in extracted:
-                    filepath = os.path.normpath(os.path.join(dirname, filename))
+                    if os.path.isfile(path):
+                        filepath = filename  # already normalized
+                    else:
+                        filepath = os.path.normpath(os.path.join(path, filename))
+
                     catalog.add(message, None, [(filepath, lineno)],
                                 auto_comments=comments, context=context)
 
+
             self.log.info('writing PO template file to %s' % self.output_file)
             write_po(outfile, catalog, width=self.width,
                      no_location=self.no_location,
@@ -370,14 +396,14 @@ class extract_messages(Command):
             fileobj = open(self.mapping_file, 'U')
             try:
                 method_map, options_map = parse_mapping(fileobj)
-                for dirname in self.input_dirs:
-                    mappings[dirname] = method_map, options_map
+                for path in self.input_paths:
+                    mappings[path] = method_map, options_map
             finally:
                 fileobj.close()
 
         elif getattr(self.distribution, 'message_extractors', None):
             message_extractors = self.distribution.message_extractors
-            for dirname, mapping in message_extractors.items():
+            for path, mapping in message_extractors.items():
                 if isinstance(mapping, string_types):
                     method_map, options_map = parse_mapping(StringIO(mapping))
                 else:
@@ -385,11 +411,11 @@ class extract_messages(Command):
                     for pattern, method, options in mapping:
                         method_map.append((pattern, method))
                         options_map[pattern] = options or {}
-                mappings[dirname] = method_map, options_map
+                mappings[path] = method_map, options_map
 
         else:
-            for dirname in self.input_dirs:
-                mappings[dirname] = DEFAULT_MAPPING, {}
+            for path in self.input_paths:
+                mappings[path] = DEFAULT_MAPPING, {}
 
         return mappings
 
diff --git a/tests/messages/test_frontend.py b/tests/messages/test_frontend.py
index f958b805..975876a5 100644
--- a/tests/messages/test_frontend.py
+++ b/tests/messages/test_frontend.py
@@ -108,8 +108,13 @@ class ExtractMessagesTestCase(unittest.TestCase):
         self.cmd.sort_by_file = True
         self.assertRaises(DistutilsOptionError, self.cmd.finalize_options)
 
-    def test_input_dirs_is_treated_as_list(self):
-        self.cmd.input_dirs = self.datadir
+    def test_invalid_file_or_dir_input_path(self):
+        self.cmd.input_paths = 'nonexistent_path'
+        self.cmd.output_file = 'dummy'
+        self.assertRaises(DistutilsOptionError, self.cmd.finalize_options)
+
+    def test_input_paths_is_treated_as_list(self):
+        self.cmd.input_paths = self.datadir
         self.cmd.output_file = self._pot_file()
         self.cmd.finalize_options()
         self.cmd.run()
@@ -120,12 +125,12 @@ class ExtractMessagesTestCase(unittest.TestCase):
         self.assertEqual(1, len(msg.locations))
         self.assertTrue('file1.py' in msg.locations[0][0])
 
-    def test_input_dirs_handle_spaces_after_comma(self):
-        self.cmd.input_dirs = 'foo,  bar'
+    def test_input_paths_handle_spaces_after_comma(self):
+        self.cmd.input_paths = '%s,  %s' % (this_dir, self.datadir)
         self.cmd.output_file = self._pot_file()
         self.cmd.finalize_options()
 
-        self.assertEqual(['foo', 'bar'], self.cmd.input_dirs)
+        self.assertEqual([this_dir, self.datadir], self.cmd.input_paths)
 
     def test_extraction_with_default_mapping(self):
         self.cmd.copyright_holder = 'FooBar, Inc.'
@@ -861,6 +866,54 @@ msgid_plural "foobars"
 msgstr[0] ""
 msgstr[1] ""
 
+""" % {'version': VERSION,
+       'year': time.strftime('%Y'),
+       'date': format_datetime(datetime.now(LOCALTZ), 'yyyy-MM-dd HH:mmZ',
+                               tzinfo=LOCALTZ, locale='en')}
+        with open(pot_file, 'U') as f:
+            actual_content = f.read()
+        self.assertEqual(expected_content, actual_content)
+
+    def test_extract_with_exact_file(self):
+        """Tests that we can call extract with a particular file and only
+        strings from that file get extracted. (Note the absence of strings from file1.py)
+        """
+        pot_file = self._pot_file()
+        file_to_extract = os.path.join(self.datadir, 'project', 'file2.py')
+        self.cli.run(sys.argv + ['extract',
+            '--copyright-holder', 'FooBar, Inc.',
+            '--project', 'TestProject', '--version', '0.1',
+            '--msgid-bugs-address', 'bugs.address@email.tld',
+            '--mapping', os.path.join(self.datadir, 'mapping.cfg'),
+            '-c', 'TRANSLATOR', '-c', 'TRANSLATORS:',
+            '-o', pot_file, file_to_extract])
+        self.assert_pot_file_exists()
+        expected_content = r"""# Translations template for TestProject.
+# Copyright (C) %(year)s FooBar, Inc.
+# This file is distributed under the same license as the TestProject
+# project.
+# FIRST AUTHOR <EMAIL@ADDRESS>, %(year)s.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: TestProject 0.1\n"
+"Report-Msgid-Bugs-To: bugs.address@email.tld\n"
+"POT-Creation-Date: %(date)s\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel %(version)s\n"
+
+#: project/file2.py:9
+msgid "foobar"
+msgid_plural "foobars"
+msgstr[0] ""
+msgstr[1] ""
+
 """ % {'version': VERSION,
        'year': time.strftime('%Y'),
        'date': format_datetime(datetime.now(LOCALTZ), 'yyyy-MM-dd HH:mmZ',