Turn off difflib "autojunk" heuristic in fuzzy matching

author Jean Abou Samra <jean@abou-samra.fr>

Sun, 12 Feb 2023 01:49:28 +0000 (02:49 +0100)

committer Jean Abou Samra <jean@abou-samra.fr>

Sun, 12 Feb 2023 01:54:38 +0000 (02:54 +0100)
author Jean Abou Samra <jean@abou-samra.fr>
Sun, 12 Feb 2023 01:49:28 +0000 (02:49 +0100)
committer Jean Abou Samra <jean@abou-samra.fr>
Sun, 12 Feb 2023 01:54:38 +0000 (02:54 +0100)
diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py

index 190264346133c98dfccb475b7292579953c6f87a..a500e77b78566cf44d0146bf5423bd1cf83d9b2f 100644 (file)
--- a/babel/messages/catalog.py
+++ b/babel/messages/catalog.py
@@ -14,8 +14,9 @@ import re
  from collections import OrderedDict
  from collections.abc import Iterable, Iterator
  from copy import copy
-from difflib import get_close_matches
+from difflib import SequenceMatcher
  from email import message_from_string
+from heapq import nlargest
  from typing import TYPE_CHECKING
  
  from babel import __version__ as VERSION
@@ -31,6 +32,31 @@ if TYPE_CHECKING:
  
  __all__ = ['Message', 'Catalog', 'TranslationError']
  
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """A modified version of ``difflib.get_close_matches``.
+
+    It just passes ``autojunk=False`` to the ``SequenceMatcher``, to work
+    around https://github.com/python/cpython/issues/90825.
+    """
+    if not n >  0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+    result = []
+    s = SequenceMatcher(autojunk=False) # only line changed from difflib.py
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and \
+           s.quick_ratio() >= cutoff and \
+           s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+
+    # Move the best scorers to head of list
+    result = nlargest(n, result)
+    # Strip scores for the best n matches
+    return [x for score, x in result]
+
  
  PYTHON_FORMAT = re.compile(r'''
      \%
diff --git a/tests/messages/test_catalog.py b/tests/messages/test_catalog.py

index c2e7aeda0474cf7ceed757f830813f9e3f3cf201..b9d72bc3957b5c25f201cb8b9d65fd584482ec82 100644 (file)
--- a/tests/messages/test_catalog.py
+++ b/tests/messages/test_catalog.py
@@ -209,6 +209,25 @@ class CatalogTestCase(unittest.TestCase):
          assert cat['fooo'].string == 'Vohe'
          assert cat['fooo'].fuzzy is True
  
+    def test_update_fuzzy_matching_long_string(self):
+        lipsum = "\
+Lorem Ipsum is simply dummy text of the printing and typesetting \
+industry. Lorem Ipsum has been the industry's standard dummy text ever \
+since the 1500s, when an unknown printer took a galley of type and \
+scrambled it to make a type specimen book. It has survived not only \
+five centuries, but also the leap into electronic typesetting, \
+remaining essentially unchanged. It was popularised in the 1960s with \
+the release of Letraset sheets containing Lorem Ipsum passages, and \
+more recently with desktop publishing software like Aldus PageMaker \
+including versions of Lorem Ipsum."
+        cat = catalog.Catalog()
+        cat.add("ZZZZZZ " + lipsum, "foo")
+        tmpl = catalog.Catalog()
+        tmpl.add(lipsum + " ZZZZZZ")
+        cat.update(tmpl)
+        assert cat[lipsum + " ZZZZZZ"].fuzzy is True
+        assert len(cat.obsolete) == 0
+
      def test_update_without_fuzzy_matching(self):
          cat = catalog.Catalog()
          cat.add('fo', 'Voh')
author	Jean Abou Samra <jean@abou-samra.fr>
	Sun, 12 Feb 2023 01:49:28 +0000 (02:49 +0100)
committer	Jean Abou Samra <jean@abou-samra.fr>
	Sun, 12 Feb 2023 01:54:38 +0000 (02:54 +0100)
babel/messages/catalog.py		patch | blob | blame | history
tests/messages/test_catalog.py		patch | blob | blame | history