Issue #4163: Use unicode-friendly word splitting in the textwrap functions when given a unicode string.

author Antoine Pitrou <solipsis@pitrou.net>

Sat, 13 Dec 2008 23:12:30 +0000 (23:12 +0000)

committer Antoine Pitrou <solipsis@pitrou.net>

Sat, 13 Dec 2008 23:12:30 +0000 (23:12 +0000)
author Antoine Pitrou <solipsis@pitrou.net>
Sat, 13 Dec 2008 23:12:30 +0000 (23:12 +0000)
committer Antoine Pitrou <solipsis@pitrou.net>
Sat, 13 Dec 2008 23:12:30 +0000 (23:12 +0000)
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py

index 1eab90cfc44fd758598606ae88493e555e0b59fc..c91e242dfdb8655e26386e406e7b706efb46d52f 100644 (file)
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -174,7 +174,7 @@ What a mess!
          text = ("Python 1.0.0 was released on 1994-01-26.  Python 1.0.1 was\n"
                  "released on 1994-02-15.")
  
-        self.check_wrap(text, 30, ['Python 1.0.0 was released on',
+        self.check_wrap(text, 35, ['Python 1.0.0 was released on',
                                     '1994-01-26.  Python 1.0.1 was',
                                     'released on 1994-02-15.'])
          self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.',
@@ -353,6 +353,14 @@ What a mess!
              otext = self.wrapper.fill(text)
              assert isinstance(otext, unicode)
  
+        def test_no_split_at_umlaut(self):
+            text = u"Die Empf\xe4nger-Auswahl"
+            self.check_wrap(text, 13, [u"Die", u"Empf\xe4nger-", u"Auswahl"])
+
+        def test_umlaut_followed_by_dash(self):
+            text = u"aa \xe4\xe4-\xe4\xe4"
+            self.check_wrap(text, 7, [u"aa \xe4\xe4-", u"\xe4\xe4"])
+
      def test_split(self):
          # Ensure that the standard _split() method works as advertised
          # in the comments
diff --git a/Lib/textwrap.py b/Lib/textwrap.py

index 53f2f1bac7c34534c5569112b60e65a9c71b1dd3..192b43b1df9459f2bc183637b1a3b3197661427e 100644 (file)
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -84,16 +84,16 @@ class TextWrapper:
      # splits into
      #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
      # (after stripping out empty strings).
-    wordsep_re = re.compile(
+    wordsep_re = (
          r'(\s+|'                                  # any whitespace
-        r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
+        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
          r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
  
      # This less funky little regex just splits on recognized spaces. E.g.
      #   "Hello there -- you goof-ball, use the -b option!"
      # splits into
      #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(\s+)')
+    wordsep_simple_re = r'(\s+)'
  
      # XXX this is not locale- or charset-aware -- string.lowercase
      # is US-ASCII only (and therefore English-only)
@@ -160,10 +160,12 @@ class TextWrapper:
            'use', ' ', 'the', ' ', '-b', ' ', 'option!'
          otherwise.
          """
-        if self.break_on_hyphens is True:
-            chunks = self.wordsep_re.split(text)
+        flags = re.UNICODE if isinstance(text, unicode) else 0
+        if self.break_on_hyphens:
+            pat = self.wordsep_re
          else:
-            chunks = self.wordsep_simple_re.split(text)
+            pat = self.wordsep_simple_re
+        chunks = re.compile(pat, flags).split(text)
          chunks = filter(None, chunks)  # remove empty chunks
          return chunks
  
diff --git a/Misc/NEWS b/Misc/NEWS

index 9a3e33813f2c57ca7554574fb88e624be7aa585f..1ebcfe32466e3004276e1da89f231b4bb3dd3fd9 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -74,6 +74,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #4163: Use unicode-friendly word splitting in the textwrap functions
+  when given a unicode string.
+
  - Issue #4616: TarFile.utime(): Restore directory times on Windows.
  
  - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
author	Antoine Pitrou <solipsis@pitrou.net>
	Sat, 13 Dec 2008 23:12:30 +0000 (23:12 +0000)
committer	Antoine Pitrou <solipsis@pitrou.net>
	Sat, 13 Dec 2008 23:12:30 +0000 (23:12 +0000)
Lib/test/test_textwrap.py		patch \| blob \| blame \| history
Lib/textwrap.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history