From: Xiang Zhang Date: Fri, 15 Jun 2018 13:26:55 +0000 (+0800) Subject: bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958) (GH... X-Git-Tag: v2.7.16rc1~248 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1889c4cbd62e200fa4cde3d6219e0aadf9bd8149;p=thirdparty%2FPython%2Fcpython.git bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958) (GH-7704) The Hangul composition boundary checks were wrong: the second character (vowel) must lie in [0x1161, 0x1176), not [0x1161, 0x1176], and the third character (trailing consonant) must lie in (0x11A7, 0x11C3), not [0x11A7, 0x11C3]. (cherry picked from commit d134809cd3764c6a634eab7bb8995e3e2eff14d5) Co-authored-by: Wonsup Yoon --- diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index c30ecf4c5b5b..11f2cda82099 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -204,6 +204,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): b = u'C\u0338' * 20 + u'\xC7' self.assertEqual(self.db.normalize('NFC', a), b) + def test_issue29456(self): + # Fix #29456 + u1176_str_a = u'\u1100\u1176\u11a8' + u1176_str_b = u'\u1100\u1176\u11a8' + u11a7_str_a = u'\u1100\u1175\u11a7' + u11a7_str_b = u'\uae30\u11a7' + u11c3_str_a = u'\u1100\u1175\u11c3' + u11c3_str_b = u'\uae30\u11c3' + self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) + self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) + self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + + def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, 'a') diff --git a/Misc/ACKS b/Misc/ACKS index 295b933f4b95..7ec29fab3c72 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1578,6 +1578,7 @@ Jason Yeo EungJun Yi Bob Yodlowski Danny Yoo +Wonsup Yoon Rory Yorke George Yoshida Kazuhiro Yoshida diff --git a/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst 
b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst new file mode 100644 index 000000000000..9b30bf654bd0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst @@ -0,0 +1 @@ +Fix bugs in hangul normalization: u1176, u11a7 and u11c3 diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 6b01fc7616b1..df6ffe343c3f 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -664,14 +664,18 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) pairs, since we always have decomposed data. */ if (LBase <= *i && *i < (LBase+LCount) && i + 1 < end && - VBase <= i[1] && i[1] <= (VBase+VCount)) { + VBase <= i[1] && i[1] < (VBase+VCount)) { + /* check L character is a modern leading consonant (0x1100 ~ 0x1112) + and V character is a modern vowel (0x1161 ~ 0x1175). */ int LIndex, VIndex; LIndex = i[0] - LBase; VIndex = i[1] - VBase; code = SBase + (LIndex*VCount+VIndex)*TCount; i+=2; if (i < end && - TBase <= *i && *i <= (TBase+TCount)) { + TBase < *i && *i < (TBase+TCount)) { + /* check T character is a modern trailing consonant + (0x11A8 ~ 0x11C2). */ code += *i-TBase; i++; }