From: Victor Stinner Date: Thu, 7 Jul 2011 23:45:13 +0000 (+0200) Subject: Issue #12016: Multibyte CJK decoders now resynchronize faster X-Git-Tag: v3.3.0a1~1937^2~7 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2cded9c3f31d2fea4b033f44eaa828e508f03391;p=thirdparty%2FPython%2Fcpython.git Issue #12016: Multibyte CJK decoders now resynchronize faster They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. --- diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index e5e18051a511..990085e0f00a 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -68,6 +68,29 @@ New, Improved, and Deprecated Modules * Stub +codecs +------ + +Multibyte CJK decoders now resynchronize faster. They only ignore the first +byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', +'replace') gives '�\n' instead of '�'. + +(http://bugs.python.org/issue12016) + +Don't reset incremental encoders of CJK codecs at each call to their encode() +method anymore. For example: :: + + $ ./python -q + >>> import codecs + >>> encoder = codecs.getincrementalencoder('hz')('strict') + >>> b''.join(encoder.encode(x) for x in '\u52ff\u65bd\u65bc\u4eba\u3002 Bye.') + b'~{NpJ)l6HK!#~} Bye.' + +This example gives b'~{Np~}~{J)~}~{l6~}~{HK~}~{!#~} Bye.' with older Python +versions. + +(http://bugs.python.org/issue12100) + faulthandler ------------ diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py index dca9f10b8365..ee3d1650cb51 100644 --- a/Lib/test/test_codecencodings_cn.py +++ b/Lib/test/test_codecencodings_cn.py @@ -15,8 +15,8 @@ class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x81\x81\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\u804a"), - (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"), + (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"), + (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"), (b"abc\x81\x81\xc1\xc4", "ignore", "abc\u804a"), (b"\xc1\x64", "strict", None), ) @@ -28,8 +28,8 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"), (b"\x83\x34\x83\x31", "strict", None), ("\u30fb", "strict", None), @@ -42,11 +42,14 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"), - (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd\u804a"), + (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"), ("\u30fb", "strict", b"\x819\xa79"), + (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'), + (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'), + (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'), ) has_iso10646 = True @@ -74,9 +77,11 @@ class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase): '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002' 'Bye.\n'), # invalid bytes - (b'ab~cd', 'replace', 'ab\uFFFDd'), + (b'ab~cd', 'replace', 'ab\uFFFDcd'), (b'ab\xffcd', 'replace', 'ab\uFFFDcd'), (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'), + (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'), + (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"), ) def test_main(): diff --git a/Lib/test/test_codecencodings_hk.py b/Lib/test/test_codecencodings_hk.py index ccdc0b4c5544..520df4349464 100644 --- a/Lib/test/test_codecencodings_hk.py +++ b/Lib/test/test_codecencodings_hk.py @@ -15,8 +15,8 @@ class Test_Big5HKSCS(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u8b10"), ) diff --git a/Lib/test/test_codecencodings_jp.py b/Lib/test/test_codecencodings_jp.py index f56a3738961a..87e4812482d2 100644 --- a/Lib/test/test_codecencodings_jp.py +++ b/Lib/test/test_codecencodings_jp.py @@ -15,50 +15,57 @@ class Test_CP932(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x81\x00\x81\x00\x82\x84", "strict", None), (b"abc\xf8", "strict", None), - (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\uff44"), - (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"), - (b"abc\x81\x00\x82\x84", "ignore", "abc\uff44"), + (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\x00\uff44"), + (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\x00\uff44\ufffd"), + (b"abc\x81\x00\x82\x84", "ignore", "abc\x00\uff44"), + (b"ab\xEBxy", "replace", "ab\uFFFDxy"), + (b"ab\xF0\x39xy", "replace", "ab\uFFFD9xy"), + (b"ab\xEA\xF0xy", "replace", 'ab\ufffd\ue038y'), # sjis vs cp932 (b"\\\x7e", "replace", "\\\x7e"), (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\uff3c\u2225\uff0d"), ) +euc_commontests = ( + # invalid bytes + (b"abc\x80\x80\xc1\xc4", "strict", None), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u7956"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u7956\ufffd"), + (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u7956"), + (b"abc\xc8", "strict", None), + (b"abc\x8f\x83\x83", "replace", "abc\ufffd\ufffd\ufffd"), + (b"\x82\xFCxy", "replace", "\ufffd\ufffdxy"), + (b"\xc1\x64", "strict", None), + (b"\xa1\xc0", "strict", "\uff3c"), + (b"\xa1\xc0\\", "strict", "\uff3c\\"), + (b"\x8eXY", "replace", "\ufffdXY"), +) + +class Test_EUC_JIS_2004(test_multibytecodec_support.TestBase, + unittest.TestCase): + encoding = 'euc_jis_2004' + tstring = test_multibytecodec_support.load_teststring('euc_jisx0213') + codectests = euc_commontests + xmlcharnametest = ( + "\xab\u211c\xbb = \u2329\u1234\u232a", + b"\xa9\xa8ℜ\xa9\xb2 = ⟨ሴ⟩" + ) + class Test_EUC_JISX0213(test_multibytecodec_support.TestBase, unittest.TestCase): encoding = 'euc_jisx0213' tstring = test_multibytecodec_support.load_teststring('euc_jisx0213') - codectests = ( - # invalid bytes - (b"abc\x80\x80\xc1\xc4", "strict", None), - (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"), - (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u7956"), - (b"abc\x8f\x83\x83", "replace", "abc\ufffd"), - (b"\xc1\x64", "strict", None), - (b"\xa1\xc0", "strict", "\uff3c"), - ) + codectests = euc_commontests xmlcharnametest = ( "\xab\u211c\xbb = \u2329\u1234\u232a", b"\xa9\xa8ℜ\xa9\xb2 = ⟨ሴ⟩" ) -eucjp_commontests = ( - (b"abc\x80\x80\xc1\xc4", "strict", None), - (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"), - (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u7956"), - (b"abc\x8f\x83\x83", "replace", "abc\ufffd"), - (b"\xc1\x64", "strict", None), -) - class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase, unittest.TestCase): encoding = 'euc_jp' tstring = test_multibytecodec_support.load_teststring('euc_jp') - codectests = eucjp_commontests + ( - (b"\xa1\xc0\\", "strict", "\uff3c\\"), + codectests = euc_commontests + ( ("\xa5", "strict", b"\x5c"), ("\u203e", "strict", b"\x7e"), ) @@ -66,8 +73,6 @@ class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase, shiftjis_commonenctests = ( (b"abc\x80\x80\x82\x84", "strict", None), (b"abc\xf8", "strict", None), - (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"), - (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"), (b"abc\x80\x80\x82\x84def", "ignore", "abc\uff44def"), ) @@ -75,20 +80,41 @@ class Test_SJIS_COMPAT(test_multibytecodec_support.TestBase, unittest.TestCase): encoding = 'shift_jis' tstring = test_multibytecodec_support.load_teststring('shift_jis') codectests = shiftjis_commonenctests + ( + (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"), + (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"), + (b"\\\x7e", "strict", "\\\x7e"), (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\uff3c\u2016\u2212"), + (b"abc\x81\x39", "replace", "abc\ufffd9"), + (b"abc\xEA\xFC", "replace", "abc\ufffd\ufffd"), + (b"abc\xFF\x58", "replace", "abc\ufffdX"), + ) + +class Test_SJIS_2004(test_multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'shift_jis_2004' + tstring = test_multibytecodec_support.load_teststring('shift_jis') + codectests = shiftjis_commonenctests + ( + (b"\\\x7e", "strict", "\xa5\u203e"), + (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\\\u2016\u2212"), + (b"abc\xEA\xFC", "strict", "abc\u64bf"), + (b"\x81\x39xy", "replace", "\ufffd9xy"), + (b"\xFF\x58xy", "replace", "\ufffdXxy"), + (b"\x80\x80\x82\x84xy", "replace", "\ufffd\ufffd\uff44xy"), + (b"\x80\x80\x82\x84\x88xy", "replace", "\ufffd\ufffd\uff44\u5864y"), + (b"\xFC\xFBxy", "replace", '\ufffd\u95b4y'), + ) + xmlcharnametest = ( + "\xab\u211c\xbb = \u2329\u1234\u232a", + b"\x85Gℜ\x85Q = ⟨ሴ⟩" ) class Test_SJISX0213(test_multibytecodec_support.TestBase, unittest.TestCase): encoding = 'shift_jisx0213' tstring = test_multibytecodec_support.load_teststring('shift_jisx0213') - codectests = ( - # invalid bytes - (b"abc\x80\x80\x82\x84", "strict", None), - (b"abc\xf8", "strict", None), - (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"), - (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"), - (b"abc\x80\x80\x82\x84def", "ignore", "abc\uff44def"), + codectests = shiftjis_commonenctests + ( + (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"), + (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"), + # sjis vs cp932 (b"\\\x7e", "replace", "\xa5\u203e"), (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\x5c\u2016\u2212"), diff --git a/Lib/test/test_codecencodings_kr.py b/Lib/test/test_codecencodings_kr.py index de4da7f5b6df..4997e8349b0b 100644 --- a/Lib/test/test_codecencodings_kr.py +++ b/Lib/test/test_codecencodings_kr.py @@ -15,8 +15,8 @@ class Test_CP949(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\uc894"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\uc894"), ) @@ -27,8 +27,8 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", 'abc\ufffd\ufffd\uc894'), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\uc894"), # composed make-up sequence errors @@ -40,13 +40,14 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase): (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", "\uc4d4"), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", "\uc4d4x"), - (b"a\xa4\xd4\xa4\xb6\xa4", "replace", "a\ufffd"), + (b"a\xa4\xd4\xa4\xb6\xa4", "replace", 'a\ufffd'), (b"\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None), (b"\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None), - (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", "\ufffd"), - (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", "\ufffd"), - (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", "\ufffd"), + (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", '\ufffd\u6e21\ufffd\u3160\ufffd'), + (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", '\ufffd\u6e21\ub544\ufffd\ufffd'), + (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", '\ufffd\u6e21\ub544\u572d\ufffd'), + (b"\xa4\xd4\xff\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "replace", '\ufffd\ufffd\ufffd\uc4d4'), (b"\xc1\xc4", "strict", "\uc894"), ) @@ -57,9 +58,13 @@ class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ucd27"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ucd27\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\ucd27"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\ucd27\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\ucd27"), + (b"\xD8abc", "replace", "\uFFFDabc"), + (b"\xD8\xFFabc", "replace", "\uFFFD\uFFFDabc"), + (b"\x84bxy", "replace", "\uFFFDbxy"), + (b"\x8CBxy", "replace", "\uFFFDBxy"), ) def test_main(): diff --git a/Lib/test/test_codecencodings_tw.py b/Lib/test/test_codecencodings_tw.py index 12d3c9fa041e..f2f3c1802d51 100644 --- a/Lib/test/test_codecencodings_tw.py +++ b/Lib/test/test_codecencodings_tw.py @@ -15,8 +15,8 @@ class Test_Big5(test_multibytecodec_support.TestBase, unittest.TestCase): # invalid bytes (b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\xc8", "strict", None), - (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"), - (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"), + (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"), + (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u8b10"), ) diff --git a/Lib/test/test_codecmaps_tw.py b/Lib/test/test_codecmaps_tw.py index 6db5091fc3ae..412b9de8b25a 100644 --- a/Lib/test/test_codecmaps_tw.py +++ b/Lib/test/test_codecmaps_tw.py @@ -23,6 +23,9 @@ class TestCP950Map(test_multibytecodec_support.TestBase_Mapping, (b'\xa2\xcc', '\u5341'), (b'\xa2\xce', '\u5345'), ] + codectests = ( + (b"\xFFxy", "replace", "\ufffdxy"), + ) def test_main(): support.run_unittest(__name__) diff --git a/Misc/NEWS b/Misc/NEWS index f742f8afa2e5..404185d73063 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -219,6 +219,10 @@ Core and Builtins Library ------- +- Issue #12016: Multibyte CJK decoders now resynchronize faster. They only + ignore the first byte of an invalid byte sequence. For example, + b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. + - Issue #12459: time.sleep() now raises a ValueError if the sleep length is negative, instead of an infinite sleep on Windows or raising an IOError on Linux for example, to have the same behaviour on all platforms. diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index ab4e6593322e..9e9e96c4d1c1 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -85,7 +85,7 @@ DECODER(gb2312) TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) { NEXT(2, 1) } - else return 2; + else return 1; } return 0; @@ -141,7 +141,7 @@ DECODER(gbk) REQUIRE_INBUF(2) GBK_DECODE(c, IN2, **outbuf) - else return 2; + else return 1; NEXT(2, 1) } @@ -267,7 +267,7 @@ DECODER(gb18030) c3 = IN3; c4 = IN4; if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) - return 4; + return 1; c -= 0x81; c2 -= 0x30; c3 -= 0x81; c4 -= 0x30; @@ -292,12 +292,12 @@ DECODER(gb18030) continue; } } - return 4; + return 1; } GBK_DECODE(c, c2, **outbuf) else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); - else return 2; + else return 1; NEXT(2, 1) } @@ -400,7 +400,7 @@ DECODER(hz) else if (c2 == '\n') ; /* line-continuation */ else - return 2; + return 1; NEXT(2, 0); continue; } @@ -419,7 +419,7 @@ DECODER(hz) NEXT(2, 1) } else - return 2; + return 1; } } diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c index 558a42f89c8c..d3ad04b6dd44 100644 --- a/Modules/cjkcodecs/_codecs_hk.c +++ b/Modules/cjkcodecs/_codecs_hk.c @@ -161,7 +161,7 @@ DECODER(big5hkscs) case 0x8864: WRITE2(0x00ca, 0x030c); break; case 0x88a3: WRITE2(0x00ea, 0x0304); break; case 0x88a5: WRITE2(0x00ea, 0x030c); break; - default: return 2; + default: return 1; } NEXT(2, 2) /* all decoded codepoints are pairs, above. */ diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c index a05e01b32e5a..a500696e9312 100644 --- a/Modules/cjkcodecs/_codecs_jp.c +++ b/Modules/cjkcodecs/_codecs_jp.c @@ -112,7 +112,7 @@ DECODER(cp932) TRYMAP_DEC(cp932ext, **outbuf, c, c2); else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) - return 2; + return 1; c = (c < 0xe0 ? c - 0x81 : c - 0xc1); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); @@ -120,7 +120,7 @@ DECODER(cp932) c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; TRYMAP_DEC(jisx0208, **outbuf, c, c2); - else return 2; + else return 1; } else if (c >= 0xf0 && c <= 0xf9) { if ((c2 >= 0x40 && c2 <= 0x7e) || @@ -128,10 +128,10 @@ DECODER(cp932) OUT1(0xe000 + 188 * (c - 0xf0) + (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41)) else - return 2; + return 1; } else - return 2; + return 1; NEXT(2, 1) } @@ -256,7 +256,7 @@ DECODER(euc_jis_2004) NEXT(2, 1) } else - return 2; + return 1; } else if (c == 0x8f) { unsigned char c2, c3; @@ -274,7 +274,7 @@ DECODER(euc_jis_2004) continue; } else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ; - else return 3; + else return 1; NEXT(3, 1) } else { @@ -300,7 +300,7 @@ DECODER(euc_jis_2004) NEXT(2, 2) continue; } - else return 2; + else return 1; NEXT(2, 1) } } @@ -388,7 +388,7 @@ DECODER(euc_jp) NEXT(2, 1) } else - return 2; + return 1; } else if (c == 0x8f) { unsigned char c2, c3; @@ -401,7 +401,7 @@ DECODER(euc_jp) NEXT(3, 1) } else - return 3; + return 1; } else { unsigned char c2; @@ -417,7 +417,7 @@ DECODER(euc_jp) #endif TRYMAP_DEC(jisx0208, **outbuf, c ^ 0x80, c2 ^ 0x80) ; - else return 2; + else return 1; NEXT(2, 1) } } @@ -502,7 +502,7 @@ DECODER(shift_jis) REQUIRE_INBUF(2) c2 = IN2; if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) - return 2; + return 1; c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); @@ -522,10 +522,10 @@ DECODER(shift_jis) continue; } else - return 2; + return 1; } else - return 2; + return 1; NEXT(1, 1) /* JIS X 0201 */ } @@ -645,7 +645,7 @@ DECODER(shift_jis_2004) REQUIRE_INBUF(2) c2 = IN2; if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) - return 2; + return 1; c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); @@ -671,7 +671,7 @@ DECODER(shift_jis_2004) NEXT_OUT(2) } else - return 2; + return 1; NEXT_IN(2) } else { /* Plane 2 */ @@ -689,13 +689,13 @@ DECODER(shift_jis_2004) continue; } else - return 2; + return 1; NEXT(2, 1) } continue; } else - return 2; + return 1; NEXT(1, 1) /* JIS X 0201 */ } diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c index 9272e363e1f7..f5697dd2f698 100644 --- a/Modules/cjkcodecs/_codecs_kr.c +++ b/Modules/cjkcodecs/_codecs_kr.c @@ -123,7 +123,7 @@ DECODER(euc_kr) if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE || (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE || (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE) - return 8; + return 1; c = (*inbuf)[3]; if (0xa1 <= c && c <= 0xbe) @@ -143,7 +143,7 @@ DECODER(euc_kr) jong = NONE; if (cho == NONE || jung == NONE || jong == NONE) - return 8; + return 1; OUT1(0xac00 + cho*588 + jung*28 + jong); NEXT(8, 1) @@ -152,7 +152,7 @@ DECODER(euc_kr) NEXT(2, 1) } else - return 2; + return 1; } return 0; @@ -208,7 +208,7 @@ DECODER(cp949) REQUIRE_INBUF(2) TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80); else TRYMAP_DEC(cp949ext, **outbuf, c, IN2); - else return 2; + else return 1; NEXT(2, 1) } @@ -375,7 +375,7 @@ DECODER(johab) i_jong = johabidx_jongseong[c_jong]; if (i_cho == NONE || i_jung == NONE || i_jong == NONE) - return 2; + return 1; /* we don't use U+1100 hangul jamo yet. */ if (i_cho == FILL) { @@ -391,7 +391,7 @@ DECODER(johab) OUT1(0x3100 | johabjamo_jungseong[c_jung]) else - return 2; + return 1; } } else { if (i_jung == FILL) { @@ -399,7 +399,7 @@ DECODER(johab) OUT1(0x3100 | johabjamo_choseong[c_cho]) else - return 2; + return 1; } else OUT1(0xac00 + @@ -414,7 +414,7 @@ DECODER(johab) c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) || (c2 & 0x7f) == 0x7f || (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3))) - return 2; + return 1; else { unsigned char t1, t2; @@ -425,7 +425,7 @@ DECODER(johab) t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21; TRYMAP_DEC(ksx1001, **outbuf, t1, t2); - else return 2; + else return 1; NEXT(2, 1) } } diff --git a/Modules/cjkcodecs/_codecs_tw.c b/Modules/cjkcodecs/_codecs_tw.c index 38cf7239b4b8..916298d16740 100644 --- a/Modules/cjkcodecs/_codecs_tw.c +++ b/Modules/cjkcodecs/_codecs_tw.c @@ -55,7 +55,7 @@ DECODER(big5) TRYMAP_DEC(big5, **outbuf, c, IN2) { NEXT(2, 1) } - else return 2; + else return 1; } return 0; @@ -109,7 +109,7 @@ DECODER(cp950) TRYMAP_DEC(cp950ext, **outbuf, c, IN2); else TRYMAP_DEC(big5, **outbuf, c, IN2); - else return 2; + else return 1; NEXT(2, 1) }