From: Michael Paquier Date: Thu, 4 Jun 2026 22:50:15 +0000 (+0900) Subject: Fix off-by-one with NFC recomposition for Hangul U+11A7 (TBASE) X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=82116023e424cba4ac7adefd261bd382ad6e40c8;p=thirdparty%2Fpostgresql.git Fix off-by-one with NFC recomposition for Hangul U+11A7 (TBASE) The NFC recomposition treated TBASE (U+11A7) as a valid T syllable, which contradicts the Unicode specification: TBASE is one below the start of the valid range, which begins at U+11A8. As a result, TBASE was silently swallowed during normalization, leading to an incorrect result. A couple of regression tests are added to check more patterns with Hangul recomposition and decomposition, on top of a test to check the problem with TBASE. Diego has submitted the code fix, and I have written the tests. Author: Diego Frias Co-authored-by: Michael Paquier Discussion: https://postgr.es/m/B92ED640-7D4A-4505-B09F-3548F58CBB16@dzfrias.dev Backpatch-through: 14 --- diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index a7d0261aaf0..b5b629b8b6c 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -236,7 +236,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) /* Check if two current characters are LV and T */ else if (start >= SBASE && start < (SBASE + SCOUNT) && ((start - SBASE) % TCOUNT) == 0 && - code >= TBASE && code < (TBASE + TCOUNT)) + code > TBASE && code < (TBASE + TCOUNT)) { /* make syllable of form LVT */ uint32 tindex = code - TBASE; diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out index f2713a23268..ab0081165d2 100644 --- a/src/test/regress/expected/unicode.out +++ b/src/test/regress/expected/unicode.out @@ -87,3 +87,81 @@ ORDER BY num; SELECT is_normalized('abc', 'def'); -- run-time error ERROR: invalid normalization form: def +-- Hangul NFC recomposition tests +-- L+V -> LV composition (first and last) +SELECT 
normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first; + hangul_lv_first +----------------- + t +(1 row) + +SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last; + hangul_lv_last +---------------- + t +(1 row) + +-- LV+T -> LVT composition +SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t; + hangul_lvt_first_t +-------------------- + t +(1 row) + +SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t; + hangul_lvt_last_t +------------------- + t +(1 row) + +SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv; + hangul_lvt_last_lv +-------------------- + t +(1 row) + +-- L+V+T -> LVT composition +SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt; + hangul_full_lvt +----------------- + t +(1 row) + +SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt; + hangul_full_lvt +----------------- + t +(1 row) + +-- TBASE invalid T syllable +SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined; + hangul_tbase_not_combined +--------------------------- + t +(1 row) + +SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate; + hangul_lv_tbase_separate +-------------------------- + t +(1 row) + +-- Hangul NFD decomposition tests +SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv; + hangul_nfd_lv +--------------- + t +(1 row) + +SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt; + hangul_nfd_lvt +---------------- + t +(1 row) + +SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last; + hangul_nfd_last +----------------- + t +(1 row) + diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql index 63cd523f85f..95c5a7ac184 100644 --- a/src/test/regress/sql/unicode.sql +++ 
b/src/test/regress/sql/unicode.sql @@ -32,3 +32,23 @@ FROM ORDER BY num; SELECT is_normalized('abc', 'def'); -- run-time error + +-- Hangul NFC recomposition tests +-- L+V -> LV composition (first and last) +SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first; +SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last; +-- LV+T -> LVT composition +SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t; +SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t; +SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv; +-- L+V+T -> LVT composition +SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt; +SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt; +-- TBASE invalid T syllable +SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined; +SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate; + +-- Hangul NFD decomposition tests +SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv; +SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt; +SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;