The NFC recomposition code treated TBASE (U+11A7) as a valid trailing
consonant (T) when combining an LV syllable with a following character.
This is wrong per the Unicode specification: TBASE is one below the
start of the T range, which begins at U+11A8.
As a result, a TBASE following an LV syllable was silently swallowed
during normalization, leading to an incorrect result.
Regression tests are added to cover more patterns of Hangul
recomposition and decomposition, in addition to tests that check the
problem with TBASE. Diego has submitted the code fix, and I have
written the tests.
Author: Diego Frias <mail@dzfrias.dev>
Co-authored-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/B92ED640-7D4A-4505-B09F-3548F58CBB16@dzfrias.dev
Backpatch-through: 14
/* Check if two current characters are LV and T */
else if (start >= SBASE && start < (SBASE + SCOUNT) &&
((start - SBASE) % TCOUNT) == 0 &&
- code >= TBASE && code < (TBASE + TCOUNT))
+ code > TBASE && code < (TBASE + TCOUNT))
{
/* make syllable of form LVT */
uint32 tindex = code - TBASE;
SELECT is_normalized('abc', 'def'); -- run-time error
ERROR: invalid normalization form: def
+-- Hangul NFC recomposition tests
+-- L+V -> LV composition (first and last)
+SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
+ hangul_lv_first
+-----------------
+ t
+(1 row)
+
+SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
+ hangul_lv_last
+----------------
+ t
+(1 row)
+
+-- LV+T -> LVT composition
+SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
+ hangul_lvt_first_t
+--------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
+ hangul_lvt_last_t
+-------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
+ hangul_lvt_last_lv
+--------------------
+ t
+(1 row)
+
+-- L+V+T -> LVT composition
+SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt;
+ hangul_full_lvt
+-----------------
+ t
+(1 row)
+
+SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt;
+ hangul_full_lvt
+-----------------
+ t
+(1 row)
+
+-- TBASE invalid T syllable
+SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
+ hangul_tbase_not_combined
+---------------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
+ hangul_lv_tbase_separate
+--------------------------
+ t
+(1 row)
+
+-- Hangul NFD decomposition tests
+SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
+ hangul_nfd_lv
+---------------
+ t
+(1 row)
+
+SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
+ hangul_nfd_lvt
+----------------
+ t
+(1 row)
+
+SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;
+ hangul_nfd_last
+-----------------
+ t
+(1 row)
+
ORDER BY num;
SELECT is_normalized('abc', 'def'); -- run-time error
+
+-- Hangul NFC recomposition tests
+-- L+V -> LV composition (first and last)
+SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
+SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
+-- LV+T -> LVT composition
+SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
+SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
+SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
+-- L+V+T -> LVT composition
+SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt;
+SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt;
+-- TBASE invalid T syllable
+SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
+SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
+
+-- Hangul NFD decomposition tests
+SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
+SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
+SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;