From: Bruno Haible Date: Wed, 24 Sep 2025 21:28:14 +0000 (+0200) Subject: unilbrk: Fix handling of future emojis in rule LB30b. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5cb21152cedbcc672483dcff933082319f673a99;p=thirdparty%2Fgnulib.git unilbrk: Fix handling of future emojis in rule LB30b. * lib/unilbrk/lbrktables.h (LBP_ID): Renamed from LBP_ID1. (LBP_EBF): Renamed from LBP_ID2. * lib/gen-uni-tables.c (LBP_ID): Renamed from LBP_ID1. (LBP_EBF): Renamed from LBP_ID2. (get_lbp): Update. Move is_property_extended_pictographic invocation. (debug_output_org_lbp): Augment the information from the LineBreak.txt file with the Extended_Pictographic property. Print the LBP_ID and LBP_EBF values distinctly. (fill_org_lbp): Assign value LBP_EBF to all unassigned Extended_Pictographic characters. (debug_output_org_lbp): Handle LBP_EBF. (lbp_value_to_string): Update. (output_lbrk_rules_as_tables): Treat LBP_EBF like LBP_EB instead of like LBP_ID. * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop): Update. * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop): Likewise. * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop): Likewise. * lib/unilbrk/lbrkprop2.h: Regenerated. * lib/unilbrk/lbrktables.c: Regenerated. * tests/unilbrk/test-uc-possible-linebreaks.c (main): Skip test cases that contain the U+1F8FF character. * modules/unilbrk/*-linebreaks: Bump required libunistring version. --- diff --git a/ChangeLog b/ChangeLog index a942e53a40..41c05f5d31 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +2025-09-24 Bruno Haible + + unilbrk: Fix handling of future emojis in rule LB30b. + * lib/unilbrk/lbrktables.h (LBP_ID): Renamed from LBP_ID1. + (LBP_EBF): Renamed from LBP_ID2. + * lib/gen-uni-tables.c (LBP_ID): Renamed from LBP_ID1. + (LBP_EBF): Renamed from LBP_ID2. + (get_lbp): Update. Move is_property_extended_pictographic invocation. 
+ (debug_output_org_lbp): Augment the information from the LineBreak.txt + file with the Extended_Pictographic property. Print the LBP_ID and + LBP_EBF values distinctly. + (fill_org_lbp): Assign value LBP_EBF to all unassigned + Extended_Pictographic characters. + (debug_output_org_lbp): Handle LBP_EBF. + (lbp_value_to_string): Update. + (output_lbrk_rules_as_tables): Treat LBP_EBF like LBP_EB instead of like + LBP_ID. + * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop): + Update. + * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop): + Likewise. + * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop): + Likewise. + * lib/unilbrk/lbrkprop2.h: Regenerated. + * lib/unilbrk/lbrktables.c: Regenerated. + * tests/unilbrk/test-uc-possible-linebreaks.c (main): Skip test cases + that contain the U+1F8FF character. + * modules/unilbrk/*-linebreaks: Bump required libunistring version. + 2025-09-23 Bruno Haible gen-uni-tables: Fix compilation error (regression 2025-09-17). diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 00c52fe757..c7dcd560cb 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -7243,8 +7243,8 @@ enum LBP_H2 = 24, /* Hangul LV syllable */ LBP_H3 = 25, /* Hangul LVT syllable */ LBP_HL = 31, /* Hebrew letter */ - LBP_ID1 = 26, /* ideographic */ - LBP_ID2 = 27, /* ideographic and potential future emoji */ + LBP_ID = 26, /* ideographic */ + LBP_EBF = 27, /* future emoji base */ LBP_JL = 28, /* Hangul L Jamo */ LBP_JV = 29, /* Hangul V Jamo */ LBP_JT = 30, /* Hangul T Jamo */ @@ -7263,8 +7263,7 @@ enum LBP_CP = 100, /* LBP_CP1 or LBP_CP2 */ LBP_OP = 101, /* LBP_OP1 or LBP_OP2 */ LBP_QU = 102, /* LBP_QU1 or LBP_QU2 or LBP_QU3 */ - LBP_AL = 103, /* LBP_AL1 or LBP_AL2 */ - LBP_ID = 104 /* LBP_ID1 or LBP_ID2 */ + LBP_AL = 103 /* LBP_AL1 or LBP_AL2 */ }; /* Returns the line breaking EastAsian property for ch, as a bit. 
*/ @@ -8357,7 +8356,7 @@ get_lbp (unsigned int ch) || (ch >= 0x3248 && ch <= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */) attr |= (int64_t) 1 << LBP_AI; else - attr |= (int64_t) 1 << LBP_ID1; + attr |= (int64_t) 1 << LBP_ID; } /* ordinary alphabetic and symbol characters */ @@ -8385,7 +8384,7 @@ get_lbp (unsigned int ch) || ch == 0x2064 /* INVISIBLE PLUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */) - if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP1) | ((int64_t) 1 << LBP_OP2) | ((int64_t) 1 << LBP_QU1) | ((int64_t) 1 << LBP_QU2) | ((int64_t) 1 << LBP_QU3) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_AP) | ((int64_t) 1 << LBP_AK) | ((int64_t) 1 << LBP_AS) | ((int64_t) 1 << LBP_VI) | ((int64_t) 1 << LBP_VF) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID1) | ((int64_t) 1 << LBP_ID2) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM))) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP1) | ((int64_t) 1 << LBP_OP2) | ((int64_t) 1 << LBP_QU1) | ((int64_t) 1 << LBP_QU2) 
| ((int64_t) 1 << LBP_QU3) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_AP) | ((int64_t) 1 << LBP_AK) | ((int64_t) 1 << LBP_AS) | ((int64_t) 1 << LBP_VI) | ((int64_t) 1 << LBP_VF) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM))) && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */ && !(ch >= 0x3248 && ch <= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */) { @@ -8594,53 +8593,53 @@ get_lbp (unsigned int ch) else { /* Unassigned character. */ - if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */ - || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */ - || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */ - || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */ - || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */ - || ch == 0x1F0C0 /* reserved */ - || ch == 0x1F0D0 /* reserved */ - || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */ - || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */ - || ch == 0x1F12F /* reserved */ - || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */ - || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */ - || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */ - || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */ - || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */ - || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */ - || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */ - || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */ - || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */ - || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */ - || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved 
*/ - || (ch >= 0x1F8B0 && ch <= 0x1F8BB) /* reserved */ - || (ch >= 0x1F8C0 && ch <= 0x1F8C1) /* reserved */ - || (ch >= 0x1F900 && ch <= 0x1F90F) /* reserved */ - || ch == 0x1F91F /* reserved */ - || ch == 0x1F93F /* reserved */ - || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */ - || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */ - || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */ - || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */ - || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */ - || (ch >= 0x1F9C1 && ch <= 0x1FB92) /* reserved */ - || (ch >= 0x1FB94 && ch <= 0x1FBCA) /* reserved */ - || (ch >= 0x1FBF0 && ch <= 0x1FBF9) /* reserved */ - || (ch >= 0x1FC00 && ch <= 0x1FFFD) /* reserved */ - || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */ - || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C, - Supplementary Ideographic Plane (Plane 2) outside of blocks */ - || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement, - Supplementary Ideographic Plane (Plane 2) outside of blocks */ - || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */) + if (is_property_extended_pictographic (ch)) + attr |= (int64_t) 1 << LBP_EBF; + else { - if (is_property_extended_pictographic (ch)) - attr |= (int64_t) 1 << LBP_ID2; - else - attr |= (int64_t) 1 << LBP_ID1; + if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */ + || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */ + || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */ + || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */ + || ch == 0x1F0C0 /* reserved */ + || ch == 0x1F0D0 /* reserved */ + || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */ + || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */ + || ch == 0x1F12F /* reserved */ + || (ch >= 0x1F16C && ch <= 
0x1F16F) /* reserved */ + || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */ + || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */ + || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */ + || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */ + || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */ + || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */ + || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */ + || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */ + || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */ + || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */ + || (ch >= 0x1F8B0 && ch <= 0x1F8BB) /* reserved */ + || (ch >= 0x1F8C0 && ch <= 0x1F8C1) /* reserved */ + || (ch >= 0x1F900 && ch <= 0x1F90F) /* reserved */ + || ch == 0x1F91F /* reserved */ + || ch == 0x1F93F /* reserved */ + || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */ + || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */ + || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */ + || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */ + || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */ + || (ch >= 0x1F9C1 && ch <= 0x1FB92) /* reserved */ + || (ch >= 0x1FB94 && ch <= 0x1FBCA) /* reserved */ + || (ch >= 0x1FBF0 && ch <= 0x1FBF9) /* reserved */ + || (ch >= 0x1FC00 && ch <= 0x1FFFD) /* reserved */ + || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */ + || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */) + attr |= (int64_t) 1 << LBP_ID; } } @@ -8709,8 +8708,8 @@ debug_output_lbp (FILE *stream) PRINT_BIT(attr,LBP_H2); PRINT_BIT(attr,LBP_H3); PRINT_BIT(attr,LBP_HL); - PRINT_BIT_ALT(attr,LBP_ID1,LBP_ID); - PRINT_BIT_ALT(attr,LBP_ID2,LBP_ID); + PRINT_BIT(attr,LBP_ID); + 
PRINT_BIT(attr,LBP_EBF); PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); @@ -8768,8 +8767,20 @@ fill_org_lbp (const char *linebreak_filename) char field2[FIELDLEN]; int lineno = 0; + /* For unassigned characters (General Category "Cn") that have property + Extended_Pictographic, the LineBreak.txt file is inconsistent: + For some of them, such as U+1F02C, it specifies LBP_ID, which then triggers + e.g. rule (LB23a). For others, such as U+1F8FF, it specifies nothing, + which implies LBP_XX, which by rule (LB1) maps to LBP_AL, which then + triggers e.g. rule (LB28) "Do not break between alphabetics". This is + nonsense; it should better behave like LBP_EB. + To fix this, in view of rule (LB30b), we map all unassigned + Extended_Pictographic characters to LBP_EBF, and ensure that they + behave like LBP_EB. */ + for (i = 0; i < 0x110000; i++) - unicode_org_lbp[i] = LBP_XX; + unicode_org_lbp[i] = + (is_property_extended_pictographic (i) ? LBP_EBF : LBP_XX); stream = fopen (linebreak_filename, "r"); if (stream == NULL) @@ -8872,18 +8883,28 @@ fill_org_lbp (const char *linebreak_filename) field1, linebreak_filename, lineno); exit (1); } + bool unassigned1 = (strncmp (field2, " Cn", 3) == 0); + bool unassigned2 = (strstr (field2, " 0) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1); + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL1); /* Don't break inside multibyte characters. */ memset (p, UC_BREAK_PROHIBITED, n); @@ -150,7 +150,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, break; case LBP_CB: /* This is arbitrary. */ - prop = LBP_ID1; + prop = LBP_ID; break; case LBP_SA: /* We don't handle complex scripts yet. 
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index 606c4b6084..1e8ef67c87 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -43,7 +43,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, { if (n > 0) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1); + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL1); const uint32_t *s_end = s + n; /* We need 2 characters of lookahead: @@ -143,7 +143,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, break; case LBP_CB: /* This is arbitrary. */ - prop = LBP_ID1; + prop = LBP_ID; break; case LBP_SA: /* We don't handle complex scripts yet. diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index 2416e310a0..bc14209fc2 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -45,7 +45,7 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, { if (n > 0) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1); + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL1); /* Don't break inside multibyte characters. */ memset (p, UC_BREAK_PROHIBITED, n); @@ -150,7 +150,7 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, break; case LBP_CB: /* This is arbitrary. */ - prop = LBP_ID1; + prop = LBP_ID; break; case LBP_SA: /* We don't handle complex scripts yet. 
diff --git a/modules/unilbrk/u16-possible-linebreaks b/modules/unilbrk/u16-possible-linebreaks index c92d4b535a..60a5288108 100644 --- a/modules/unilbrk/u16-possible-linebreaks +++ b/modules/unilbrk/u16-possible-linebreaks @@ -13,7 +13,7 @@ unistr/u16-mbtouc-unsafe streq configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/u16-possible-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/u16-possible-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_U16_POSSIBLE_LINEBREAKS diff --git a/modules/unilbrk/u16-width-linebreaks b/modules/unilbrk/u16-width-linebreaks index 03a952c625..64f748f85c 100644 --- a/modules/unilbrk/u16-width-linebreaks +++ b/modules/unilbrk/u16-width-linebreaks @@ -11,7 +11,7 @@ uniwidth/width unistr/u16-mbtouc-unsafe configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/u16-width-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/u16-width-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_U16_WIDTH_LINEBREAKS diff --git a/modules/unilbrk/u32-possible-linebreaks b/modules/unilbrk/u32-possible-linebreaks index cc80a4453a..5921768fc0 100644 --- a/modules/unilbrk/u32-possible-linebreaks +++ b/modules/unilbrk/u32-possible-linebreaks @@ -12,7 +12,7 @@ unilbrk/tables streq configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/u32-possible-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/u32-possible-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_U32_POSSIBLE_LINEBREAKS diff --git a/modules/unilbrk/u32-width-linebreaks b/modules/unilbrk/u32-width-linebreaks index 7ad0540445..af5ee477a8 100644 --- a/modules/unilbrk/u32-width-linebreaks +++ b/modules/unilbrk/u32-width-linebreaks @@ -10,7 +10,7 @@ unilbrk/u32-possible-linebreaks uniwidth/width configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/u32-width-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/u32-width-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_U32_WIDTH_LINEBREAKS diff --git a/modules/unilbrk/u8-possible-linebreaks 
b/modules/unilbrk/u8-possible-linebreaks index 3dcaa0b915..bc47767d23 100644 --- a/modules/unilbrk/u8-possible-linebreaks +++ b/modules/unilbrk/u8-possible-linebreaks @@ -13,7 +13,7 @@ unistr/u8-mbtouc-unsafe streq configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/u8-possible-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/u8-possible-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_U8_POSSIBLE_LINEBREAKS diff --git a/modules/unilbrk/u8-width-linebreaks b/modules/unilbrk/u8-width-linebreaks index 44c9880b6a..89759bef0d 100644 --- a/modules/unilbrk/u8-width-linebreaks +++ b/modules/unilbrk/u8-width-linebreaks @@ -12,7 +12,7 @@ uniwidth/width unistr/u8-mbtouc-unsafe configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/u8-width-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/u8-width-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_U8_WIDTH_LINEBREAKS diff --git a/modules/unilbrk/ulc-possible-linebreaks b/modules/unilbrk/ulc-possible-linebreaks index 5b3a899b22..4f2b72a389 100644 --- a/modules/unilbrk/ulc-possible-linebreaks +++ b/modules/unilbrk/ulc-possible-linebreaks @@ -12,7 +12,7 @@ uniconv/u8-conv-from-enc c-ctype configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/ulc-possible-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/ulc-possible-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_ULC_POSSIBLE_LINEBREAKS diff --git a/modules/unilbrk/ulc-width-linebreaks b/modules/unilbrk/ulc-width-linebreaks index 35b6a5dc96..9dd29d274e 100644 --- a/modules/unilbrk/ulc-width-linebreaks +++ b/modules/unilbrk/ulc-width-linebreaks @@ -12,7 +12,7 @@ uniconv/u8-conv-from-enc c-ctype configure.ac: -gl_LIBUNISTRING_MODULE([1.3], [unilbrk/ulc-width-linebreaks]) +gl_LIBUNISTRING_MODULE([1.4], [unilbrk/ulc-width-linebreaks]) Makefile.am: if LIBUNISTRING_COMPILE_UNILBRK_ULC_WIDTH_LINEBREAKS diff --git a/tests/unilbrk/test-uc-possible-linebreaks.c b/tests/unilbrk/test-uc-possible-linebreaks.c index 05b86a3a45..a54699b6d0 100644 --- 
a/tests/unilbrk/test-uc-possible-linebreaks.c +++ b/tests/unilbrk/test-uc-possible-linebreaks.c @@ -137,28 +137,38 @@ main (int argc, char *argv[]) because regional indicators are supposed to come in pairs. */ if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF) && input[1] == 0x0308)) - /* There is a disagreement regarding whether to allow a line break - after a U+0020 SPACE character at the start of the text. - We consider that the start of the text is equivalent to the - state after a newline was seen; hence the loop starts with - property LBP_BK. By the rules (LB4,LB5,LB6) an extra line - break after a mandatory line break is undesired, even with - intervening spaces (because these rules come before (LB18)). - Whereas the LineBreakTest.txt file allows a line break after - the space. - Similarly when the first two characters at the start of the - text have property LBP_CM and LBP_ZWJ, respectively. (LB9). */ - if (!(((j == 1 || (j > 1 && ((input[j - 2] >= 0x000A && input[j - 2] <= 0x000D) || input[j - 2] == 0x0085))) - && input[j - 1] == 0x0020) - || ((j == 2 || (j > 2 && ((input[j - 3] >= 0x000A && input[j - 3] <= 0x000D) || input[j - 3] == 0x0085))) - && ((input[j - 2] == 0x0020 && input[j - 1] == 0x0020) - || (input[j - 2] == 0x0308 && input[j - 1] == 0x200D) - || (input[j - 2] == 0x200D && input[j - 1] == 0x0308))))) - matches &= (!(breaks[j] == UC_BREAK_PROHIBITED - || breaks[j] == UC_BREAK_MANDATORY - || breaks[j] == UC_BREAK_CR_BEFORE_LF) - || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) - == breaks_expected[j]; + /* It is nonsense to treat U+1F8FF differently than U+1F02C. + Both are unassigned Extended_Pictographic characters and + should therefore be treated like LBP_EB (or LBP_ID, if you + want), not like LBP_AL. See rule (LB30b). */ + if (!(input[j] == 0x1F8FF + || (j > 0 && input[j - 1] == 0x1F8FF) + /* Also consider intervening characters with property LBP_CM + or LBP_ZWJ, per (LB9). 
*/ + || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) + && input[j - 2] == 0x1F8FF))) + /* There is a disagreement regarding whether to allow a line break + after a U+0020 SPACE character at the start of the text. + We consider that the start of the text is equivalent to the + state after a newline was seen; hence the loop starts with + property LBP_BK. By the rules (LB4,LB5,LB6) an extra line + break after a mandatory line break is undesired, even with + intervening spaces (because these rules come before (LB18)). + Whereas the LineBreakTest.txt file allows a line break after + the space. + Similarly when the first two characters at the start of the + text have property LBP_CM and LBP_ZWJ, respectively. (LB9). */ + if (!(((j == 1 || (j > 1 && ((input[j - 2] >= 0x000A && input[j - 2] <= 0x000D) || input[j - 2] == 0x0085))) + && input[j - 1] == 0x0020) + || ((j == 2 || (j > 2 && ((input[j - 3] >= 0x000A && input[j - 3] <= 0x000D) || input[j - 3] == 0x0085))) + && ((input[j - 2] == 0x0020 && input[j - 1] == 0x0020) + || (input[j - 2] == 0x0308 && input[j - 1] == 0x200D) + || (input[j - 2] == 0x200D && input[j - 1] == 0x0308))))) + matches &= (!(breaks[j] == UC_BREAK_PROHIBITED + || breaks[j] == UC_BREAK_MANDATORY + || breaks[j] == UC_BREAK_CR_BEFORE_LF) + || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) + == breaks_expected[j]; } } if (!matches)