]> git.ipfire.org Git - thirdparty/e2fsprogs.git/blob - lib/ext2fs/nls_utf8-norm.c
ext2fs: merge sparse fixes for unicode normalization
[thirdparty/e2fsprogs.git] / lib / ext2fs / nls_utf8-norm.c
1 /*
2 * Copyright (c) 2014 SGI.
3 * All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16 /*
17 * This code is adapted from the Linux Kernel. We have a
18 * userspace version here such that the hashes will match that
19 * implementation.
20 */
21
22 #include "utf8n.h"
23
/*
 * Describes one normalization trie stored in the utf8data[] blob.
 */
struct utf8data {
	/* Highest unicode age (as stored in utf8agetab[]) this trie honours. */
	unsigned int	maxage;
	/* Index into utf8data[] at which this trie starts. */
	unsigned int	offset;
};
28
29 #define __INCLUDED_FROM_UTF8NORM_C__
30 #include "utf8data.h"
31 #undef __INCLUDED_FROM_UTF8NORM_C__
32
/*
 * Number of elements in a true array (not a pointer).  The argument is
 * parenthesized so that expressions such as *ptr_to_array expand safely.
 */
#define ARRAY_SIZE(array)			\
	(sizeof(array) / sizeof((array)[0]))
35
36 int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
37 {
38 int i = ARRAY_SIZE(utf8agetab) - 1;
39 unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
40
41 while (i >= 0 && utf8agetab[i] != 0) {
42 if (sb_utf8version == utf8agetab[i])
43 return 1;
44 i--;
45 }
46 return 0;
47 }
48
49 int utf8version_latest(void)
50 {
51 return utf8vers;
52 }
53
54 /*
55 * UTF-8 valid ranges.
56 *
57 * The UTF-8 encoding spreads the bits of a 32bit word over several
58 * bytes. This table gives the ranges that can be held and how they'd
59 * be represented.
60 *
61 * 0x00000000 0x0000007F: 0xxxxxxx
62 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
63 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
64 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
65 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
66 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
67 *
68 * There is an additional requirement on UTF-8, in that only the
69 * shortest representation of a 32bit value is to be used. A decoder
70 * must not decode sequences that do not satisfy this requirement.
71 * Thus the allowed ranges have a lower bound.
72 *
73 * 0x00000000 0x0000007F: 0xxxxxxx
74 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
75 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
76 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
77 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
78 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
79 *
80 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
81 * 17 planes of 65536 values. This limits the sequences actually seen
82 * even more, to just the following.
83 *
84 * 0 - 0x7F: 0 - 0x7F
85 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
86 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
87 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
88 *
89 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
90 *
91 * Note that the longest sequence seen with valid usage is 4 bytes,
92 * the same as a single UTF-32 character. This makes the UTF-8
93 * representation of Unicode strictly smaller than UTF-32.
94 *
95 * The shortest sequence requirement was introduced by:
96 * Corrigendum #1: UTF-8 Shortest Form
97 * It can be found here:
98 * http://www.unicode.org/versions/corrigendum1.html
99 *
100 */
101
/*
 * Length in bytes of the UTF-8 sequence that starts at s.
 *
 * Only the lead byte is inspected, so s is assumed to point at the
 * first byte of a valid sequence.  Bytes below 0xC0 (ASCII, and also
 * stray continuation bytes) count as a single byte.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
113
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 *
 * The lead byte contributes its low 4 bits, each of the two following
 * bytes contributes its low 6 bits.  No validation is performed.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int top = str[0] & 0x0F;
	const unsigned int mid = str[1] & 0x3F;
	const unsigned int low = str[2] & 0x3F;

	return (top << 12) | (mid << 6) | low;
}
130
/*
 * Encode val as a 3-byte UTF-8 sequence into str[0..2].
 *
 * No terminator is written and val is not range-checked.
 * Always returns 3, the number of bytes stored.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	str[0] = (val >> 12) | 0xE0;
	str[1] = ((val >> 6) & 0x3F) | 0x80;
	str[2] = (val & 0x3F) | 0x80;

	return 3;
}
145
146 /*
147 * utf8trie_t
148 *
149 * A compact binary tree, used to decode UTF-8 characters.
150 *
151 * Internal nodes are one byte for the node itself, and up to three
152 * bytes for an offset into the tree. The first byte contains the
153 * following information:
154 * NEXTBYTE - flag - advance to next byte if set
155 * BITNUM - 3 bit field - the bit number to be tested
156 * OFFLEN - 2 bit field - number of bytes in the offset
157 * if offlen == 0 (non-branching node)
158 * RIGHTPATH - 1 bit field - set if the following node is for the
159 * right-hand path (tested bit is set)
160 * TRIENODE - 1 bit field - set if the following node is an internal
161 * node, otherwise it is a leaf node
162 * if offlen != 0 (branching node)
163 * LEFTNODE - 1 bit field - set if the left-hand node is internal
164 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
165 *
166 * Due to the way utf8 works, there cannot be branching nodes with
167 * NEXTBYTE set, and moreover those nodes always have a righthand
168 * descendant.
169 */
/* One byte of the (read-only) trie. */
typedef const unsigned char utf8trie_t;

/* Fields of the first byte of an internal node. */
#define BITNUM		0x07	/* which bit of the input byte to test */
#define NEXTBYTE	0x08	/* advance to the next input byte first */
#define OFFLEN		0x30	/* number of offset bytes that follow */
#define OFFLEN_SHIFT	4	/* shift that right-aligns OFFLEN */

/* Meaning of the top two bits when OFFLEN is zero. */
#define RIGHTPATH	0x40	/* the node that follows is the right leg */
#define TRIENODE	0x80	/* the node that follows is internal */

/* Meaning of the top two bits when OFFLEN is non-zero. */
#define RIGHTNODE	0x40	/* the right-hand node is internal */
#define LEFTNODE	0x80	/* the left-hand node is internal */
179
180 /*
181 * utf8leaf_t
182 *
183 * The leaves of the trie are embedded in the trie, and so the same
184 * underlying datatype: unsigned char.
185 *
186 * leaf[0]: The unicode version, stored as a generation number that is
187 * an index into utf8agetab[]. With this we can filter code
188 * points based on the unicode version in which they were
189 * defined. The CCC of a non-defined code point is 0.
190 * leaf[1]: Canonical Combining Class. During normalization, we need
191 * to do a stable sort into ascending order of all characters
192 * with a non-zero CCC that occur between two characters with
193 * a CCC of 0, or at the begin or end of a string.
194 * The unicode standard guarantees that all CCC values are
195 * between 0 and 254 inclusive, which leaves 255 available as
196 * a special value.
197 * Code points with CCC 0 are known as stoppers.
198 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
199 * start of a NUL-terminated string that is the decomposition
200 * of the character.
201 * The CCC of a decomposable character is the same as the CCC
202 * of the first character of its decomposition.
203 * Some characters decompose as the empty string: these are
204 * characters with the Default_Ignorable_Code_Point property.
205 * These do affect normalization, as they all have CCC 0.
206 *
207 * The decompositions in the trie have been fully expanded, with the
208 * exception of Hangul syllables, which are decomposed algorithmically.
209 *
210 * Casefolding, if applicable, is also done using decompositions.
211 *
212 * The trie is constructed in such a way that leaves exist for all
213 * UTF-8 sequences that match the criteria from the "UTF-8 valid
214 * ranges" comment above, and only for those sequences. Therefore a
215 * lookup in the trie can be used to validate the UTF-8 input.
216 */
/* One byte of a leaf; leaves live inside the trie itself. */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three parts of a leaf. */
#define LEAF_GEN(LEAF)	((LEAF)[0])	/* generation: index into utf8agetab[] */
#define LEAF_CCC(LEAF)	((LEAF)[1])	/* canonical combining class */
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))	/* decomposition */

/* Range of real CCC values, plus the two special ones. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)	/* CCC 0 ends a run of combining marks */
#define DECOMPOSE	(255)	/* leaf carries a decomposition string */

/* First byte of LEAF_STR() that requests algorithmic Hangul handling. */
#define HANGUL		((char)(255))
/* Bytes in the leaf synthesized for a decomposed Hangul syllable. */
#define UTF8HANGULLEAF	(12)
232
233 /*
234 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
235 *
236 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
237 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
238 *
239 * SBase = 0xAC00
240 * LBase = 0x1100
241 * VBase = 0x1161
242 * TBase = 0x11A7
243 * LCount = 19
244 * VCount = 21
245 * TCount = 28
246 * NCount = 588 (VCount * TCount)
247 * SCount = 11172 (LCount * NCount)
248 *
249 * Decomposition:
250 * SIndex = s - SBase
251 *
252 * LV (Canonical/Full)
253 * LIndex = SIndex / NCount
254 * VIndex = (SIndex % NCount) / TCount
255 * LPart = LBase + LIndex
256 * VPart = VBase + VIndex
257 *
258 * LVT (Canonical)
259 * LVIndex = (SIndex / TCount) * TCount
260 * TIndex = (SIndex % TCount)
261 * LVPart = SBase + LVIndex
262 * TPart = TBase + TIndex
263 *
264 * LVT (Full)
265 * LIndex = SIndex / NCount
266 * VIndex = (SIndex % NCount) / TCount
267 * TIndex = (SIndex % TCount)
268 * LPart = LBase + LIndex
269 * VPart = VBase + VIndex
270 * if (TIndex == 0) {
271 * d = <LPart, VPart>
272 * } else {
273 * TPart = TBase + TIndex
274 * d = <LPart, VPart, TPart>
275 * }
276 */
277
/* Constants for the algorithmic Hangul decomposition. */
#define SB	(0xAC00)	/* first precomposed syllable */
#define LB	(0x1100)	/* base of the leading part */
#define VB	(0x1161)	/* base of the vowel part */
#define TB	(0x11A7)	/* base of the trailing part */
#define LC	(19)		/* number of leading parts */
#define VC	(21)		/* number of vowel parts */
#define TC	(28)		/* number of trailing parts (incl. none) */
#define NC	(VC * TC)	/* syllables per leading part */
#define SC	(LC * NC)	/* total number of syllables */
288
289 /* Algorithmic decomposition of hangul syllable. */
290 static utf8leaf_t *
291 utf8hangul(const char *str, unsigned char *hangul)
292 {
293 unsigned int si;
294 unsigned int li;
295 unsigned int vi;
296 unsigned int ti;
297 unsigned char *h;
298
299 /* Calculate the SI, LI, VI, and TI values. */
300 si = utf8decode3(str) - SB;
301 li = si / NC;
302 vi = (si % NC) / TC;
303 ti = si % TC;
304
305 /* Fill in base of leaf. */
306 h = hangul;
307 LEAF_GEN(h) = 2;
308 LEAF_CCC(h) = DECOMPOSE;
309 h += 2;
310
311 /* Add LPart, a 3-byte UTF-8 sequence. */
312 h += utf8encode3((char *)h, li + LB);
313
314 /* Add VPart, a 3-byte UTF-8 sequence. */
315 h += utf8encode3((char *)h, vi + VB);
316
317 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
318 if (ti)
319 h += utf8encode3((char *)h, ti + TB);
320
321 /* Terminate string. */
322 h[0] = '\0';
323
324 return hangul;
325 }
326
327 /*
328 * Use trie to scan s, touching at most len bytes.
329 * Returns the leaf if one exists, NULL otherwise.
330 *
331 * A non-NULL return guarantees that the UTF-8 sequence starting at s
332 * is well-formed and corresponds to a known unicode code point. The
333 * shorthand for this will be "is valid UTF-8 unicode".
334 */
335 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
336 unsigned char *hangul, const char *s, size_t len)
337 {
338 utf8trie_t *trie;
339 int offlen;
340 int offset;
341 int mask;
342 int node;
343
344 if (!data)
345 return NULL;
346 if (len == 0)
347 return NULL;
348
349 trie = utf8data + data->offset;
350 node = 1;
351 while (node) {
352 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
353 if (*trie & NEXTBYTE) {
354 if (--len == 0)
355 return NULL;
356 s++;
357 }
358 mask = 1 << (*trie & BITNUM);
359 if (*s & mask) {
360 /* Right leg */
361 if (offlen) {
362 /* Right node at offset of trie */
363 node = (*trie & RIGHTNODE);
364 offset = trie[offlen];
365 while (--offlen) {
366 offset <<= 8;
367 offset |= trie[offlen];
368 }
369 trie += offset;
370 } else if (*trie & RIGHTPATH) {
371 /* Right node after this node */
372 node = (*trie & TRIENODE);
373 trie++;
374 } else {
375 /* No right node. */
376 return NULL;
377 }
378 } else {
379 /* Left leg */
380 if (offlen) {
381 /* Left node after this node. */
382 node = (*trie & LEFTNODE);
383 trie += offlen + 1;
384 } else if (*trie & RIGHTPATH) {
385 /* No left node. */
386 return NULL;
387 } else {
388 /* Left node after this node */
389 node = (*trie & TRIENODE);
390 trie++;
391 }
392 }
393 }
394 /*
395 * Hangul decomposition is done algorithmically. These are the
396 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
397 * always 3 bytes long, so s has been advanced twice, and the
398 * start of the sequence is at s-2.
399 */
400 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
401 trie = utf8hangul(s - 2, hangul);
402 return trie;
403 }
404
405 /*
406 * Use trie to scan s.
407 * Returns the leaf if one exists, NULL otherwise.
408 *
409 * Forwards to utf8nlookup().
410 */
411 static utf8leaf_t *utf8lookup(const struct utf8data *data,
412 unsigned char *hangul, const char *s)
413 {
414 return utf8nlookup(data, hangul, s, (size_t)-1);
415 }
416
417 /*
418 * Maximum age of any character in s.
419 * Return -1 if s is not valid UTF-8 unicode.
420 * Return 0 if only non-assigned code points are used.
421 */
422 int utf8agemax(const struct utf8data *data, const char *s)
423 {
424 utf8leaf_t *leaf;
425 int age = 0;
426 int leaf_age;
427 unsigned char hangul[UTF8HANGULLEAF];
428
429 if (!data)
430 return -1;
431
432 while (*s) {
433 leaf = utf8lookup(data, hangul, s);
434 if (!leaf)
435 return -1;
436
437 leaf_age = utf8agetab[LEAF_GEN(leaf)];
438 if (leaf_age <= data->maxage && leaf_age > age)
439 age = leaf_age;
440 s += utf8clen(s);
441 }
442 return age;
443 }
444
445 /*
446 * Minimum age of any character in s.
447 * Return -1 if s is not valid UTF-8 unicode.
448 * Return 0 if non-assigned code points are used.
449 */
450 int utf8agemin(const struct utf8data *data, const char *s)
451 {
452 utf8leaf_t *leaf;
453 int age;
454 int leaf_age;
455 unsigned char hangul[UTF8HANGULLEAF];
456
457 if (!data)
458 return -1;
459 age = data->maxage;
460 while (*s) {
461 leaf = utf8lookup(data, hangul, s);
462 if (!leaf)
463 return -1;
464 leaf_age = utf8agetab[LEAF_GEN(leaf)];
465 if (leaf_age <= data->maxage && leaf_age < age)
466 age = leaf_age;
467 s += utf8clen(s);
468 }
469 return age;
470 }
471
472 /*
473 * Maximum age of any character in s, touch at most len bytes.
474 * Return -1 if s is not valid UTF-8 unicode.
475 */
476 int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
477 {
478 utf8leaf_t *leaf;
479 int age = 0;
480 int leaf_age;
481 unsigned char hangul[UTF8HANGULLEAF];
482
483 if (!data)
484 return -1;
485
486 while (len && *s) {
487 leaf = utf8nlookup(data, hangul, s, len);
488 if (!leaf)
489 return -1;
490 leaf_age = utf8agetab[LEAF_GEN(leaf)];
491 if (leaf_age <= data->maxage && leaf_age > age)
492 age = leaf_age;
493 len -= utf8clen(s);
494 s += utf8clen(s);
495 }
496 return age;
497 }
498
499 /*
500 * Minimum age of any character in s, touch at most len bytes.
501 * Return -1 if s is not valid UTF-8 unicode.
502 */
503 int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
504 {
505 utf8leaf_t *leaf;
506 int leaf_age;
507 int age;
508 unsigned char hangul[UTF8HANGULLEAF];
509
510 if (!data)
511 return -1;
512 age = data->maxage;
513 while (len && *s) {
514 leaf = utf8nlookup(data, hangul, s, len);
515 if (!leaf)
516 return -1;
517 leaf_age = utf8agetab[LEAF_GEN(leaf)];
518 if (leaf_age <= data->maxage && leaf_age < age)
519 age = leaf_age;
520 len -= utf8clen(s);
521 s += utf8clen(s);
522 }
523 return age;
524 }
525
526 /*
527 * Length of the normalization of s.
528 * Return -1 if s is not valid UTF-8 unicode.
529 *
530 * A string of Default_Ignorable_Code_Point has length 0.
531 */
532 ssize_t utf8len(const struct utf8data *data, const char *s)
533 {
534 utf8leaf_t *leaf;
535 size_t ret = 0;
536 unsigned char hangul[UTF8HANGULLEAF];
537
538 if (!data)
539 return -1;
540 while (*s) {
541 leaf = utf8lookup(data, hangul, s);
542 if (!leaf)
543 return -1;
544 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
545 ret += utf8clen(s);
546 else if (LEAF_CCC(leaf) == DECOMPOSE)
547 ret += strlen(LEAF_STR(leaf));
548 else
549 ret += utf8clen(s);
550 s += utf8clen(s);
551 }
552 return ret;
553 }
554
555 /*
556 * Length of the normalization of s, touch at most len bytes.
557 * Return -1 if s is not valid UTF-8 unicode.
558 */
559 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
560 {
561 utf8leaf_t *leaf;
562 size_t ret = 0;
563 unsigned char hangul[UTF8HANGULLEAF];
564
565 if (!data)
566 return -1;
567 while (len && *s) {
568 leaf = utf8nlookup(data, hangul, s, len);
569 if (!leaf)
570 return -1;
571 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
572 ret += utf8clen(s);
573 else if (LEAF_CCC(leaf) == DECOMPOSE)
574 ret += strlen(LEAF_STR(leaf));
575 else
576 ret += utf8clen(s);
577 len -= utf8clen(s);
578 s += utf8clen(s);
579 }
580 return ret;
581 }
582
583 /*
584 * Set up an utf8cursor for use by utf8byte().
585 *
586 * u8c : pointer to cursor.
587 * data : const struct utf8data to use for normalization.
588 * s : string.
589 * len : length of s.
590 *
591 * Returns -1 on error, 0 on success.
592 */
593 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
594 const char *s, size_t len)
595 {
596 if (!data)
597 return -1;
598 if (!s)
599 return -1;
600 u8c->data = data;
601 u8c->s = s;
602 u8c->p = NULL;
603 u8c->ss = NULL;
604 u8c->sp = NULL;
605 u8c->len = len;
606 u8c->slen = 0;
607 u8c->ccc = STOPPER;
608 u8c->nccc = STOPPER;
609 /* Check we didn't clobber the maximum length. */
610 if (u8c->len != len)
611 return -1;
612 /* The first byte of s may not be an utf8 continuation. */
613 if (len > 0 && (*s & 0xC0) == 0x80)
614 return -1;
615 return 0;
616 }
617
/*
 * Initialize a utf8cursor for a NUL-terminated string.
 *
 * u8c  : cursor to initialize.
 * data : const struct utf8data to use for normalization.
 * s    : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int value as the
 * length.  Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	const unsigned int no_limit = (unsigned int)-1;

	return utf8ncursor(u8c, data, s, no_limit);
}
632
633 /*
634 * Get one byte from the normalized form of the string described by u8c.
635 *
636 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
637 *
638 * The cursor keeps track of the location in the string in u8c->s.
639 * When a character is decomposed, the current location is stored in
640 * u8c->p, and u8c->s is set to the start of the decomposition. Note
641 * that bytes from a decomposition do not count against u8c->len.
642 *
643 * Characters are emitted if they match the current CCC in u8c->ccc.
644 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
645 * and the function returns 0 in that case.
646 *
647 * Sorting by CCC is done by repeatedly scanning the string. The
648 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
649 * the start of the scan. The first pass finds the lowest CCC to be
650 * emitted and stores it in u8c->nccc, the second pass emits the
651 * characters with this CCC and finds the next lowest CCC. This limits
652 * the number of passes to 1 + the number of different CCCs in the
653 * sequence being scanned.
654 *
655 * Therefore:
656 * u8c->p != NULL -> a decomposition is being scanned.
657 * u8c->ss != NULL -> this is a repeating scan.
658 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
659 */
660 int utf8byte(struct utf8cursor *u8c)
661 {
662 utf8leaf_t *leaf;
663 int ccc;
664
665 for (;;) {
666 /* Check for the end of a decomposed character. */
667 if (u8c->p && *u8c->s == '\0') {
668 u8c->s = u8c->p;
669 u8c->p = NULL;
670 }
671
672 /* Check for end-of-string. */
673 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
674 /* There is no next byte. */
675 if (u8c->ccc == STOPPER)
676 return 0;
677 /* End-of-string during a scan counts as a stopper. */
678 ccc = STOPPER;
679 goto ccc_mismatch;
680 } else if ((*u8c->s & 0xC0) == 0x80) {
681 /* This is a continuation of the current character. */
682 if (!u8c->p)
683 u8c->len--;
684 return (unsigned char)*u8c->s++;
685 }
686
687 /* Look up the data for the current character. */
688 if (u8c->p) {
689 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
690 } else {
691 leaf = utf8nlookup(u8c->data, u8c->hangul,
692 u8c->s, u8c->len);
693 }
694
695 /* No leaf found implies that the input is a binary blob. */
696 if (!leaf)
697 return -1;
698
699 ccc = LEAF_CCC(leaf);
700 /* Characters that are too new have CCC 0. */
701 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
702 ccc = STOPPER;
703 } else if (ccc == DECOMPOSE) {
704 u8c->len -= utf8clen(u8c->s);
705 u8c->p = u8c->s + utf8clen(u8c->s);
706 u8c->s = LEAF_STR(leaf);
707 /* Empty decomposition implies CCC 0. */
708 if (*u8c->s == '\0') {
709 if (u8c->ccc == STOPPER)
710 continue;
711 ccc = STOPPER;
712 goto ccc_mismatch;
713 }
714
715 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
716 ccc = LEAF_CCC(leaf);
717 }
718
719 /*
720 * If this is not a stopper, then see if it updates
721 * the next canonical class to be emitted.
722 */
723 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
724 u8c->nccc = ccc;
725
726 /*
727 * Return the current byte if this is the current
728 * combining class.
729 */
730 if (ccc == u8c->ccc) {
731 if (!u8c->p)
732 u8c->len--;
733 return (unsigned char)*u8c->s++;
734 }
735
736 /* Current combining class mismatch. */
737 ccc_mismatch:
738 if (u8c->nccc == STOPPER) {
739 /*
740 * Scan forward for the first canonical class
741 * to be emitted. Save the position from
742 * which to restart.
743 */
744 u8c->ccc = MINCCC - 1;
745 u8c->nccc = ccc;
746 u8c->sp = u8c->p;
747 u8c->ss = u8c->s;
748 u8c->slen = u8c->len;
749 if (!u8c->p)
750 u8c->len -= utf8clen(u8c->s);
751 u8c->s += utf8clen(u8c->s);
752 } else if (ccc != STOPPER) {
753 /* Not a stopper, and not the ccc we're emitting. */
754 if (!u8c->p)
755 u8c->len -= utf8clen(u8c->s);
756 u8c->s += utf8clen(u8c->s);
757 } else if (u8c->nccc != MAXCCC + 1) {
758 /* At a stopper, restart for next ccc. */
759 u8c->ccc = u8c->nccc;
760 u8c->nccc = MAXCCC + 1;
761 u8c->s = u8c->ss;
762 u8c->p = u8c->sp;
763 u8c->len = u8c->slen;
764 } else {
765 /* All done, proceed from here. */
766 u8c->ccc = STOPPER;
767 u8c->nccc = STOPPER;
768 u8c->sp = NULL;
769 u8c->ss = NULL;
770 u8c->slen = 0;
771 }
772 }
773 }
774
775 const struct utf8data *utf8nfdi(unsigned int maxage)
776 {
777 int i = ARRAY_SIZE(utf8nfdidata) - 1;
778
779 while (maxage < utf8nfdidata[i].maxage)
780 i--;
781 if (maxage > utf8nfdidata[i].maxage)
782 return NULL;
783 return &utf8nfdidata[i];
784 }
785
786 const struct utf8data *utf8nfdicf(unsigned int maxage)
787 {
788 int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
789
790 while (maxage < utf8nfdicfdata[i].maxage)
791 i--;
792 if (maxage > utf8nfdicfdata[i].maxage)
793 return NULL;
794 return &utf8nfdicfdata[i];
795 }