lib/ext2fs/nls_utf8-norm.c

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  */
  15
  16 /*
  17  * This code is adapted from the Linux Kernel.  We have a
  18  * userspace version here such that the hashes will match that
  19  * implementation.
  20  */
  21
  22 #include "utf8n.h"
  23
  24 struct utf8data {
  25         unsigned int maxage;
  26         unsigned int offset;
  27 };
  28
  29 #define __INCLUDED_FROM_UTF8NORM_C__
  30 #include "utf8data.h"
  31 #undef __INCLUDED_FROM_UTF8NORM_C__
  32
  33 #define ARRAY_SIZE(array)                       \
  34         (sizeof(array) / sizeof(array[0]))
  35
  36 int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
  37 {
  38         int i = ARRAY_SIZE(utf8agetab) - 1;
  39         unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
  40
  41         while (i >= 0 && utf8agetab[i] != 0) {
  42                 if (sb_utf8version == utf8agetab[i])
  43                         return 1;
  44                 i--;
  45         }
  46         return 0;
  47 }
  48
  49 int utf8version_latest(void)
  50 {
  51         return utf8vers;
  52 }
  53
  54 /*
  55  * UTF-8 valid ranges.
  56  *
  57  * The UTF-8 encoding spreads the bits of a 32bit word over several
  58  * bytes. This table gives the ranges that can be held and how they'd
  59  * be represented.
  60  *
  61  * 0x00000000 0x0000007F: 0xxxxxxx
  62  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
  63  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  64  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  65  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  66  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  67  *
  68  * There is an additional requirement on UTF-8, in that only the
  69  * shortest representation of a 32bit value is to be used.  A decoder
  70  * must not decode sequences that do not satisfy this requirement.
  71  * Thus the allowed ranges have a lower bound.
  72  *
  73  * 0x00000000 0x0000007F: 0xxxxxxx
  74  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
  75  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  76  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  77  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  78  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  79  *
  80  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
  81  * 17 planes of 65536 values.  This limits the sequences actually seen
  82  * even more, to just the following.
  83  *
  84  *          0 -     0x7F: 0                   - 0x7F
  85  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
  86  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
  87  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
  88  *
  89  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
  90  *
  91  * Note that the longest sequence seen with valid usage is 4 bytes,
  92  * the same as a single UTF-32 character.  This makes the UTF-8
  93  * representation of Unicode strictly smaller than UTF-32.
  94  *
  95  * The shortest sequence requirement was introduced by:
  96  *    Corrigendum #1: UTF-8 Shortest Form
  97  * It can be found here:
  98  *    http://www.unicode.org/versions/corrigendum1.html
  99  *
 100  */
 101
 102 /*
 103  * Return the number of bytes used by the current UTF-8 sequence.
 104  * Assumes the input points to the first byte of a valid UTF-8
 105  * sequence.
 106  */
 107 static inline int utf8clen(const char *s)
 108 {
 109         unsigned char c = *s;
 110
 111         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
 112 }
 113
 114 /*
 115  * Decode a 3-byte UTF-8 sequence.
 116  */
 117 static unsigned int
 118 utf8decode3(const char *str)
 119 {
 120         unsigned int            uc;
 121
 122         uc = *str++ & 0x0F;
 123         uc <<= 6;
 124         uc |= *str++ & 0x3F;
 125         uc <<= 6;
 126         uc |= *str++ & 0x3F;
 127
 128         return uc;
 129 }
 130
 131 /*
 132  * Encode a 3-byte UTF-8 sequence.
 133  */
 134 static int
 135 utf8encode3(char *str, unsigned int val)
 136 {
 137         str[2] = (val & 0x3F) | 0x80;
 138         val >>= 6;
 139         str[1] = (val & 0x3F) | 0x80;
 140         val >>= 6;
 141         str[0] = val | 0xE0;
 142
 143         return 3;
 144 }
 145
 146 /*
 147  * utf8trie_t
 148  *
 149  * A compact binary tree, used to decode UTF-8 characters.
 150  *
 151  * Internal nodes are one byte for the node itself, and up to three
 152  * bytes for an offset into the tree.  The first byte contains the
 153  * following information:
 154  *  NEXTBYTE  - flag        - advance to next byte if set
 155  *  BITNUM    - 3 bit field - the bit number to be tested
 156  *  OFFLEN    - 2 bit field - number of bytes in the offset
 157  * if offlen == 0 (non-branching node)
 158  *  RIGHTPATH - 1 bit field - set if the following node is for the
 159  *                            right-hand path (tested bit is set)
 160  *  TRIENODE  - 1 bit field - set if the following node is an internal
 161  *                            node, otherwise it is a leaf node
 162  * if offlen != 0 (branching node)
 163  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 164  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 165  *
 166  * Due to the way utf8 works, there cannot be branching nodes with
 167  * NEXTBYTE set, and moreover those nodes always have a righthand
 168  * descendant.
 169  */
 170 typedef const unsigned char utf8trie_t;
 171 #define BITNUM          0x07
 172 #define NEXTBYTE        0x08
 173 #define OFFLEN          0x30
 174 #define OFFLEN_SHIFT    4
 175 #define RIGHTPATH       0x40
 176 #define TRIENODE        0x80
 177 #define RIGHTNODE       0x40
 178 #define LEFTNODE        0x80
 179
 180 /*
 181  * utf8leaf_t
 182  *
 183  * The leaves of the trie are embedded in the trie, and so the same
 184  * underlying datatype: unsigned char.
 185  *
 186  * leaf[0]: The unicode version, stored as a generation number that is
 187  *          an index into utf8agetab[].  With this we can filter code
 188  *          points based on the unicode version in which they were
 189  *          defined.  The CCC of a non-defined code point is 0.
 190  * leaf[1]: Canonical Combining Class. During normalization, we need
 191  *          to do a stable sort into ascending order of all characters
 192  *          with a non-zero CCC that occur between two characters with
 193  *          a CCC of 0, or at the begin or end of a string.
 194  *          The unicode standard guarantees that all CCC values are
 195  *          between 0 and 254 inclusive, which leaves 255 available as
 196  *          a special value.
 197  *          Code points with CCC 0 are known as stoppers.
 198  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 199  *          start of a NUL-terminated string that is the decomposition
 200  *          of the character.
 201  *          The CCC of a decomposable character is the same as the CCC
 202  *          of the first character of its decomposition.
 203  *          Some characters decompose as the empty string: these are
 204  *          characters with the Default_Ignorable_Code_Point property.
 205  *          These do affect normalization, as they all have CCC 0.
 206  *
 207  * The decompositions in the trie have been fully expanded, with the
 208  * exception of Hangul syllables, which are decomposed algorithmically.
 209  *
 210  * Casefolding, if applicable, is also done using decompositions.
 211  *
 212  * The trie is constructed in such a way that leaves exist for all
 213  * UTF-8 sequences that match the criteria from the "UTF-8 valid
 214  * ranges" comment above, and only for those sequences.  Therefore a
 215  * lookup in the trie can be used to validate the UTF-8 input.
 216  */
 217 typedef const unsigned char utf8leaf_t;
 218
 219 #define LEAF_GEN(LEAF)  ((LEAF)[0])
 220 #define LEAF_CCC(LEAF)  ((LEAF)[1])
 221 #define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))
 222
 223 #define MINCCC          (0)
 224 #define MAXCCC          (254)
 225 #define STOPPER         (0)
 226 #define DECOMPOSE       (255)
 227
 228 /* Marker for hangul syllable decomposition. */
 229 #define HANGUL          ((char)(255))
 230 /* Size of the synthesized leaf used for Hangul syllable decomposition. */
 231 #define UTF8HANGULLEAF  (12)
 232
 233 /*
 234  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 235  *
 236  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 237  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 238  *
 239  * SBase = 0xAC00
 240  * LBase = 0x1100
 241  * VBase = 0x1161
 242  * TBase = 0x11A7
 243  * LCount = 19
 244  * VCount = 21
 245  * TCount = 28
 246  * NCount = 588 (VCount * TCount)
 247  * SCount = 11172 (LCount * NCount)
 248  *
 249  * Decomposition:
 250  *   SIndex = s - SBase
 251  *
 252  * LV (Canonical/Full)
 253  *   LIndex = SIndex / NCount
 254  *   VIndex = (SIndex % NCount) / TCount
 255  *   LPart = LBase + LIndex
 256  *   VPart = VBase + VIndex
 257  *
 258  * LVT (Canonical)
 259  *   LVIndex = (SIndex / TCount) * TCount
 260  *   TIndex = (SIndex % TCount)
 261  *   LVPart = SBase + LVIndex
 262  *   TPart = TBase + TIndex
 263  *
 264  * LVT (Full)
 265  *   LIndex = SIndex / NCount
 266  *   VIndex = (SIndex % NCount) / TCount
 267  *   TIndex = (SIndex % TCount)
 268  *   LPart = LBase + LIndex
 269  *   VPart = VBase + VIndex
 270  *   if (TIndex == 0) {
 271  *          d = <LPart, VPart>
 272  *   } else {
 273  *          TPart = TBase + TIndex
 274  *          d = <LPart, VPart, TPart>
 275  *   }
 276  */
 277
 278 /* Constants */
 279 #define SB      (0xAC00)
 280 #define LB      (0x1100)
 281 #define VB      (0x1161)
 282 #define TB      (0x11A7)
 283 #define LC      (19)
 284 #define VC      (21)
 285 #define TC      (28)
 286 #define NC      (VC * TC)
 287 #define SC      (LC * NC)
 288
 289 /* Algorithmic decomposition of hangul syllable. */
 290 static utf8leaf_t *
 291 utf8hangul(const char *str, unsigned char *hangul)
 292 {
 293         unsigned int    si;
 294         unsigned int    li;
 295         unsigned int    vi;
 296         unsigned int    ti;
 297         unsigned char   *h;
 298
 299         /* Calculate the SI, LI, VI, and TI values. */
 300         si = utf8decode3(str) - SB;
 301         li = si / NC;
 302         vi = (si % NC) / TC;
 303         ti = si % TC;
 304
 305         /* Fill in base of leaf. */
 306         h = hangul;
 307         LEAF_GEN(h) = 2;
 308         LEAF_CCC(h) = DECOMPOSE;
 309         h += 2;
 310
 311         /* Add LPart, a 3-byte UTF-8 sequence. */
 312         h += utf8encode3((char *)h, li + LB);
 313
 314         /* Add VPart, a 3-byte UTF-8 sequence. */
 315         h += utf8encode3((char *)h, vi + VB);
 316
 317         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
 318         if (ti)
 319                 h += utf8encode3((char *)h, ti + TB);
 320
 321         /* Terminate string. */
 322         h[0] = '\0';
 323
 324         return hangul;
 325 }
 326
 327 /*
 328  * Use trie to scan s, touching at most len bytes.
 329  * Returns the leaf if one exists, NULL otherwise.
 330  *
 331  * A non-NULL return guarantees that the UTF-8 sequence starting at s
 332  * is well-formed and corresponds to a known unicode code point.  The
 333  * shorthand for this will be "is valid UTF-8 unicode".
 334  */
 335 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
 336                                unsigned char *hangul, const char *s, size_t len)
 337 {
 338         utf8trie_t      *trie;
 339         int             offlen;
 340         int             offset;
 341         int             mask;
 342         int             node;
 343
 344         if (!data)
 345                 return NULL;
 346         if (len == 0)
 347                 return NULL;
 348
 349         trie = utf8data + data->offset;
 350         node = 1;
 351         while (node) {
 352                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
 353                 if (*trie & NEXTBYTE) {
 354                         if (--len == 0)
 355                                 return NULL;
 356                         s++;
 357                 }
 358                 mask = 1 << (*trie & BITNUM);
 359                 if (*s & mask) {
 360                         /* Right leg */
 361                         if (offlen) {
 362                                 /* Right node at offset of trie */
 363                                 node = (*trie & RIGHTNODE);
 364                                 offset = trie[offlen];
 365                                 while (--offlen) {
 366                                         offset <<= 8;
 367                                         offset |= trie[offlen];
 368                                 }
 369                                 trie += offset;
 370                         } else if (*trie & RIGHTPATH) {
 371                                 /* Right node after this node */
 372                                 node = (*trie & TRIENODE);
 373                                 trie++;
 374                         } else {
 375                                 /* No right node. */
 376                                 return NULL;
 377                         }
 378                 } else {
 379                         /* Left leg */
 380                         if (offlen) {
 381                                 /* Left node after this node. */
 382                                 node = (*trie & LEFTNODE);
 383                                 trie += offlen + 1;
 384                         } else if (*trie & RIGHTPATH) {
 385                                 /* No left node. */
 386                                 return NULL;
 387                         } else {
 388                                 /* Left node after this node */
 389                                 node = (*trie & TRIENODE);
 390                                 trie++;
 391                         }
 392                 }
 393         }
 394         /*
 395          * Hangul decomposition is done algorithmically. These are the
 396          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
 397          * always 3 bytes long, so s has been advanced twice, and the
 398          * start of the sequence is at s-2.
 399          */
 400         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
 401                 trie = utf8hangul(s - 2, hangul);
 402         return trie;
 403 }
 404
 405 /*
 406  * Use trie to scan s.
 407  * Returns the leaf if one exists, NULL otherwise.
 408  *
 409  * Forwards to utf8nlookup().
 410  */
 411 static utf8leaf_t *utf8lookup(const struct utf8data *data,
 412                               unsigned char *hangul, const char *s)
 413 {
 414         return utf8nlookup(data, hangul, s, (size_t)-1);
 415 }
 416
 417 /*
 418  * Maximum age of any character in s.
 419  * Return -1 if s is not valid UTF-8 unicode.
 420  * Return 0 if only non-assigned code points are used.
 421  */
 422 int utf8agemax(const struct utf8data *data, const char *s)
 423 {
 424         utf8leaf_t      *leaf;
 425         int             age = 0;
 426         int             leaf_age;
 427         unsigned char   hangul[UTF8HANGULLEAF];
 428
 429         if (!data)
 430                 return -1;
 431
 432         while (*s) {
 433                 leaf = utf8lookup(data, hangul, s);
 434                 if (!leaf)
 435                         return -1;
 436
 437                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 438                 if (leaf_age <= data->maxage && leaf_age > age)
 439                         age = leaf_age;
 440                 s += utf8clen(s);
 441         }
 442         return age;
 443 }
 444
 445 /*
 446  * Minimum age of any character in s.
 447  * Return -1 if s is not valid UTF-8 unicode.
 448  * Return 0 if non-assigned code points are used.
 449  */
 450 int utf8agemin(const struct utf8data *data, const char *s)
 451 {
 452         utf8leaf_t      *leaf;
 453         int             age;
 454         int             leaf_age;
 455         unsigned char   hangul[UTF8HANGULLEAF];
 456
 457         if (!data)
 458                 return -1;
 459         age = data->maxage;
 460         while (*s) {
 461                 leaf = utf8lookup(data, hangul, s);
 462                 if (!leaf)
 463                         return -1;
 464                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 465                 if (leaf_age <= data->maxage && leaf_age < age)
 466                         age = leaf_age;
 467                 s += utf8clen(s);
 468         }
 469         return age;
 470 }
 471
 472 /*
 473  * Maximum age of any character in s, touch at most len bytes.
 474  * Return -1 if s is not valid UTF-8 unicode.
 475  */
 476 int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
 477 {
 478         utf8leaf_t      *leaf;
 479         int             age = 0;
 480         int             leaf_age;
 481         unsigned char   hangul[UTF8HANGULLEAF];
 482
 483         if (!data)
 484                 return -1;
 485
 486         while (len && *s) {
 487                 leaf = utf8nlookup(data, hangul, s, len);
 488                 if (!leaf)
 489                         return -1;
 490                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 491                 if (leaf_age <= data->maxage && leaf_age > age)
 492                         age = leaf_age;
 493                 len -= utf8clen(s);
 494                 s += utf8clen(s);
 495         }
 496         return age;
 497 }
 498
 499 /*
 500  * Maximum age of any character in s, touch at most len bytes.
 501  * Return -1 if s is not valid UTF-8 unicode.
 502  */
 503 int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
 504 {
 505         utf8leaf_t      *leaf;
 506         int             leaf_age;
 507         int             age;
 508         unsigned char   hangul[UTF8HANGULLEAF];
 509
 510         if (!data)
 511                 return -1;
 512         age = data->maxage;
 513         while (len && *s) {
 514                 leaf = utf8nlookup(data, hangul, s, len);
 515                 if (!leaf)
 516                         return -1;
 517                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 518                 if (leaf_age <= data->maxage && leaf_age < age)
 519                         age = leaf_age;
 520                 len -= utf8clen(s);
 521                 s += utf8clen(s);
 522         }
 523         return age;
 524 }
 525
 526 /*
 527  * Length of the normalization of s.
 528  * Return -1 if s is not valid UTF-8 unicode.
 529  *
 530  * A string of Default_Ignorable_Code_Point has length 0.
 531  */
 532 ssize_t utf8len(const struct utf8data *data, const char *s)
 533 {
 534         utf8leaf_t      *leaf;
 535         size_t          ret = 0;
 536         unsigned char   hangul[UTF8HANGULLEAF];
 537
 538         if (!data)
 539                 return -1;
 540         while (*s) {
 541                 leaf = utf8lookup(data, hangul, s);
 542                 if (!leaf)
 543                         return -1;
 544                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 545                         ret += utf8clen(s);
 546                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 547                         ret += strlen(LEAF_STR(leaf));
 548                 else
 549                         ret += utf8clen(s);
 550                 s += utf8clen(s);
 551         }
 552         return ret;
 553 }
 554
 555 /*
 556  * Length of the normalization of s, touch at most len bytes.
 557  * Return -1 if s is not valid UTF-8 unicode.
 558  */
 559 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
 560 {
 561         utf8leaf_t      *leaf;
 562         size_t          ret = 0;
 563         unsigned char   hangul[UTF8HANGULLEAF];
 564
 565         if (!data)
 566                 return -1;
 567         while (len && *s) {
 568                 leaf = utf8nlookup(data, hangul, s, len);
 569                 if (!leaf)
 570                         return -1;
 571                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 572                         ret += utf8clen(s);
 573                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 574                         ret += strlen(LEAF_STR(leaf));
 575                 else
 576                         ret += utf8clen(s);
 577                 len -= utf8clen(s);
 578                 s += utf8clen(s);
 579         }
 580         return ret;
 581 }
 582
 583 /*
 584  * Set up an utf8cursor for use by utf8byte().
 585  *
 586  *   u8c    : pointer to cursor.
 587  *   data   : const struct utf8data to use for normalization.
 588  *   s      : string.
 589  *   len    : length of s.
 590  *
 591  * Returns -1 on error, 0 on success.
 592  */
 593 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
 594                 const char *s, size_t len)
 595 {
 596         if (!data)
 597                 return -1;
 598         if (!s)
 599                 return -1;
 600         u8c->data = data;
 601         u8c->s = s;
 602         u8c->p = NULL;
 603         u8c->ss = NULL;
 604         u8c->sp = NULL;
 605         u8c->len = len;
 606         u8c->slen = 0;
 607         u8c->ccc = STOPPER;
 608         u8c->nccc = STOPPER;
 609         /* Check we didn't clobber the maximum length. */
 610         if (u8c->len != len)
 611                 return -1;
 612         /* The first byte of s may not be an utf8 continuation. */
 613         if (len > 0 && (*s & 0xC0) == 0x80)
 614                 return -1;
 615         return 0;
 616 }
 617
 618 /*
 619  * Set up an utf8cursor for use by utf8byte().
 620  *
 621  *   u8c    : pointer to cursor.
 622  *   data   : const struct utf8data to use for normalization.
 623  *   s      : NUL-terminated string.
 624  *
 625  * Returns -1 on error, 0 on success.
 626  */
 627 int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
 628                const char *s)
 629 {
 630         return utf8ncursor(u8c, data, s, (unsigned int)-1);
 631 }
 632
 633 /*
 634  * Get one byte from the normalized form of the string described by u8c.
 635  *
 636  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
 637  *
 638  * The cursor keeps track of the location in the string in u8c->s.
 639  * When a character is decomposed, the current location is stored in
 640  * u8c->p, and u8c->s is set to the start of the decomposition. Note
 641  * that bytes from a decomposition do not count against u8c->len.
 642  *
 643  * Characters are emitted if they match the current CCC in u8c->ccc.
 644  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 645  * and the function returns 0 in that case.
 646  *
 647  * Sorting by CCC is done by repeatedly scanning the string.  The
 648  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 649  * the start of the scan.  The first pass finds the lowest CCC to be
 650  * emitted and stores it in u8c->nccc, the second pass emits the
 651  * characters with this CCC and finds the next lowest CCC. This limits
 652  * the number of passes to 1 + the number of different CCCs in the
 653  * sequence being scanned.
 654  *
 655  * Therefore:
 656  *  u8c->p  != NULL -> a decomposition is being scanned.
 657  *  u8c->ss != NULL -> this is a repeating scan.
 658  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
 659  */
 660 int utf8byte(struct utf8cursor *u8c)
 661 {
 662         utf8leaf_t *leaf;
 663         int ccc;
 664
 665         for (;;) {
 666                 /* Check for the end of a decomposed character. */
 667                 if (u8c->p && *u8c->s == '\0') {
 668                         u8c->s = u8c->p;
 669                         u8c->p = NULL;
 670                 }
 671
 672                 /* Check for end-of-string. */
 673                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
 674                         /* There is no next byte. */
 675                         if (u8c->ccc == STOPPER)
 676                                 return 0;
 677                         /* End-of-string during a scan counts as a stopper. */
 678                         ccc = STOPPER;
 679                         goto ccc_mismatch;
 680                 } else if ((*u8c->s & 0xC0) == 0x80) {
 681                         /* This is a continuation of the current character. */
 682                         if (!u8c->p)
 683                                 u8c->len--;
 684                         return (unsigned char)*u8c->s++;
 685                 }
 686
 687                 /* Look up the data for the current character. */
 688                 if (u8c->p) {
 689                         leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
 690                 } else {
 691                         leaf = utf8nlookup(u8c->data, u8c->hangul,
 692                                            u8c->s, u8c->len);
 693                 }
 694
 695                 /* No leaf found implies that the input is a binary blob. */
 696                 if (!leaf)
 697                         return -1;
 698
 699                 ccc = LEAF_CCC(leaf);
 700                 /* Characters that are too new have CCC 0. */
 701                 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
 702                         ccc = STOPPER;
 703                 } else if (ccc == DECOMPOSE) {
 704                         u8c->len -= utf8clen(u8c->s);
 705                         u8c->p = u8c->s + utf8clen(u8c->s);
 706                         u8c->s = LEAF_STR(leaf);
 707                         /* Empty decomposition implies CCC 0. */
 708                         if (*u8c->s == '\0') {
 709                                 if (u8c->ccc == STOPPER)
 710                                         continue;
 711                                 ccc = STOPPER;
 712                                 goto ccc_mismatch;
 713                         }
 714
 715                         leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
 716                         ccc = LEAF_CCC(leaf);
 717                 }
 718
 719                 /*
 720                  * If this is not a stopper, then see if it updates
 721                  * the next canonical class to be emitted.
 722                  */
 723                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
 724                         u8c->nccc = ccc;
 725
 726                 /*
 727                  * Return the current byte if this is the current
 728                  * combining class.
 729                  */
 730                 if (ccc == u8c->ccc) {
 731                         if (!u8c->p)
 732                                 u8c->len--;
 733                         return (unsigned char)*u8c->s++;
 734                 }
 735
 736                 /* Current combining class mismatch. */
 737 ccc_mismatch:
 738                 if (u8c->nccc == STOPPER) {
 739                         /*
 740                          * Scan forward for the first canonical class
 741                          * to be emitted.  Save the position from
 742                          * which to restart.
 743                          */
 744                         u8c->ccc = MINCCC - 1;
 745                         u8c->nccc = ccc;
 746                         u8c->sp = u8c->p;
 747                         u8c->ss = u8c->s;
 748                         u8c->slen = u8c->len;
 749                         if (!u8c->p)
 750                                 u8c->len -= utf8clen(u8c->s);
 751                         u8c->s += utf8clen(u8c->s);
 752                 } else if (ccc != STOPPER) {
 753                         /* Not a stopper, and not the ccc we're emitting. */
 754                         if (!u8c->p)
 755                                 u8c->len -= utf8clen(u8c->s);
 756                         u8c->s += utf8clen(u8c->s);
 757                 } else if (u8c->nccc != MAXCCC + 1) {
 758                         /* At a stopper, restart for next ccc. */
 759                         u8c->ccc = u8c->nccc;
 760                         u8c->nccc = MAXCCC + 1;
 761                         u8c->s = u8c->ss;
 762                         u8c->p = u8c->sp;
 763                         u8c->len = u8c->slen;
 764                 } else {
 765                         /* All done, proceed from here. */
 766                         u8c->ccc = STOPPER;
 767                         u8c->nccc = STOPPER;
 768                         u8c->sp = NULL;
 769                         u8c->ss = NULL;
 770                         u8c->slen = 0;
 771                 }
 772         }
 773 }
 774
 775 const struct utf8data *utf8nfdi(unsigned int maxage)
 776 {
 777         int i = ARRAY_SIZE(utf8nfdidata) - 1;
 778
 779         while (maxage < utf8nfdidata[i].maxage)
 780                 i--;
 781         if (maxage > utf8nfdidata[i].maxage)
 782                 return NULL;
 783         return &utf8nfdidata[i];
 784 }
 785
 786 const struct utf8data *utf8nfdicf(unsigned int maxage)
 787 {
 788         int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
 789
 790         while (maxage < utf8nfdicfdata[i].maxage)
 791                 i--;
 792         if (maxage > utf8nfdicfdata[i].maxage)
 793                 return NULL;
 794         return &utf8nfdicfdata[i];
 795 }