lib/charset.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  *  charset conversion utils
   4  *
   5  *  Copyright (c) 2017 Rob Clark
   6  */
   7
   8 #include <common.h>
   9 #include <charset.h>
  10 #include <capitalization.h>
  11 #include <cp437.h>
  12 #include <efi_loader.h>
  13 #include <errno.h>
  14 #include <malloc.h>
  15
/**
 * codepage_437 - Unicode to codepage 437 translation table
 *
 * The 128 entries are initialized from the CP437 macro. utf_to_cp() searches
 * a table of this layout for a Unicode code point and maps the index j of the
 * matching entry to the code page character 0x80 + j.
 */
const u16 codepage_437[128] = CP437;
  20
/*
 * Table of upper/lower case pairs scanned by utf_to_lower() and
 * utf_to_upper() for code points above 0x7f.
 *
 * The initializer is selected at build time: the Unicode table if
 * CONFIG_EFI_UNICODE_CAPITALIZATION is defined, otherwise the code page 1250
 * table if CONFIG_FAT_DEFAULT_CODEPAGE equals 1250, otherwise the code page
 * 437 table.
 */
static struct capitalization_table capitalization_table[] =
#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
	UNICODE_CAPITALIZATION_TABLE;
#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
	CP1250_CAPITALIZATION_TABLE;
#else
	CP437_CAPITALIZATION_TABLE;
#endif
  29
  30 /**
  31  * get_code() - read Unicode code point from UTF-8 stream
  32  *
  33  * @read_u8:    - stream reader
  34  * @src:        - string buffer passed to stream reader, optional
  35  * Return:      - Unicode code point, or -1
  36  */
  37 static int get_code(u8 (*read_u8)(void *data), void *data)
  38 {
  39         s32 ch = 0;
  40
  41         ch = read_u8(data);
  42         if (!ch)
  43                 return 0;
  44         if (ch >= 0xc2 && ch <= 0xf4) {
  45                 int code = 0;
  46
  47                 if (ch >= 0xe0) {
  48                         if (ch >= 0xf0) {
  49                                 /* 0xf0 - 0xf4 */
  50                                 ch &= 0x07;
  51                                 code = ch << 18;
  52                                 ch = read_u8(data);
  53                                 if (ch < 0x80 || ch > 0xbf)
  54                                         goto error;
  55                                 ch &= 0x3f;
  56                         } else {
  57                                 /* 0xe0 - 0xef */
  58                                 ch &= 0x0f;
  59                         }
  60                         code += ch << 12;
  61                         if ((code >= 0xD800 && code <= 0xDFFF) ||
  62                             code >= 0x110000)
  63                                 goto error;
  64                         ch = read_u8(data);
  65                         if (ch < 0x80 || ch > 0xbf)
  66                                 goto error;
  67                 }
  68                 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
  69                 ch &= 0x3f;
  70                 code += ch << 6;
  71                 ch = read_u8(data);
  72                 if (ch < 0x80 || ch > 0xbf)
  73                         goto error;
  74                 ch &= 0x3f;
  75                 ch += code;
  76         } else if (ch >= 0x80) {
  77                 goto error;
  78         }
  79         return ch;
  80 error:
  81         return -1;
  82 }
  83
  84 /**
  85  * read_string() - read byte from character string
  86  *
  87  * @data:       - pointer to string
  88  * Return:      - byte read
  89  *
  90  * The string pointer is incremented if it does not point to '\0'.
  91  */
  92 static u8 read_string(void *data)
  93
  94 {
  95         const char **src = (const char **)data;
  96         u8 c;
  97
  98         if (!src || !*src || !**src)
  99                 return 0;
 100         c = **src;
 101         ++*src;
 102         return c;
 103 }
 104
 105 /**
 106  * read_console() - read byte from console
 107  *
 108  * @data        - not used, needed to match interface
 109  * Return:      - byte read or 0 on error
 110  */
 111 static u8 read_console(void *data)
 112 {
 113         int ch;
 114
 115         ch = getchar();
 116         if (ch < 0)
 117                 ch = 0;
 118         return ch;
 119 }
 120
 121 int console_read_unicode(s32 *code)
 122 {
 123         for (;;) {
 124                 s32 c;
 125
 126                 if (!tstc()) {
 127                         /* No input available */
 128                         return 1;
 129                 }
 130
 131                 /* Read Unicode code */
 132                 c = get_code(read_console, NULL);
 133                 if (c > 0) {
 134                         *code = c;
 135                         return 0;
 136                 }
 137         }
 138 }
 139
 140 s32 utf8_get(const char **src)
 141 {
 142         return get_code(read_string, src);
 143 }
 144
 145 int utf8_put(s32 code, char **dst)
 146 {
 147         if (!dst || !*dst)
 148                 return -1;
 149         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
 150                 return -1;
 151         if (code <= 0x007F) {
 152                 **dst = code;
 153         } else {
 154                 if (code <= 0x07FF) {
 155                         **dst = code >> 6 | 0xC0;
 156                 } else {
 157                         if (code < 0x10000) {
 158                                 **dst = code >> 12 | 0xE0;
 159                         } else {
 160                                 **dst = code >> 18 | 0xF0;
 161                                 ++*dst;
 162                                 **dst = (code >> 12 & 0x3F) | 0x80;
 163                         }
 164                         ++*dst;
 165                         **dst = (code >> 6 & 0x3F) | 0x80;
 166                 }
 167                 ++*dst;
 168                 **dst = (code & 0x3F) | 0x80;
 169         }
 170         ++*dst;
 171         return 0;
 172 }
 173
 174 size_t utf8_utf16_strnlen(const char *src, size_t count)
 175 {
 176         size_t len = 0;
 177
 178         for (; *src && count; --count)  {
 179                 s32 code = utf8_get(&src);
 180
 181                 if (!code)
 182                         break;
 183                 if (code < 0) {
 184                         /* Reserve space for a replacement character */
 185                         len += 1;
 186                 } else if (code < 0x10000) {
 187                         len += 1;
 188                 } else {
 189                         len += 2;
 190                 }
 191         }
 192         return len;
 193 }
 194
 195 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
 196 {
 197         if (!src || !dst || !*dst)
 198                 return -1;
 199
 200         for (; count && *src; --count) {
 201                 s32 code = utf8_get(&src);
 202
 203                 if (code < 0)
 204                         code = '?';
 205                 utf16_put(code, dst);
 206         }
 207         **dst = 0;
 208         return 0;
 209 }
 210
 211 s32 utf16_get(const u16 **src)
 212 {
 213         s32 code, code2;
 214
 215         if (!src || !*src)
 216                 return -1;
 217         if (!**src)
 218                 return 0;
 219         code = **src;
 220         ++*src;
 221         if (code >= 0xDC00 && code <= 0xDFFF)
 222                 return -1;
 223         if (code >= 0xD800 && code <= 0xDBFF) {
 224                 if (!**src)
 225                         return -1;
 226                 code &= 0x3ff;
 227                 code <<= 10;
 228                 code += 0x10000;
 229                 code2 = **src;
 230                 ++*src;
 231                 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
 232                         return -1;
 233                 code2 &= 0x3ff;
 234                 code += code2;
 235         }
 236         return code;
 237 }
 238
 239 int utf16_put(s32 code, u16 **dst)
 240 {
 241         if (!dst || !*dst)
 242                 return -1;
 243         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
 244                 return -1;
 245         if (code < 0x10000) {
 246                 **dst = code;
 247         } else {
 248                 code -= 0x10000;
 249                 **dst = code >> 10 | 0xD800;
 250                 ++*dst;
 251                 **dst = (code & 0x3ff) | 0xDC00;
 252         }
 253         ++*dst;
 254         return 0;
 255 }
 256
 257 size_t utf16_strnlen(const u16 *src, size_t count)
 258 {
 259         size_t len = 0;
 260
 261         for (; *src && count; --count)  {
 262                 s32 code = utf16_get(&src);
 263
 264                 if (!code)
 265                         break;
 266                 /*
 267                  * In case of an illegal sequence still reserve space for a
 268                  * replacement character.
 269                  */
 270                 ++len;
 271         }
 272         return len;
 273 }
 274
 275 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
 276 {
 277         size_t len = 0;
 278
 279         for (; *src && count; --count)  {
 280                 s32 code = utf16_get(&src);
 281
 282                 if (!code)
 283                         break;
 284                 if (code < 0)
 285                         /* Reserve space for a replacement character */
 286                         len += 1;
 287                 else if (code < 0x80)
 288                         len += 1;
 289                 else if (code < 0x800)
 290                         len += 2;
 291                 else if (code < 0x10000)
 292                         len += 3;
 293                 else
 294                         len += 4;
 295         }
 296         return len;
 297 }
 298
 299 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
 300 {
 301         if (!src || !dst || !*dst)
 302                 return -1;
 303
 304         for (; count && *src; --count) {
 305                 s32 code = utf16_get(&src);
 306
 307                 if (code < 0)
 308                         code = '?';
 309                 utf8_put(code, dst);
 310         }
 311         **dst = 0;
 312         return 0;
 313 }
 314
 315 s32 utf_to_lower(const s32 code)
 316 {
 317         struct capitalization_table *pos = capitalization_table;
 318         s32 ret = code;
 319
 320         if (code <= 0x7f) {
 321                 if (code >= 'A' && code <= 'Z')
 322                         ret += 0x20;
 323                 return ret;
 324         }
 325         for (; pos->upper; ++pos) {
 326                 if (pos->upper == code) {
 327                         ret = pos->lower;
 328                         break;
 329                 }
 330         }
 331         return ret;
 332 }
 333
 334 s32 utf_to_upper(const s32 code)
 335 {
 336         struct capitalization_table *pos = capitalization_table;
 337         s32 ret = code;
 338
 339         if (code <= 0x7f) {
 340                 if (code >= 'a' && code <= 'z')
 341                         ret -= 0x20;
 342                 return ret;
 343         }
 344         for (; pos->lower; ++pos) {
 345                 if (pos->lower == code) {
 346                         ret = pos->upper;
 347                         break;
 348                 }
 349         }
 350         return ret;
 351 }
 352
 353 /*
 354  * u16_strcasecmp() - compare two u16 strings case insensitively
 355  *
 356  * @s1:         first string to compare
 357  * @s2:         second string to compare
 358  * @n:          maximum number of u16 to compare
 359  * Return:      0  if the first n u16 are the same in s1 and s2
 360  *              < 0 if the first different u16 in s1 is less than the
 361  *              corresponding u16 in s2
 362  *              > 0 if the first different u16 in s1 is greater than the
 363  */
 364 int u16_strcasecmp(const u16 *s1, const u16 *s2)
 365 {
 366         int ret = 0;
 367         s32 c1, c2;
 368
 369         for (;;) {
 370                 c1 = utf_to_upper(utf16_get(&s1));
 371                 c2 = utf_to_upper(utf16_get(&s2));
 372                 ret = c1 - c2;
 373                 if (ret || !c1 || c1 == -1 || c2 == -1)
 374                         break;
 375         }
 376         return ret;
 377 }
 378
 379 /*
 380  * u16_strncmp() - compare two u16 string
 381  *
 382  * @s1:         first string to compare
 383  * @s2:         second string to compare
 384  * @n:          maximum number of u16 to compare
 385  * Return:      0  if the first n u16 are the same in s1 and s2
 386  *              < 0 if the first different u16 in s1 is less than the
 387  *              corresponding u16 in s2
 388  *              > 0 if the first different u16 in s1 is greater than the
 389  *              corresponding u16 in s2
 390  */
 391 int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
 392 {
 393         int ret = 0;
 394
 395         for (; n; --n, ++s1, ++s2) {
 396                 ret = *s1 - *s2;
 397                 if (ret || !*s1)
 398                         break;
 399         }
 400
 401         return ret;
 402 }
 403
 404 size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
 405 {
 406         size_t i;
 407         for (i = 0; count-- && in[i]; i++);
 408         return i;
 409 }
 410
 411 size_t u16_strsize(const void *in)
 412 {
 413         return (u16_strlen(in) + 1) * sizeof(u16);
 414 }
 415
 416 u16 *u16_strcpy(u16 *dest, const u16 *src)
 417 {
 418         u16 *tmp = dest;
 419
 420         for (;; dest++, src++) {
 421                 *dest = *src;
 422                 if (!*src)
 423                         break;
 424         }
 425
 426         return tmp;
 427 }
 428
 429 u16 *u16_strdup(const void *src)
 430 {
 431         u16 *new;
 432         size_t len;
 433
 434         if (!src)
 435                 return NULL;
 436         len = u16_strsize(src);
 437         new = malloc(len);
 438         if (!new)
 439                 return NULL;
 440         memcpy(new, src, len);
 441
 442         return new;
 443 }
 444
 445 size_t u16_strlcat(u16 *dest, const u16 *src, size_t count)
 446 {
 447         size_t destlen = u16_strnlen(dest, count);
 448         size_t srclen = u16_strlen(src);
 449         size_t ret = destlen + srclen;
 450
 451         if (destlen >= count)
 452                 return ret;
 453         if (ret >= count)
 454                 srclen -= (ret - count + 1);
 455         memcpy(&dest[destlen], src, 2 * srclen);
 456         dest[destlen + srclen] = 0x0000;
 457
 458         return ret;
 459 }
 460
 461 /* Convert UTF-16 to UTF-8.  */
 462 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
 463 {
 464         uint32_t code_high = 0;
 465
 466         while (size--) {
 467                 uint32_t code = *src++;
 468
 469                 if (code_high) {
 470                         if (code >= 0xDC00 && code <= 0xDFFF) {
 471                                 /* Surrogate pair.  */
 472                                 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
 473
 474                                 *dest++ = (code >> 18) | 0xF0;
 475                                 *dest++ = ((code >> 12) & 0x3F) | 0x80;
 476                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 477                                 *dest++ = (code & 0x3F) | 0x80;
 478                         } else {
 479                                 /* Error...  */
 480                                 *dest++ = '?';
 481                                 /* *src may be valid. Don't eat it.  */
 482                                 src--;
 483                         }
 484
 485                         code_high = 0;
 486                 } else {
 487                         if (code <= 0x007F) {
 488                                 *dest++ = code;
 489                         } else if (code <= 0x07FF) {
 490                                 *dest++ = (code >> 6) | 0xC0;
 491                                 *dest++ = (code & 0x3F) | 0x80;
 492                         } else if (code >= 0xD800 && code <= 0xDBFF) {
 493                                 code_high = code;
 494                                 continue;
 495                         } else if (code >= 0xDC00 && code <= 0xDFFF) {
 496                                 /* Error... */
 497                                 *dest++ = '?';
 498                         } else if (code < 0x10000) {
 499                                 *dest++ = (code >> 12) | 0xE0;
 500                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 501                                 *dest++ = (code & 0x3F) | 0x80;
 502                         } else {
 503                                 *dest++ = (code >> 18) | 0xF0;
 504                                 *dest++ = ((code >> 12) & 0x3F) | 0x80;
 505                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 506                                 *dest++ = (code & 0x3F) | 0x80;
 507                         }
 508                 }
 509         }
 510
 511         return dest;
 512 }
 513
 514 int utf_to_cp(s32 *c, const u16 *codepage)
 515 {
 516         if (*c >= 0x80) {
 517                 int j;
 518
 519                 /* Look up codepage translation */
 520                 for (j = 0; j < 0x80; ++j) {
 521                         if (*c == codepage[j]) {
 522                                 *c = j + 0x80;
 523                                 return 0;
 524                         }
 525                 }
 526                 *c = '?';
 527                 return -ENOENT;
 528         }
 529         return 0;
 530 }
 531
 532 int utf8_to_cp437_stream(u8 c, char *buffer)
 533 {
 534         char *end;
 535         const char *pos;
 536         s32 s;
 537         int ret;
 538
 539         for (;;) {
 540                 pos = buffer;
 541                 end = buffer + strlen(buffer);
 542                 *end++ = c;
 543                 *end = 0;
 544                 s = utf8_get(&pos);
 545                 if (s > 0) {
 546                         *buffer = 0;
 547                         ret = utf_to_cp(&s, codepage_437);
 548                         return s;
 549                         }
 550                 if (pos == end)
 551                         return 0;
 552                 *buffer = 0;
 553         }
 554 }
 555
 556 int utf8_to_utf32_stream(u8 c, char *buffer)
 557 {
 558         char *end;
 559         const char *pos;
 560         s32 s;
 561
 562         for (;;) {
 563                 pos = buffer;
 564                 end = buffer + strlen(buffer);
 565                 *end++ = c;
 566                 *end = 0;
 567                 s = utf8_get(&pos);
 568                 if (s > 0) {
 569                         *buffer = 0;
 570                         return s;
 571                 }
 572                 if (pos == end)
 573                         return 0;
 574                 *buffer = 0;
 575         }
 576 }