]> git.ipfire.org Git - thirdparty/cups.git/blame - cups/normalize.c
Load cups into easysw/current.
[thirdparty/cups.git] / cups / normalize.c
CommitLineData
ef416fc2 1/*
fa73b229 2 * "$Id: normalize.c 4967 2006-01-24 03:42:15Z mike $"
ef416fc2 3 *
4 * Unicode normalization for the Common UNIX Printing System (CUPS).
5 *
6 * Copyright 1997-2006 by Easy Software Products.
7 *
8 * These coded instructions, statements, and computer programs are
9 * the property of Easy Software Products and are protected by Federal
10 * copyright law. Distribution and use rights are outlined in the
11 * file "LICENSE.txt" which should have been included with this file.
12 * If this file is missing or damaged please contact Easy Software
13 * Products at:
14 *
15 * Attn: CUPS Licensing Information
16 * Easy Software Products
17 * 44141 Airport View Drive, Suite 204
18 * Hollywood, Maryland 20636 USA
19 *
20 * Voice: (301) 373-9600
21 * EMail: cups-info@cups.org
22 * WWW: http://www.cups.org
23 *
24 * Contents:
25 *
26 * cupsNormalizeMapsGet() - Get all norm maps to cache.
27 * cupsNormalizeMapsFree() - Free all norm maps in cache.
28 * cupsNormalizeMapsFlush() - Flush all norm maps in cache.
fa73b229 29 * _cupsNormalizeMapsFlush() - Flush all normalization maps in cache.
ef416fc2 30 * cupsUTF8Normalize() - Normalize UTF-8 string.
31 * cupsUTF32Normalize() - Normalize UTF-32 string.
32 * cupsUTF8CaseFold() - Case fold UTF-8 string.
33 * cupsUTF32CaseFold() - Case fold UTF-32 string.
34 * cupsUTF8CompareCaseless() - Compare case folded UTF-8 strings.
35 * cupsUTF32CompareCaseless() - Compare case folded UTF-32 strings.
36 * cupsUTF8CompareIdentifier() - Compare folded NFKC UTF-8 strings.
37 * cupsUTF32CompareIdentifier() - Compare folded NFKC UTF-32 strings.
38 * cupsUTF32CharacterProperty() - Get UTF-32 character property.
39 * get_general_category() - Get UTF-32 Char General Category.
40 * get_bidi_category() - Get UTF-32 Char Bidi Category.
41 * get_combining_class() - Get UTF-32 Char Combining Class.
42 * get_break_class() - Get UTF-32 Char Line Break Class.
43 * get_map_count() - Count lines in a map file.
44 * get_normmap() - Get Unicode norm map to cache.
45 * get_foldmap() - Get Unicode casefold map to cache.
46 * get_propmap() - Get Unicode property map to cache.
47 * get_combmap() - Get Unicode combining map to cache.
48 * get_breakmap() - Get Unicode break map to cache.
49 * compare_compose() - Compare key for compose match.
50 * compare_decompose() - Compare key for decompose match.
51 * compare_foldchar() - Compare key for case fold match.
52 * compare_combchar() - Compare key for combining match.
53 * compare_breakchar() - Compare key for line break match.
54 * compare_propchar() - Compare key for property char match.
55 */
56
57/*
58 * Include necessary headers...
59 */
60
61#include "globals.h"
62#include "debug.h"
63#include <stdlib.h>
64#include <errno.h>
65#include <time.h>
66
67
68typedef struct /**** General Category Index Struct****/
69{
70 cups_gencat_t gencat; /* General Category Value */
71 const char *str; /* General Category String */
72} gencat_t;
73
74static const gencat_t gencat_index[] = /* General Category Index */
75{
76 { CUPS_GENCAT_LU, "Lu" }, /* Letter, Uppercase */
77 { CUPS_GENCAT_LL, "Ll" }, /* Letter, Lowercase */
78 { CUPS_GENCAT_LT, "Lt" }, /* Letter, Titlecase */
79 { CUPS_GENCAT_LM, "Lm" }, /* Letter, Modifier */
80 { CUPS_GENCAT_LO, "Lo" }, /* Letter, Other */
81 { CUPS_GENCAT_MN, "Mn" }, /* Mark, Non-Spacing */
82 { CUPS_GENCAT_MC, "Mc" }, /* Mark, Spacing Combining */
83 { CUPS_GENCAT_ME, "Me" }, /* Mark, Enclosing */
84 { CUPS_GENCAT_ND, "Nd" }, /* Number, Decimal Digit */
85 { CUPS_GENCAT_NL, "Nl" }, /* Number, Letter */
86 { CUPS_GENCAT_NO, "No" }, /* Number, Other */
87 { CUPS_GENCAT_PC, "Pc" }, /* Punctuation, Connector */
88 { CUPS_GENCAT_PD, "Pd" }, /* Punctuation, Dash */
89 { CUPS_GENCAT_PS, "Ps" }, /* Punctuation, Open (start) */
90 { CUPS_GENCAT_PE, "Pe" }, /* Punctuation, Close (end) */
91 { CUPS_GENCAT_PI, "Pi" }, /* Punctuation, Initial Quote */
92 { CUPS_GENCAT_PF, "Pf" }, /* Punctuation, Final Quote */
93 { CUPS_GENCAT_PO, "Po" }, /* Punctuation, Other */
94 { CUPS_GENCAT_SM, "Sm" }, /* Symbol, Math */
95 { CUPS_GENCAT_SC, "Sc" }, /* Symbol, Currency */
96 { CUPS_GENCAT_SK, "Sk" }, /* Symbol, Modifier */
97 { CUPS_GENCAT_SO, "So" }, /* Symbol, Other */
98 { CUPS_GENCAT_ZS, "Zs" }, /* Separator, Space */
99 { CUPS_GENCAT_ZL, "Zl" }, /* Separator, Line */
100 { CUPS_GENCAT_ZP, "Zp" }, /* Separator, Paragraph */
101 { CUPS_GENCAT_CC, "Cc" }, /* Other, Control */
102 { CUPS_GENCAT_CF, "Cf" }, /* Other, Format */
103 { CUPS_GENCAT_CS, "Cs" }, /* Other, Surrogate */
104 { CUPS_GENCAT_CO, "Co" }, /* Other, Private Use */
105 { CUPS_GENCAT_CN, "Cn" }, /* Other, Not Assigned */
106 { 0, NULL }
107};
108
/*
 * Bidi category index: the string form of each bidirectional category.
 * The list is terminated by a NULL pointer.
 */

static const char * const bidicat_index[] =
{
  /* Strong and explicit left-to-right */
  "L",                  /* Left-to-Right (Alpha, Syllabic, Ideographic) */
  "LRE",                /* Left-to-Right Embedding (explicit) */
  "LRO",                /* Left-to-Right Override (explicit) */

  /* Strong and explicit right-to-left */
  "R",                  /* Right-to-Left (Hebrew alphabet and most punct) */
  "AL",                 /* Right-to-Left Arabic (Arabic, Thaana, Syriac) */
  "RLE",                /* Right-to-Left Embedding (explicit) */
  "RLO",                /* Right-to-Left Override (explicit) */
  "PDF",                /* Pop Directional Format */

  /* Numbers and number punctuation */
  "EN",                 /* Euro Number (Euro and East Arabic-Indic digits) */
  "ES",                 /* Euro Number Separator (Slash) */
  "ET",                 /* Euro Number Terminator (Plus, Minus, Degree, etc) */
  "AN",                 /* Arabic Number (Arabic-Indic digits, separators) */
  "CS",                 /* Common Number Separator (Colon, Comma, Dot, etc) */

  /* Marks, separators and neutrals */
  "NSM",                /* Non-Spacing Mark (category Mn / Me in UCD) */
  "BN",                 /* Boundary Neutral (Formatting / Control chars) */
  "B",                  /* Paragraph Separator */
  "S",                  /* Segment Separator (Tab) */
  "WS",                 /* Whitespace Space (Space, Line Separator, etc) */
  "ON",                 /* Other Neutrals */

  NULL                  /* End of list */
};
133
134typedef struct /**** Line Break Class Index Struct****/
135{
136 cups_break_class_t breakclass; /* Line Break Class Value */
137 const char *str; /* Line Break Class String */
138} _cups_break_t;
139
140static const _cups_break_t break_index[] = /* Line Break Class Index */
141{
142 { CUPS_BREAK_AI, "AI" }, /* Ambiguous (Alphabetic or Ideograph) */
143 { CUPS_BREAK_AL, "AL" }, /* Ordinary Alpha/Symbol Chars (XP) */
144 { CUPS_BREAK_BA, "BA" }, /* Break Opportunity After Chars (A) */
145 { CUPS_BREAK_BB, "BB" }, /* Break Opportunities Before Chars (B) */
146 { CUPS_BREAK_B2, "B2" }, /* Break Opportunity Either (B/A/XP) */
147 { CUPS_BREAK_BK, "BK" }, /* Mandatory Break (A) (norm) */
148 { CUPS_BREAK_CB, "CB" }, /* Contingent Break (B/A) (norm) */
149 { CUPS_BREAK_CL, "CL" }, /* Closing Punctuation (XB) */
150 { CUPS_BREAK_CM, "CM" }, /* Attached/Combining (XB) (norm) */
151 { CUPS_BREAK_CR, "CR" }, /* Carriage Return (A) (norm) */
152 { CUPS_BREAK_EX, "EX" }, /* Exclamation / Interrogation (XB) */
153 { CUPS_BREAK_GL, "GL" }, /* Non-breaking ("Glue") (XB/XA) (norm) */
154 { CUPS_BREAK_HY, "HY" }, /* Hyphen (XA) */
155 { CUPS_BREAK_ID, "ID" }, /* Ideographic (B/A) */
156 { CUPS_BREAK_IN, "IN" }, /* Inseparable chars (XP) */
157 { CUPS_BREAK_IS, "IS" }, /* Numeric Separator (Infix) (XB) */
158 { CUPS_BREAK_LF, "LF" }, /* Line Feed (A) (norm) */
159 { CUPS_BREAK_NS, "NS" }, /* Non-starters (XB) */
160 { CUPS_BREAK_NU, "NU" }, /* Numeric (XP) */
161 { CUPS_BREAK_OP, "OP" }, /* Opening Punctuation (XA) */
162 { CUPS_BREAK_PO, "PO" }, /* Postfix (Numeric) (XB) */
163 { CUPS_BREAK_PR, "PR" }, /* Prefix (Numeric) (XA) */
164 { CUPS_BREAK_QU, "QU" }, /* Ambiguous Quotation (XB/XA) */
165 { CUPS_BREAK_SA, "SA" }, /* Context Dependent (SE Asian) (P) */
166 { CUPS_BREAK_SG, "SG" }, /* Surrogates (XP) (norm) */
167 { CUPS_BREAK_SP, "SP" }, /* Space (A) (norm) */
168 { CUPS_BREAK_SY, "SY" }, /* Symbols Allowing Break After (A) */
169 { CUPS_BREAK_XX, "XX" }, /* Unknown (XP) */
170 { CUPS_BREAK_ZW, "ZW" }, /* Zero Width Space (A) (norm) */
171 { 0, NULL }
172};
173
174/*
175 * Prototypes...
176 */
177
178static int compare_breakchar(const void *k1, const void *k2);
179static int compare_combchar(const void *k1, const void *k2);
180static int compare_compose(const void *k1, const void *k2);
181static int compare_decompose(const void *k1, const void *k2);
182static int compare_foldchar(const void *k1, const void *k2);
183static int compare_propchar(const void *k1, const void *k2);
184static int get_bidi_category(const cups_utf32_t ch);
185static int get_break_class(const cups_utf32_t ch);
186static int get_breakmap(void);
187static int get_combining_class(const cups_utf32_t ch);
188static int get_combmap(void);
189static int get_foldmap(const cups_folding_t fold);
190static int get_general_category(const cups_utf32_t ch);
191static int get_map_count(const char *filename);
192static int get_normmap(const cups_normalize_t normalize);
193static int get_propmap(void);
194
195
196/*
197 * 'cupsNormalizeMapsGet()' - Get all normalization maps to cache.
198 */
199
200int /* O - Zero or -1 on error */
201cupsNormalizeMapsGet(void)
202{
203 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
204 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
205 _cups_globals_t *cg = _cupsGlobals();
206 /* Pointer to library globals */
207
208
209 /*
210 * See if we already have normalization maps loaded...
211 */
212
213 if (cg->normmap_cache)
214 {
215 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
216 nmap->used ++;
217
218 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
219 fmap->used ++;
220
221 if (cg->combmap_cache)
222 cg->combmap_cache->used ++;
223
224 if (cg->propmap_cache)
225 cg->propmap_cache->used ++;
226
227 if (cg->breakmap_cache)
228 cg->breakmap_cache->used ++;
229
230 return (0);
231 }
232
233 /*
234 * Get normalization maps...
235 */
236
237 if (get_normmap(CUPS_NORM_NFD) < 0)
238 return (-1);
239
240 if (get_normmap(CUPS_NORM_NFKD) < 0)
241 return (-1);
242
243 if (get_normmap(CUPS_NORM_NFC) < 0)
244 return (-1);
245
246 /*
247 * Get case folding, combining class, character property maps...
248 */
249
250 if (get_foldmap(CUPS_FOLD_SIMPLE) < 0)
251 return (-1);
252
253 if (get_foldmap(CUPS_FOLD_FULL) < 0)
254 return (-1);
255
256 if (get_propmap() < 0)
257 return (-1);
258
259 if (get_combmap() < 0)
260 return (-1);
261
262 if (get_breakmap() < 0)
263 return (-1);
264
265 return (0);
266}
267
268
269/*
270 * 'cupsNormalizeMapsFree()' - Free all normalization maps in cache.
271 *
272 * This does not actually free; use 'cupsNormalizeMapsFlush()' for that.
273 */
274
275int /* O - Zero or -1 on error */
276cupsNormalizeMapsFree(void)
277{
278 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
279 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
280 _cups_globals_t *cg = _cupsGlobals();
281 /* Pointer to library globals */
282
283
284 /*
285 * See if we already have normalization maps loaded...
286 */
287
288 if (cg->normmap_cache == NULL)
289 return (-1);
290
291 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
292 if (nmap->used > 0)
293 nmap->used --;
294
295 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
296 if (fmap->used > 0)
297 fmap->used --;
298
299 if (cg->propmap_cache && (cg->propmap_cache->used > 0))
300 cg->propmap_cache->used --;
301
302 if (cg->combmap_cache && (cg->combmap_cache->used > 0))
303 cg->combmap_cache->used --;
304
305 if (cg->breakmap_cache && (cg->breakmap_cache->used > 0))
306 cg->breakmap_cache->used --;
307
308 return (0);
309}
310
311
312/*
313 * 'cupsNormalizeMapsFlush()' - Flush all normalization maps in cache.
314 */
315
316void
317cupsNormalizeMapsFlush(void)
fa73b229 318{
319 _cupsNormalizeMapsFlush(_cupsGlobals());
320}
321
322
323/*
324 * '_cupsNormalizeMapsFlush()' - Flush all normalization maps in cache.
325 */
326
327void
328_cupsNormalizeMapsFlush(
329 _cups_globals_t *cg) /* I - Global data */
ef416fc2 330{
331 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
332 _cups_norm_map_t *nextnorm; /* Next Unicode Normalization Map */
333 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
334 _cups_fold_map_t *nextfold; /* Next Unicode Case Folding Map */
ef416fc2 335
336
337 /*
338 * Flush all normalization maps...
339 */
340
341 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nextnorm)
342 {
343 free(nmap->uni2norm);
344 nextnorm = nmap->next;
345 free(nmap);
346 }
347
348 cg->normmap_cache = NULL;
349
350 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = nextfold)
351 {
352 free(fmap->uni2fold);
353 nextfold = fmap->next;
354 free(fmap);
355 }
356
357 cg->foldmap_cache = NULL;
358
359 if (cg->propmap_cache)
360 {
361 free(cg->propmap_cache->uni2prop);
362 free(cg->propmap_cache);
363 cg->propmap_cache = NULL;
364 }
365
366 if (cg->combmap_cache)
367 {
368 free(cg->combmap_cache->uni2comb);
369 free(cg->combmap_cache);
370 cg->combmap_cache = NULL;
371 }
372
373 if (cg->breakmap_cache)
374 {
375 free(cg->breakmap_cache->uni2break);
376 free(cg->breakmap_cache);
377 cg->breakmap_cache = NULL;
378 }
379}
380
381
382/*
383 * 'cupsUTF8Normalize()' - Normalize UTF-8 string.
384 *
385 * Normalize UTF-8 string to Unicode UAX-15 Normalization Form
386 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
387 * unsafe for subsequent transcoding to legacy charsets
388 */
389
390int /* O - Count or -1 on error */
391cupsUTF8Normalize(
392 cups_utf8_t *dest, /* O - Target string */
393 const cups_utf8_t *src, /* I - Source string */
394 const int maxout, /* I - Max output */
395 const cups_normalize_t normalize) /* I - Normalization */
396{
397 int len; /* String length */
398 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
399 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
400
401
402 /*
403 * Check for valid arguments and clear output...
404 */
405
406 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
407 return (-1);
408
409 *dest = 0;
410
411 /*
412 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
413 */
414
415 len = cupsUTF8ToUTF32(work1, src, CUPS_MAX_USTRING);
416
417 if (len < 0)
418 return (-1);
419
420 /*
421 * Normalize internal UCS-4 to second internal UCS-4...
422 */
423
424 len = cupsUTF32Normalize(work2, work1, CUPS_MAX_USTRING, normalize);
425
426 if (len < 0)
427 return (-1);
428
429 /*
430 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
431 */
432
433 len = cupsUTF32ToUTF8(dest, work2, maxout);
434
435 return (len);
436}
437
438
439/*
440 * 'cupsUTF32Normalize()' - Normalize UTF-32 string.
441 *
442 * Normalize UTF-32 string to Unicode UAX-15 Normalization Form
443 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
444 * unsafe for subsequent transcoding to legacy charsets
445 */
446
447int /* O - Count or -1 on error */
448cupsUTF32Normalize(
449 cups_utf32_t *dest, /* O - Target string */
450 const cups_utf32_t *src, /* I - Source string */
451 const int maxout, /* I - Max output */
452 const cups_normalize_t normalize) /* I - Normalization */
453{
454 int i; /* Looping variable */
455 int result; /* Result Value */
456 cups_ucs2_t *mp; /* Map char pointer */
457 int pass; /* Pass count for each transform */
458 int hit; /* Hit count from binary search */
459 cups_utf32_t unichar1; /* Unicode character value */
460 cups_utf32_t unichar2; /* Unicode character value */
461 _cups_comb_class_t class1; /* First Combining Class */
462 _cups_comb_class_t class2; /* Second Combining Class */
463 int len; /* String length */
464 cups_utf32_t work1[CUPS_MAX_USTRING];
465 /* First internal UCS-4 string */
466 cups_utf32_t work2[CUPS_MAX_USTRING];
467 /* Second internal UCS-4 string */
468 cups_utf32_t *p1; /* First UCS-4 string pointer */
469 cups_utf32_t *p2; /* Second UCS-4 string pointer */
470 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
471 cups_normalize_t decompose; /* Decomposition Type */
472 _cups_globals_t *cg = _cupsGlobals();
473 /* Pointer to library globals */
474
475
476 /*
477 * Check for valid arguments and clear output...
478 */
479
480 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
481 return (-1);
482
483 *dest = 0;
484
485 result = cupsNormalizeMapsGet();
486
487 if (result < 0)
488 return (-1);
489
490 /*
491 * Find decomposition map...
492 */
493
494 switch (normalize)
495 {
496 case CUPS_NORM_NFD:
497 case CUPS_NORM_NFC:
498 decompose = CUPS_NORM_NFD;
499 break;
500
501 case CUPS_NORM_NFKD:
502 case CUPS_NORM_NFKC:
503 decompose = CUPS_NORM_NFKD;
504 break;
505
506 default:
507 return (-1);
508 }
509
510 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
511 if (nmap->normalize == decompose)
512 break;
513
514 if (nmap == NULL)
515 return (-1);
516
517 /*
518 * Copy input to internal buffer...
519 */
520
521 p1 = &work1[0];
522
523 for (i = 0; i < CUPS_MAX_USTRING; i ++)
524 {
525 if (*src == 0)
526 break;
527
528 *p1 ++ = *src ++;
529 }
530
531 *p1 = 0;
532 len = i;
533
534 /*
535 * Decompose until no further decomposition...
536 */
537
538 for (pass = 0; pass < 20; pass ++)
539 {
540 p1 = &work1[0];
541 p2 = &work2[0];
542
543 for (hit = 0; *p1 != 0; p1 ++)
544 {
545 /*
546 * Check for decomposition defined...
547 */
548
549 mp = (cups_ucs2_t *)bsearch(p1, nmap->uni2norm, nmap->normcount,
550 (sizeof(cups_ucs2_t) * 3), compare_decompose);
551 if (mp == NULL)
552 {
553 *p2 ++ = *p1;
554 continue;
555 }
556
557 /*
558 * Decompose input character to one or two output characters...
559 */
560
561 hit ++;
562 mp ++;
563 *p2 ++ = (cups_utf32_t) *mp ++;
564
565 if (*mp != 0)
566 *p2 ++ = (cups_utf32_t) *mp;
567 }
568
569 *p2 = 0;
570 len = (int)(p2 - &work2[0]);
571
572 /*
573 * Check for decomposition finished...
574 */
575 if (hit == 0)
576 break;
577 memcpy (work1, work2, sizeof(cups_utf32_t) * (len + 1));
578 }
579
580 /*
581 * Canonical reorder until no further reordering...
582 */
583
584 for (pass = 0; pass < 20; pass ++)
585 {
586 p1 = &work1[0];
587
588 for (hit = 0; *p1 != 0; p1 ++)
589 {
590 /*
591 * Check for combining characters to reorder...
592 */
593
594 unichar1 = *p1;
595 unichar2 = *(p1 + 1);
596
597 if (unichar2 == 0)
598 break;
599
600 class1 = get_combining_class(unichar1);
601 class2 = get_combining_class(unichar2);
602
603 if ((class1 < 0) || (class2 < 0))
604 return (-1);
605
606 if ((class1 == 0) || (class2 == 0))
607 continue;
608
609 if (class1 <= class2)
610 continue;
611
612 /*
613 * Swap two combining characters...
614 */
615
616 *p1 = unichar2;
617 p1 ++;
618 *p1 = unichar1;
619 hit ++;
620 }
621
622 if (hit == 0)
623 break;
624 }
625
626 /*
627 * Check for decomposition only...
628 */
629
630 if (normalize == CUPS_NORM_NFD || normalize == CUPS_NORM_NFKD)
631 {
632 memcpy(dest, work1, sizeof(cups_utf32_t) * (len + 1));
633 return (len);
634 }
635
636 /*
637 * Find composition map...
638 */
639
640 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
641 if (nmap->normalize == CUPS_NORM_NFC)
642 break;
643
644 if (nmap == NULL)
645 return (-1);
646
647 /*
648 * Compose until no further composition...
649 */
650
651 for (pass = 0; pass < 20; pass ++)
652 {
653 p1 = &work1[0];
654 p2 = &work2[0];
655
656 for (hit = 0; *p1 != 0; p1 ++)
657 {
658 /*
659 * Check for composition defined...
660 */
661
662 unichar1 = *p1;
663 unichar2 = *(p1 + 1);
664
665 if (unichar2 == 0)
666 {
667 *p2 ++ = unichar1;
668 break;
669 }
670
671 mp = (cups_ucs2_t *)bsearch(p1, nmap->uni2norm, nmap->normcount,
672 (sizeof(cups_ucs2_t) * 3), compare_compose);
673 if (mp == NULL)
674 {
675 *p2 ++ = *p1;
676 continue;
677 }
678
679 /*
680 * Compose two input characters to one output character...
681 */
682
683 hit ++;
684 mp += 2;
685 *p2 ++ = (cups_utf32_t) *mp;
686 p1 ++;
687 }
688
689 *p2 = 0;
690 len = (int) (p2 - &work2[0]);
691
692 /*
693 * Check for composition finished...
694 */
695
696 if (hit == 0)
697 break;
698
699 memcpy (work1, work2, sizeof(cups_utf32_t) * (len + 1));
700 }
701
702 memcpy (dest, work1, sizeof(cups_utf32_t) * (len + 1));
703
704 cupsNormalizeMapsFree();
705
706 return (len);
707}
708
709
710/*
711 * 'cupsUTF8CaseFold()' - Case fold UTF-8 string.
712 *
713 * Case Fold UTF-8 string per Unicode UAX-21 Section 2.3
714 * Note - Case folding output is
715 * unsafe for subsequent transcoding to legacy charsets
716 */
717
718int /* O - Count or -1 on error */
719cupsUTF8CaseFold(
720 cups_utf8_t *dest, /* O - Target string */
721 const cups_utf8_t *src, /* I - Source string */
722 const int maxout, /* I - Max output */
723 const cups_folding_t fold) /* I - Fold Mode */
724{
725 int len; /* String length */
726 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
727 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
728
729
730 /*
731 * Check for valid arguments and clear output...
732 */
733
734 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
735 return (-1);
736
737 *dest = 0;
738
739 if (fold != CUPS_FOLD_SIMPLE && fold != CUPS_FOLD_FULL)
740 return (-1);
741
742 /*
743 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
744 */
745
746 len = cupsUTF8ToUTF32(work1, src, CUPS_MAX_USTRING);
747
748 if (len < 0)
749 return (-1);
750
751 /*
752 * Case Fold internal UCS-4 to second internal UCS-4...
753 */
754
755 len = cupsUTF32CaseFold(work2, work1, CUPS_MAX_USTRING, fold);
756
757 if (len < 0)
758 return (-1);
759
760 /*
761 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
762 */
763
764 len = cupsUTF32ToUTF8(dest, work2, maxout);
765
766 return (len);
767}
768
769
770/*
771 * 'cupsUTF32CaseFold()' - Case fold UTF-32 string.
772 *
773 * Case Fold UTF-32 string per Unicode UAX-21 Section 2.3
774 * Note - Case folding output is
775 * unsafe for subsequent transcoding to legacy charsets
776 */
777
778int /* O - Count or -1 on error */
779cupsUTF32CaseFold(
780 cups_utf32_t *dest, /* O - Target string */
781 const cups_utf32_t *src, /* I - Source string */
782 const int maxout, /* I - Max output */
783 const cups_folding_t fold) /* I - Fold Mode */
784{
785 cups_utf32_t *start = dest; /* Start of destination string */
786 int i; /* Looping variable */
787 int result; /* Result Value */
788 cups_ucs2_t *mp; /* Map char pointer */
789 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
790 _cups_globals_t *cg = _cupsGlobals();
791 /* Pointer to library globals */
792
793
794 /*
795 * Check for valid arguments and clear output...
796 */
797
798 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
799 return (-1);
800
801 *dest = 0;
802
803 if (fold != CUPS_FOLD_SIMPLE && fold != CUPS_FOLD_FULL)
804 return (-1);
805
806 /*
807 * Find case folding map...
808 */
809
810 result = cupsNormalizeMapsGet();
811
812 if (result < 0)
813 return (-1);
814
815 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
816 if (fmap->fold == fold)
817 break;
818
819 if (fmap == NULL)
820 return (-1);
821
822 /*
823 * Case fold input string to output string...
824 */
825
826 for (i = 0; i < (maxout - 1); i ++, src ++)
827 {
828 /*
829 * Check for case folding defined...
830 */
831
832 mp = (cups_ucs2_t *)bsearch(src, fmap->uni2fold, fmap->foldcount,
833 (sizeof(cups_ucs2_t) * 4), compare_foldchar);
834 if (mp == NULL)
835 {
836 *dest ++ = *src;
837 continue;
838 }
839
840 /*
841 * Case fold input character to one or two output characters...
842 */
843
844 mp ++;
845 *dest ++ = (cups_utf32_t) *mp ++;
846
847 if (*mp != 0 && fold == CUPS_FOLD_FULL)
848 {
849 i ++;
850 if (i >= (maxout - 1))
851 break;
852
853 *dest ++ = (cups_utf32_t) *mp;
854 }
855 }
856
857 *dest = 0;
858
859 cupsNormalizeMapsFree();
860
861 return ((int)(dest - start));
862}
863
864
865/*
866 * 'cupsUTF8CompareCaseless()' - Compare case folded UTF-8 strings.
867 */
868
869int /* O - Difference of strings */
870cupsUTF8CompareCaseless(
871 const cups_utf8_t *s1, /* I - String1 */
872 const cups_utf8_t *s2) /* I - String2 */
873{
874 int difference; /* Difference of two strings */
875 int len; /* String length */
876 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
877 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
878
879
880 /*
881 * Check for valid arguments...
882 */
883
884 if (!s1 || !s2)
885 return (-1);
886
887 /*
888 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
889 */
890
891 len = cupsUTF8ToUTF32(work1, s1, CUPS_MAX_USTRING);
892
893 if (len < 0)
894 return (-1);
895
896 len = cupsUTF8ToUTF32(work2, s2, CUPS_MAX_USTRING);
897
898 if (len < 0)
899 return (-1);
900
901 /*
902 * Compare first internal UCS-4 to second internal UCS-4...
903 */
904
905 difference = cupsUTF32CompareCaseless(work1, work2);
906
907 return (difference);
908}
909
910
911/*
912 * 'cupsUTF32CompareCaseless()' - Compare case folded UTF-32 strings.
913 */
914
915int /* O - Difference of strings */
916cupsUTF32CompareCaseless(
917 const cups_utf32_t *s1, /* I - String1 */
918 const cups_utf32_t *s2) /* I - String2 */
919{
920 int difference; /* Difference of two strings */
921 int len; /* String length */
922 cups_folding_t fold = CUPS_FOLD_FULL;
923 /* Case folding mode */
924 cups_utf32_t fold1[CUPS_MAX_USTRING];
925 /* First UCS-4 folded string */
926 cups_utf32_t fold2[CUPS_MAX_USTRING];
927 /* Second UCS-4 folded string */
928 cups_utf32_t *p1; /* First UCS-4 string pointer */
929 cups_utf32_t *p2; /* Second UCS-4 string pointer */
930
931
932 /*
933 * Check for valid arguments...
934 */
935
936 if (!s1 || !s2)
937 return (-1);
938
939 /*
940 * Case Fold input UTF-32 strings to internal UCS-4 strings...
941 */
942
943 len = cupsUTF32CaseFold(fold1, s1, CUPS_MAX_USTRING, fold);
944
945 if (len < 0)
946 return (-1);
947
948 len = cupsUTF32CaseFold(fold2, s2, CUPS_MAX_USTRING, fold);
949
950 if (len < 0)
951 return (-1);
952
953 /*
954 * Compare first internal UCS-4 to second internal UCS-4...
955 */
956
957 p1 = &fold1[0];
958 p2 = &fold2[0];
959
960 for (;; p1 ++, p2 ++)
961 {
962 difference = (int) (*p1 - *p2);
963
964 if (difference != 0)
965 break;
966
967 if ((*p1 == 0) && (*p2 == 0))
968 break;
969 }
970
971 return (difference);
972}
973
974
975/*
976 * 'cupsUTF8CompareIdentifier()' - Compare folded NFKC UTF-8 strings.
977 */
978
979int /* O - Result of comparison */
980cupsUTF8CompareIdentifier(
981 const cups_utf8_t *s1, /* I - String1 */
982 const cups_utf8_t *s2) /* I - String2 */
983{
984 int difference; /* Difference of two strings */
985 int len; /* String length */
986 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
987 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
988
989
990 /*
991 * Check for valid arguments...
992 */
993
994 if (!s1 || !s2)
995 return (-1);
996
997 /*
998 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
999 */
1000
1001 len = cupsUTF8ToUTF32(work1, s1, CUPS_MAX_USTRING);
1002
1003 if (len < 0)
1004 return (-1);
1005
1006 len = cupsUTF8ToUTF32(work2, s2, CUPS_MAX_USTRING);
1007
1008 if (len < 0)
1009 return (-1);
1010
1011 /*
1012 * Compare first internal UCS-4 to second internal UCS-4...
1013 */
1014
1015 difference = cupsUTF32CompareIdentifier(work1, work2);
1016
1017 return (difference);
1018}
1019
1020
1021/*
1022 * 'cupsUTF32CompareIdentifier()' - Compare folded NFKC UTF-32 strings.
1023 */
1024
1025int /* O - Result of comparison */
1026cupsUTF32CompareIdentifier(
1027 const cups_utf32_t *s1, /* I - String1 */
1028 const cups_utf32_t *s2) /* I - String2 */
1029{
1030 int difference; /* Difference of two strings */
1031 int len; /* String length */
1032 cups_folding_t fold = CUPS_FOLD_FULL;
1033 /* Case folding mode */
1034 cups_utf32_t fold1[CUPS_MAX_USTRING];
1035 /* First UCS-4 folded string */
1036 cups_utf32_t fold2[CUPS_MAX_USTRING];
1037 /* Second UCS-4 folded string */
1038 cups_normalize_t normalize = CUPS_NORM_NFKC;
1039 /* Normalization form */
1040 cups_utf32_t norm1[CUPS_MAX_USTRING];
1041 /* First UCS-4 normalized string */
1042 cups_utf32_t norm2[CUPS_MAX_USTRING];
1043 /* Second UCS-4 normalized string */
1044 cups_utf32_t *p1; /* First UCS-4 string pointer */
1045 cups_utf32_t *p2; /* Second UCS-4 string pointer */
1046
1047
1048 /*
1049 * Check for valid arguments...
1050 */
1051
1052 if (!s1 || !s2)
1053 return (-1);
1054
1055 /*
1056 * Case Fold input UTF-32 strings to internal UCS-4 strings...
1057 */
1058
1059 len = cupsUTF32CaseFold(fold1, s1, CUPS_MAX_USTRING, fold);
1060
1061 if (len < 0)
1062 return (-1);
1063
1064 len = cupsUTF32CaseFold(fold2, s2, CUPS_MAX_USTRING, fold);
1065
1066 if (len < 0)
1067 return (-1);
1068
1069 /*
1070 * Normalize internal UCS-4 strings to NFKC...
1071 */
1072
1073 len = cupsUTF32Normalize(norm1, fold1, CUPS_MAX_USTRING, normalize);
1074
1075 if (len < 0)
1076 return (-1);
1077
1078 len = cupsUTF32Normalize(norm2, fold2, CUPS_MAX_USTRING, normalize);
1079
1080 if (len < 0)
1081 return (-1);
1082
1083 /*
1084 * Compare first internal UCS-4 to second internal UCS-4...
1085 */
1086
1087 p1 = &norm1[0];
1088 p2 = &norm2[0];
1089
1090 for (;; p1 ++, p2 ++)
1091 {
1092 difference = (int) (*p1 - *p2);
1093
1094 if (difference != 0)
1095 break;
1096
1097 if ((*p1 == 0) && (*p2 == 0))
1098 break;
1099 }
1100
1101 return (difference);
1102}
1103
1104
1105/*
1106 * 'cupsUTF32CharacterProperty()' - Get UTF-32 character property.
1107 */
1108
1109int /* O - Result of comparison */
1110cupsUTF32CharacterProperty(
1111 const cups_utf32_t ch, /* I - Source char */
1112 const cups_property_t prop) /* I - Char Property */
1113{
1114 int result; /* Result Value */
1115
1116
1117 /*
1118 * Check for valid arguments...
1119 */
1120
1121 if (ch == 0)
1122 return (-1);
1123
1124 /*
1125 * Find character property...
1126 */
1127
1128 switch (prop)
1129 {
1130 case CUPS_PROP_GENERAL_CATEGORY:
1131 result = (get_general_category(ch));
1132 break;
1133
1134 case CUPS_PROP_BIDI_CATEGORY:
1135 result = (get_bidi_category(ch));
1136 break;
1137
1138 case CUPS_PROP_COMBINING_CLASS:
1139 result = (get_combining_class(ch));
1140 break;
1141 case CUPS_PROP_BREAK_CLASS:
1142 result = (get_break_class(ch));
1143 break;
1144
1145 default:
1146 return (-1);
1147 }
1148
1149 return (result);
1150}
1151
1152
1153/*
1154 * 'get_general_category()' - Get UTF-32 Character General Category.
1155 */
1156
1157static int /* O - Class or -1 on error */
1158get_general_category(
1159 const cups_utf32_t ch) /* I - Source char */
1160{
1161 int result; /* Result Value */
1162 cups_gencat_t gencat; /* General Category Value */
1163 _cups_prop_map_t *pmap; /* Unicode Property Map */
1164 _cups_prop_t *uni2prop; /* Unicode Char -> Properties */
1165 _cups_globals_t *cg = _cupsGlobals();
1166 /* Pointer to library globals */
1167
1168
1169 /*
1170 * Check for valid argument...
1171 */
1172
1173 if (ch == 0)
1174 return (-1);
1175
1176 /*
1177 * Find property map...
1178 */
1179
1180 result = cupsNormalizeMapsGet();
1181
1182 if (result < 0)
1183 return (-1);
1184
1185 pmap = cg->propmap_cache;
1186
1187 if (pmap == NULL)
1188 return (-1);
1189
1190 /*
1191 * Find character in map...
1192 */
1193
1194 uni2prop = (_cups_prop_t *)bsearch(&ch, pmap->uni2prop, pmap->propcount,
1195 (sizeof(_cups_prop_t)), compare_propchar);
1196
1197 cupsNormalizeMapsFree();
1198
1199 if (uni2prop == NULL)
1200 gencat = CUPS_GENCAT_CN; /* Other, Not Assigned */
1201 else
1202 gencat = (cups_gencat_t)uni2prop->gencat;
1203
1204 result = (int)gencat;
1205
1206 return (result);
1207}
1208
1209
1210/*
1211 * 'get_bidi_category()' - Get UTF-32 Character Bidi Category.
1212 */
1213
1214static int /* O - Class or -1 on error */
1215get_bidi_category(const cups_utf32_t ch)/* I - Source char */
1216{
1217 int result; /* Result Value */
1218 cups_bidi_t bidicat; /* Bidi Category Value */
1219 _cups_prop_map_t *pmap; /* Unicode Property Map */
1220 _cups_prop_t *uni2prop; /* Unicode Char -> Properties */
1221 _cups_globals_t *cg = _cupsGlobals();
1222 /* Pointer to library globals */
1223
1224
1225 /*
1226 * Check for valid argument...
1227 */
1228
1229 if (ch == 0)
1230 return (-1);
1231
1232 /*
1233 * Find property map...
1234 */
1235
1236 result = cupsNormalizeMapsGet();
1237
1238 if (result < 0)
1239 return (-1);
1240
1241 pmap = cg->propmap_cache;
1242
1243 if (pmap == NULL)
1244 return (-1);
1245
1246 /*
1247 * Find character in map...
1248 */
1249
1250 uni2prop = (_cups_prop_t *)bsearch(&ch, pmap->uni2prop, pmap->propcount,
1251 (sizeof(_cups_prop_t)), compare_propchar);
1252
1253 cupsNormalizeMapsFree();
1254
1255 if (uni2prop == NULL)
1256 bidicat = CUPS_BIDI_ON; /* Other Neutral */
1257 else
1258 bidicat = (cups_bidi_t)uni2prop->bidicat;
1259
1260 result = (int)bidicat;
1261
1262 return (result);
1263}
1264
1265/*
1266 * 'get_combining_class()' - Get UTF-32 Character Combining Class.
1267 *
1268 * Note - Zero is non-combining (base character)
1269 */
1270
1271static int /* O - Class or -1 on error */
1272get_combining_class(
1273 const cups_utf32_t ch) /* I - Source char */
1274{
1275 int result; /* Result Value */
1276 _cups_comb_map_t *cmap; /* Unicode Combining Class Map */
1277 _cups_comb_class_t combclass; /* Unicode Combining Class */
1278 _cups_comb_t *uni2comb; /* Unicode Char -> Combining Class */
1279 _cups_globals_t *cg = _cupsGlobals();
1280 /* Pointer to library globals */
1281
1282
1283 /*
1284 * Check for valid argument...
1285 */
1286
1287 if (ch == 0)
1288 return (-1);
1289
1290 /*
1291 * Find combining class map...
1292 */
1293
1294 result = cupsNormalizeMapsGet();
1295
1296 if (result < 0)
1297 return (-1);
1298
1299 cmap = cg->combmap_cache;
1300
1301 if (cmap == NULL)
1302 return (-1);
1303
1304 /*
1305 * Find combining character in map...
1306 */
1307
1308 uni2comb = (_cups_comb_t *)bsearch(&ch, cmap->uni2comb, cmap->combcount,
1309 (sizeof(_cups_comb_t)), compare_combchar);
1310
1311 cupsNormalizeMapsFree();
1312
1313 if (uni2comb == NULL)
1314 combclass = 0;
1315 else
1316 combclass = (_cups_comb_class_t)uni2comb->combclass;
1317
1318 result = (int)combclass;
1319
1320 return (result);
1321}
1322
1323
1324/*
1325 * 'get_break_class()' - Get UTF-32 Character Line Break Class.
1326 */
1327
1328static int /* O - Class or -1 on error */
1329get_break_class(const cups_utf32_t ch) /* I - Source char */
1330{
1331 int result; /* Result Value */
1332 _cups_break_map_t *bmap; /* Unicode Line Break Class Map */
1333 cups_break_class_t breakclass; /* Unicode Line Break Class */
1334 cups_ucs2_t *uni2break; /* Unicode -> Line Break Class */
1335 _cups_globals_t *cg = _cupsGlobals();
1336 /* Pointer to library globals */
1337
1338
1339 /*
1340 * Check for valid argument...
1341 */
1342
1343 if (ch == 0)
1344 return (-1);
1345
1346 /*
1347 * Find line break class map...
1348 */
1349
1350 result = cupsNormalizeMapsGet();
1351
1352 if (result < 0)
1353 return (-1);
1354
1355 bmap = cg->breakmap_cache;
1356
1357 if (bmap == NULL)
1358 return (-1);
1359
1360 /*
1361 * Find line break character in map...
1362 */
1363
1364 uni2break = (cups_ucs2_t *)bsearch(&ch, bmap->uni2break, bmap->breakcount,
1365 (sizeof(cups_ucs2_t) * 3),
1366 compare_breakchar);
1367
1368 cupsNormalizeMapsFree();
1369
1370 if (uni2break == NULL)
1371 breakclass = CUPS_BREAK_AI;
1372 else
1373 breakclass = (cups_break_class_t)*(uni2break + 2);
1374
1375 result = (int)breakclass;
1376
1377 return (result);
1378}
1379
1380
1381/*
1382 * 'get_map_count()' - Count lines in a map file.
1383 */
1384
1385static int /* O - Count or -1 on error */
1386get_map_count(const char *filename) /* I - Map Filename */
1387{
1388 int i; /* Looping variable */
1389 cups_file_t *fp; /* Map input file pointer */
1390 char *s; /* Line parsing pointer */
1391 char line[256]; /* Line from input map file */
1392 cups_utf32_t unichar; /* Unicode character value */
1393
1394
1395 /*
1396 * Open map input file...
1397 */
1398
1399 if (!filename || !*filename)
1400 return (-1);
1401
1402 fp = cupsFileOpen(filename, "r");
1403 if (fp == NULL)
1404 return (-1);
1405
1406 /*
1407 * Count lines in map input file...
1408 */
1409
1410 for (i = 0; i < 50000;)
1411 {
1412 s = cupsFileGets(fp, line, sizeof(line));
1413 if (s == NULL)
1414 break;
1415 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1416 continue;
1417 if (strncmp (s, "0x", 2) == 0)
1418 s += 2;
1419 if (sscanf(s, "%lx", &unichar) != 1)
1420 break;
1421 if (unichar > 0xffff)
1422 break;
1423 i ++;
1424 }
1425 if (i == 0)
1426 i = -1;
1427
1428 /*
1429 * Close file and return map count (non-comment line count)...
1430 */
1431
1432 cupsFileClose(fp);
1433
1434 return (i);
1435}
1436
1437
1438/*
1439 * 'get_normmap()' - Get Unicode normalization map to cache.
1440 */
1441
1442static int /* O - Zero or -1 on error */
1443get_normmap(
1444 const cups_normalize_t normalize) /* I - Normalization Form */
1445{
1446 int i; /* Looping variable */
1447 cups_utf32_t unichar1; /* Unicode character value */
1448 cups_utf32_t unichar2; /* Unicode character value */
1449 cups_utf32_t unichar3; /* Unicode character value */
1450 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
1451 int normcount; /* Count of Unicode Source Chars */
1452 cups_ucs2_t *uni2norm; /* Unicode Char -> Normalization */
1453 char *mapname; /* Normalization map name */
1454 char filename[1024]; /* Filename for charset map file */
1455 cups_file_t *fp; /* Normalization map file pointer */
1456 char *s; /* Line parsing pointer */
1457 char line[256]; /* Line from input map file */
1458 _cups_globals_t *cg = _cupsGlobals();
1459 /* Pointer to library globals */
1460
1461
1462 /*
1463 * See if we already have this normalization map loaded...
1464 */
1465
1466 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
1467 if (nmap->normalize == normalize)
1468 return (0);
1469
1470 /*
1471 * Get the mapping name...
1472 */
1473
1474 switch (normalize)
1475 {
1476 case CUPS_NORM_NFD: /* Canonical Decomposition */
1477 mapname = "uni-nfd.txt";
1478 break;
1479
1480 case CUPS_NORM_NFKD: /* Compatibility Decomposition */
1481 mapname = "uni-nfkd.txt";
1482 break;
1483
1484 case CUPS_NORM_NFC: /* Canonical Composition */
1485 mapname = "uni-nfc.txt";
1486 break;
1487
1488 case CUPS_NORM_NFKC: /* no such map file... */
1489 default:
1490 return (-1);
1491 }
1492
1493 /*
1494 * Open normalization map input file...
1495 */
1496
1497 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1498 cg->cups_datadir, mapname);
1499 if ((normcount = get_map_count(filename)) <= 0)
1500 return (-1);
1501
1502 fp = cupsFileOpen(filename, "r");
1503 if (fp == NULL)
1504 return (-1);
1505
1506 /*
1507 * Allocate memory for normalization map and add to cache...
1508 */
1509
1510 nmap = (_cups_norm_map_t *)calloc(1, sizeof(_cups_norm_map_t));
1511 if (nmap == NULL)
1512 {
1513 cupsFileClose(fp);
1514 return (-1);
1515 }
1516
1517 uni2norm = (cups_ucs2_t *)calloc(1, sizeof(cups_ucs2_t) * 3 * normcount);
1518 if (uni2norm == NULL)
1519 {
1520 free(nmap);
1521 cupsFileClose(fp);
1522 return (-1);
1523 }
1524 nmap->next = cg->normmap_cache;
1525 cg->normmap_cache = nmap;
1526 nmap->used ++;
1527 nmap->normalize = normalize;
1528 nmap->normcount = normcount;
1529 nmap->uni2norm = uni2norm;
1530
1531 /*
1532 * Save normalization map into memory for later use...
1533 */
1534 for (i = 0; i < normcount; )
1535 {
1536 s = cupsFileGets(fp, line, sizeof(line));
1537 if (s == NULL)
1538 break;
1539 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1540 continue;
1541 if (sscanf(s, "%lx %lx %lx", &unichar1, &unichar2, &unichar3) != 3)
1542 break;
1543 if ((unichar1 > 0xffff)
1544 || (unichar2 > 0xffff)
1545 || (unichar3 > 0xffff))
1546 break;
1547 *uni2norm ++ = (cups_ucs2_t) unichar1;
1548 *uni2norm ++ = (cups_ucs2_t) unichar2;
1549 *uni2norm ++ = (cups_ucs2_t) unichar3;
1550 i ++;
1551 }
1552 if (i < normcount)
1553 nmap->normcount = i;
1554 cupsFileClose(fp);
1555 return (0);
1556}
1557
1558
1559/*
1560 * 'get_foldmap()' - Get Unicode case folding map to cache.
1561 */
1562
1563static int /* O - Zero or -1 on error */
1564get_foldmap(const cups_folding_t fold) /* I - Case folding type */
1565{
1566 int i; /* Looping variable */
1567 cups_utf32_t unichar1; /* Unicode character value */
1568 cups_utf32_t unichar2; /* Unicode character value */
1569 cups_utf32_t unichar3; /* Unicode character value */
1570 cups_utf32_t unichar4; /* Unicode character value */
1571 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
1572 int foldcount; /* Count of Unicode Source Chars */
1573 cups_ucs2_t *uni2fold; /* Unicode -> Folded Char(s) */
1574 char *mapname; /* Case Folding map name */
1575 char filename[1024]; /* Filename for charset map file */
1576 cups_file_t *fp; /* Case Folding map file pointer */
1577 char *s; /* Line parsing pointer */
1578 char line[256]; /* Line from input map file */
1579 _cups_globals_t *cg = _cupsGlobals();
1580 /* Pointer to library globals */
1581
1582
1583 /*
1584 * See if we already have this case folding map loaded...
1585 */
1586
1587 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
1588 if (fmap->fold == fold)
1589 return (0);
1590
1591 /*
1592 * Get the mapping name...
1593 */
1594
1595 switch (fold)
1596 {
1597 case CUPS_FOLD_SIMPLE: /* Simple case folding */
1598 mapname = "uni-fold.txt";
1599 break;
1600 case CUPS_FOLD_FULL: /* Full case folding */
1601 mapname = "uni-full.txt";
1602 break;
1603 default:
1604 return (-1);
1605 }
1606
1607 /*
1608 * Open case folding map input file...
1609 */
1610
1611 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1612 cg->cups_datadir, mapname);
1613 if ((foldcount = get_map_count(filename)) <= 0)
1614 return (-1);
1615 fp = cupsFileOpen(filename, "r");
1616 if (fp == NULL)
1617 return (-1);
1618
1619 /*
1620 * Allocate memory for case folding map and add to cache...
1621 */
1622 fmap = (_cups_fold_map_t *)calloc(1, sizeof(_cups_fold_map_t));
1623 if (fmap == NULL)
1624 {
1625 cupsFileClose(fp);
1626 return (-1);
1627 }
1628 uni2fold = (cups_ucs2_t *)calloc(1, sizeof(cups_ucs2_t) * 4 * foldcount);
1629 if (uni2fold == NULL)
1630 {
1631 free(fmap);
1632 cupsFileClose(fp);
1633 return (-1);
1634 }
1635 fmap->next = cg->foldmap_cache;
1636 cg->foldmap_cache = fmap;
1637 fmap->used ++;
1638 fmap->fold = fold;
1639 fmap->foldcount = foldcount;
1640 fmap->uni2fold = uni2fold;
1641
1642 /*
1643 * Save case folding map into memory for later use...
1644 */
1645
1646 for (i = 0; i < foldcount; )
1647 {
1648 s = cupsFileGets(fp, line, sizeof(line));
1649 if (s == NULL)
1650 break;
1651 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1652 continue;
1653 unichar1 = unichar2 = unichar3 = unichar4 = 0;
1654 if ((fold == CUPS_FOLD_SIMPLE)
1655 && (sscanf(s, "%lx %lx", &unichar1, &unichar2) != 2))
1656 break;
1657 if ((fold == CUPS_FOLD_FULL)
1658 && (sscanf(s, "%lx %lx %lx %lx",
1659 &unichar1, &unichar2, &unichar3, &unichar4) != 4))
1660 break;
1661 if ((unichar1 > 0xffff)
1662 || (unichar2 > 0xffff)
1663 || (unichar3 > 0xffff)
1664 || (unichar4 > 0xffff))
1665 break;
1666 *uni2fold ++ = (cups_ucs2_t) unichar1;
1667 *uni2fold ++ = (cups_ucs2_t) unichar2;
1668 *uni2fold ++ = (cups_ucs2_t) unichar3;
1669 *uni2fold ++ = (cups_ucs2_t) unichar4;
1670 i ++;
1671 }
1672 if (i < foldcount)
1673 fmap->foldcount = i;
1674 cupsFileClose(fp);
1675 return (0);
1676}
1677
1678/*
1679 * 'get_propmap()' - Get Unicode character property map to cache.
1680 */
1681
1682static int /* O - Zero or -1 on error */
1683get_propmap(void)
1684{
1685 int i, j; /* Looping variables */
1686 size_t len; /* String length */
1687 cups_utf32_t unichar; /* Unicode character value */
1688 cups_gencat_t gencat; /* General Category Value */
1689 cups_bidi_t bidicat; /* Bidi Category Value */
1690 _cups_prop_map_t *pmap; /* Unicode Char Property Map */
1691 int propcount; /* Count of Unicode Source Chars */
1692 _cups_prop_t *uni2prop; /* Unicode Char -> Properties */
1693 char *mapname; /* Char Property map name */
1694 char filename[1024]; /* Filename for charset map file */
1695 cups_file_t *fp; /* Char Property map file pointer */
1696 char *s; /* Line parsing pointer */
1697 char line[256]; /* Line from input map file */
1698 _cups_globals_t *cg = _cupsGlobals();
1699 /* Pointer to library globals */
1700
1701
1702 /*
1703 * See if we already have this char properties map loaded...
1704 */
1705
1706 if ((pmap = cg->propmap_cache) != NULL)
1707 return (0);
1708
1709 /*
1710 * Get the mapping name...
1711 */
1712
1713 mapname = "uni-prop.txt";
1714
1715 /*
1716 * Open char properties map input file...
1717 */
1718 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1719 cg->cups_datadir, mapname);
1720 if ((propcount = get_map_count(filename)) <= 0)
1721 return (-1);
1722 fp = cupsFileOpen(filename, "r");
1723 if (fp == NULL)
1724 return (-1);
1725
1726 /*
1727 * Allocate memory for char properties map and add to cache...
1728 */
1729 pmap = (_cups_prop_map_t *)calloc(1, sizeof(_cups_prop_map_t));
1730 if (pmap == NULL)
1731 {
1732 cupsFileClose(fp);
1733 return (-1);
1734 }
1735 uni2prop = (_cups_prop_t *)calloc(1, sizeof(_cups_prop_t) * propcount);
1736 if (uni2prop == NULL)
1737 {
1738 free(pmap);
1739 cupsFileClose(fp);
1740 return (-1);
1741 }
1742 cg->propmap_cache = pmap;
1743 pmap->used ++;
1744 pmap->propcount = propcount;
1745 pmap->uni2prop = uni2prop;
1746
1747 /*
1748 * Save char properties map into memory for later use...
1749 */
1750 for (i = 0; i < propcount; )
1751 {
1752 s = cupsFileGets(fp, line, sizeof(line));
1753 if (s == NULL)
1754 break;
1755 if (strlen(s) > 0)
1756 *(s + strlen(s) - 1) = '\0';
1757 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1758 continue;
1759 if (sscanf(s, "%lx", &unichar) != 1)
1760 break;
1761 if (unichar > 0xffff)
1762 break;
1763 while ((*s != '\0') && (*s != ';'))
1764 s ++;
1765 if (*s != ';')
1766 break;
1767 s ++;
1768 for (j = 0; gencat_index[j].str != NULL; j ++)
1769 {
1770 len = strlen(gencat_index[j].str);
1771 if (strncmp (s, gencat_index[j].str, len) == 0)
1772 break;
1773 }
1774 if (gencat_index[j].str == NULL)
1775 return (-1);
1776 gencat = gencat_index[j].gencat;
1777 while ((*s != '\0') && (*s != ';'))
1778 s ++;
1779 if (*s != ';')
1780 break;
1781 s ++;
1782 for (j = 0; bidicat_index[j] != NULL; j ++)
1783 {
1784 len = strlen(bidicat_index[j]);
1785 if (strncmp (s, bidicat_index[j], len) == 0)
1786 break;
1787 }
1788 if (bidicat_index[j] == NULL)
1789 return (-1);
1790 bidicat = (cups_bidi_t) j;
1791 uni2prop->ch = (cups_ucs2_t) unichar;
1792 uni2prop->gencat = (unsigned char) gencat;
1793 uni2prop->bidicat = (unsigned char) bidicat;
1794 uni2prop ++;
1795 i ++;
1796 }
1797 if (i < propcount)
1798 pmap->propcount = i;
1799 cupsFileClose(fp);
1800 return (0);
1801}
1802
1803
1804/*
1805 * 'get_combmap()' - Get Unicode combining class map to cache.
1806 */
1807
1808static int /* O - Zero or -1 on error */
1809get_combmap(void)
1810{
1811 int i; /* Looping variable */
1812 cups_utf32_t unichar; /* Unicode character value */
1813 int combclass; /* Unicode char combining class */
1814 _cups_comb_map_t *cmap; /* Unicode Comb Class Map */
1815 int combcount; /* Count of Unicode Source Chars */
1816 _cups_comb_t *uni2comb; /* Unicode Char -> Combining Class */
1817 char *mapname; /* Comb Class map name */
1818 char filename[1024]; /* Filename for charset map file */
1819 cups_file_t *fp; /* Comb Class map file pointer */
1820 char *s; /* Line parsing pointer */
1821 char line[256]; /* Line from input map file */
1822 _cups_globals_t *cg = _cupsGlobals();
1823 /* Pointer to library globals */
1824
1825
1826 /*
1827 * See if we already have this combining class map loaded...
1828 */
1829
1830 if ((cmap = cg->combmap_cache) != NULL)
1831 return (0);
1832
1833 /*
1834 * Get the mapping name...
1835 */
1836
1837 mapname = "uni-comb.txt";
1838
1839 /*
1840 * Open combining class map input file...
1841 */
1842
1843 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1844 cg->cups_datadir, mapname);
1845 if ((combcount = get_map_count(filename)) <= 0)
1846 return (-1);
1847 fp = cupsFileOpen(filename, "r");
1848 if (fp == NULL)
1849 return (-1);
1850
1851 /*
1852 * Allocate memory for combining class map and add to cache...
1853 */
1854
1855 cmap = (_cups_comb_map_t *)calloc(1, sizeof(_cups_comb_map_t));
1856 if (cmap == NULL)
1857 {
1858 cupsFileClose(fp);
1859 return (-1);
1860 }
1861
1862 uni2comb = (_cups_comb_t *)calloc(1, sizeof(_cups_comb_t) * combcount);
1863 if (uni2comb == NULL)
1864 {
1865 free(cmap);
1866 cupsFileClose(fp);
1867 return (-1);
1868 }
1869 cg->combmap_cache = cmap;
1870 cmap->used ++;
1871 cmap->combcount = combcount;
1872 cmap->uni2comb = uni2comb;
1873
1874 /*
1875 * Save combining class map into memory for later use...
1876 */
1877 for (i = 0; i < combcount; )
1878 {
1879 s = cupsFileGets(fp, line, sizeof(line));
1880 if (s == NULL)
1881 break;
1882 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1883 continue;
1884 if (sscanf(s, "%lx", &unichar) != 1)
1885 break;
1886 if (unichar > 0xffff)
1887 break;
1888 while ((*s != '\0') && (*s != ';'))
1889 s ++;
1890 if (*s != ';')
1891 break;
1892 s ++;
1893 if (sscanf(s, "%d", &combclass) != 1)
1894 break;
1895 uni2comb->ch = (cups_ucs2_t) unichar;
1896 uni2comb->combclass = (unsigned char) combclass;
1897 uni2comb ++;
1898 i ++;
1899 }
1900 if (i < combcount)
1901 cmap->combcount = i;
1902 cupsFileClose(fp);
1903 return (0);
1904}
1905
1906
1907/*
1908 * 'get_breakmap()' - Get Unicode line break class map to cache.
1909 */
1910
1911static int /* O - Zero or -1 on error */
1912get_breakmap(void)
1913{
1914 int i, j; /* Looping variables */
1915 int len; /* String length */
1916 cups_utf32_t unichar1; /* Unicode character value */
1917 cups_utf32_t unichar2; /* Unicode character value */
1918 cups_break_class_t breakclass; /* Unicode char line break class */
1919 _cups_break_map_t *bmap; /* Unicode Line Break Class Map */
1920 int breakcount; /* Count of Unicode Source Chars */
1921 cups_ucs2_t *uni2break; /* Unicode -> Line Break Class */
1922 char *mapname; /* Comb Class map name */
1923 char filename[1024]; /* Filename for charset map file */
1924 cups_file_t *fp; /* Comb Class map file pointer */
1925 char *s; /* Line parsing pointer */
1926 char line[256]; /* Line from input map file */
1927 _cups_globals_t *cg = _cupsGlobals();
1928 /* Pointer to library globals */
1929
1930
1931 /*
1932 * See if we already have this line break class map loaded...
1933 */
1934
1935 if ((bmap = cg->breakmap_cache) != NULL)
1936 return (0);
1937
1938 /*
1939 * Get the mapping name...
1940 */
1941
1942 mapname = "uni-line.txt";
1943
1944 /*
1945 * Open line break class map input file...
1946 */
1947
1948 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1949 cg->cups_datadir, mapname);
1950 if ((breakcount = get_map_count(filename)) <= 0)
1951 return (-1);
1952 fp = cupsFileOpen(filename, "r");
1953 if (fp == NULL)
1954 return (-1);
1955
1956 /*
1957 * Allocate memory for line break class map and add to cache...
1958 */
1959
1960 bmap = (_cups_break_map_t *)calloc(1, sizeof(_cups_break_map_t));
1961 if (bmap == NULL)
1962 {
1963 cupsFileClose(fp);
1964 return (-1);
1965 }
1966
1967 uni2break = (cups_ucs2_t *)calloc(1, sizeof(cups_ucs2_t) * 3 * breakcount);
1968 if (uni2break == NULL)
1969 {
1970 free(bmap);
1971 cupsFileClose(fp);
1972 return (-1);
1973 }
1974 cg->breakmap_cache = bmap;
1975 bmap->used ++;
1976 bmap->breakcount = breakcount;
1977 bmap->uni2break = uni2break;
1978
1979 /*
1980 * Save line break class map into memory for later use...
1981 */
1982 for (i = 0; i < breakcount; )
1983 {
1984 s = cupsFileGets(fp, line, sizeof(line));
1985 if (s == NULL)
1986 break;
1987 if (strlen(s) > 0)
1988 *(s + strlen(s) - 1) = '\0';
1989 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1990 continue;
1991 if (sscanf(s, "%lx %lx", &unichar1, &unichar2) != 2)
1992 break;
1993 if ((unichar1 > 0xffff)
1994 || (unichar2 > 0xffff))
1995 break;
1996 while ((*s != '\0') && (*s != ';'))
1997 s ++;
1998 if (*s != ';')
1999 break;
2000 s ++;
2001 for (j = 0; break_index[j].str != NULL; j ++)
2002 {
2003 len = strlen (break_index[j].str);
2004 if (strncmp (s, break_index[j].str, len) == 0)
2005 break;
2006 }
2007 if (break_index[j].str == NULL)
2008 return (-1);
2009 breakclass = break_index[j].breakclass;
2010 *uni2break ++ = (cups_ucs2_t) unichar1;
2011 *uni2break ++ = (cups_ucs2_t) unichar2;
2012 *uni2break ++ = (cups_ucs2_t) breakclass;
2013 i ++;
2014 }
2015 if (i < breakcount)
2016 bmap->breakcount = i;
2017 cupsFileClose(fp);
2018 return (0);
2019}
2020
2021
2022/*
2023 * 'compare_compose()' - Compare key for compose match.
2024 *
2025 * Note - This function cannot be easily modified for 32-bit Unicode.
2026 */
2027
2028static int /* O - Result of comparison */
2029compare_compose(const void *k1, /* I - Key char */
2030 const void *k2) /* I - Map char */
2031{
2032 cups_utf32_t *kp = (cups_utf32_t *)k1;
2033 /* Key char pointer */
2034 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2035 unsigned long key; /* Pair of key characters */
2036 unsigned long map; /* Pair of map characters */
2037 int result; /* Result Value */
2038
2039
2040 key = (*kp << 16);
2041 key |= *(kp + 1);
2042 map = (unsigned long) (*mp << 16);
2043 map |= (unsigned long) *(mp + 1);
2044
2045 if (key >= map)
2046 result = (int) (key - map);
2047 else
2048 result = -1 * ((int) (map - key));
2049
2050 return (result);
2051}
2052
2053
2054/*
2055 * 'compare_decompose()' - Compare key for decompose match.
2056 */
2057
2058static int /* O - Result of comparison */
2059compare_decompose(const void *k1, /* I - Key char */
2060 const void *k2) /* I - Map char */
2061{
2062 cups_utf32_t *kp = (cups_utf32_t *)k1;
2063 /* Key char pointer */
2064 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2065 cups_ucs2_t ch; /* Key char as UCS-2 */
2066 int result; /* Result Value */
2067
2068
2069 ch = (cups_ucs2_t) *kp;
2070
2071 if (ch >= *mp)
2072 result = (int) (ch - *mp);
2073 else
2074 result = -1 * ((int) (*mp - ch));
2075
2076 return (result);
2077}
2078
2079
2080/*
2081 * 'compare_foldchar()' - Compare key for case fold match.
2082 */
2083
2084static int /* O - Result of comparison */
2085compare_foldchar(const void *k1, /* I - Key char */
2086 const void *k2) /* I - Map char */
2087{
2088 cups_utf32_t *kp = (cups_utf32_t *)k1;
2089 /* Key char pointer */
2090 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2091 cups_ucs2_t ch; /* Key char as UCS-2 */
2092 int result; /* Result Value */
2093
2094
2095 ch = (cups_ucs2_t) *kp;
2096
2097 if (ch >= *mp)
2098 result = (int) (ch - *mp);
2099 else
2100 result = -1 * ((int) (*mp - ch));
2101
2102 return (result);
2103}
2104
2105
2106/*
2107 * 'compare_combchar()' - Compare key for combining char match.
2108 */
2109
2110static int /* O - Result of comparison */
2111compare_combchar(const void *k1, /* I - Key char */
2112 const void *k2) /* I - Map char */
2113{
2114 cups_utf32_t *kp = (cups_utf32_t *)k1;
2115 /* Key char pointer */
2116 _cups_comb_t *cp = (_cups_comb_t *)k2;/* Combining map row pointer */
2117 cups_ucs2_t ch; /* Key char as UCS-2 */
2118 int result; /* Result Value */
2119
2120
2121 ch = (cups_ucs2_t) *kp;
2122
2123 if (ch >= cp->ch)
2124 result = (int) (ch - cp->ch);
2125 else
2126 result = -1 * ((int) (cp->ch - ch));
2127
2128 return (result);
2129}
2130
2131
2132/*
2133 * 'compare_breakchar()' - Compare key for line break char match.
2134 */
2135
2136static int /* O - Result of comparison */
2137compare_breakchar(const void *k1, /* I - Key char */
2138 const void *k2) /* I - Map char */
2139{
2140 cups_utf32_t *kp = (cups_utf32_t *)k1;
2141 /* Key char pointer */
2142 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2143 cups_ucs2_t ch; /* Key char as UCS-2 */
2144 int result; /* Result Value */
2145
2146
2147 ch = (cups_ucs2_t) *kp;
2148
2149 if (ch < *mp)
2150 result = -1 * (int) (*mp - ch);
2151 else if (ch > *(mp + 1))
2152 result = (int) (ch - *(mp + 1));
2153 else
2154 result = 0;
2155
2156 return (result);
2157}
2158
2159
2160/*
2161 * 'compare_propchar()' - Compare key for property char match.
2162 */
2163
2164static int /* O - Result of comparison */
2165compare_propchar(const void *k1, /* I - Key char */
2166 const void *k2) /* I - Map char */
2167{
2168 cups_utf32_t *kp = (cups_utf32_t *)k1;
2169 /* Key char pointer */
2170 _cups_prop_t *pp = (_cups_prop_t *)k2;/* Property map row pointer */
2171 cups_ucs2_t ch; /* Key char as UCS-2 */
2172 int result; /* Result Value */
2173
2174
2175 ch = (cups_ucs2_t) *kp;
2176
2177 if (ch >= pp->ch)
2178 result = (int) (ch - pp->ch);
2179 else
2180 result = -1 * ((int) (pp->ch - ch));
2181
2182 return (result);
2183}
2184
2185
2186/*
fa73b229 2187 * End of "$Id: normalize.c 4967 2006-01-24 03:42:15Z mike $"
ef416fc2 2188 */