]> git.ipfire.org Git - thirdparty/cups.git/blob - cups/normalize.c
Load cups into easysw/current.
[thirdparty/cups.git] / cups / normalize.c
1 /*
2 * "$Id: normalize.c 4903 2006-01-10 20:02:46Z mike $"
3 *
4 * Unicode normalization for the Common UNIX Printing System (CUPS).
5 *
6 * Copyright 1997-2006 by Easy Software Products.
7 *
8 * These coded instructions, statements, and computer programs are
9 * the property of Easy Software Products and are protected by Federal
10 * copyright law. Distribution and use rights are outlined in the
11 * file "LICENSE.txt" which should have been included with this file.
12 * If this file is missing or damaged please contact Easy Software
13 * Products at:
14 *
15 * Attn: CUPS Licensing Information
16 * Easy Software Products
17 * 44141 Airport View Drive, Suite 204
18 * Hollywood, Maryland 20636 USA
19 *
20 * Voice: (301) 373-9600
21 * EMail: cups-info@cups.org
22 * WWW: http://www.cups.org
23 *
24 * Contents:
25 *
26 * cupsNormalizeMapsGet() - Get all norm maps to cache.
27 * cupsNormalizeMapsFree() - Free all norm maps in cache.
28 * cupsNormalizeMapsFlush() - Flush all norm maps in cache.
29 * cupsUTF8Normalize() - Normalize UTF-8 string.
30 * cupsUTF32Normalize() - Normalize UTF-32 string.
31 * cupsUTF8CaseFold() - Case fold UTF-8 string.
32 * cupsUTF32CaseFold() - Case fold UTF-32 string.
33 * cupsUTF8CompareCaseless() - Compare case folded UTF-8 strings.
34 * cupsUTF32CompareCaseless() - Compare case folded UTF-32 strings.
35 * cupsUTF8CompareIdentifier() - Compare folded NFKC UTF-8 strings.
36 * cupsUTF32CompareIdentifier() - Compare folded NFKC UTF-32 strings.
37 * cupsUTF32CharacterProperty() - Get UTF-32 character property.
38 * get_general_category() - Get UTF-32 Char General Category.
39 * get_bidi_category() - Get UTF-32 Char Bidi Category.
40 * get_combining_class() - Get UTF-32 Char Combining Class.
41 * get_break_class() - Get UTF-32 Char Line Break Class.
42 * get_map_count() - Count lines in a map file.
43 * get_normmap() - Get Unicode norm map to cache.
44 * get_foldmap() - Get Unicode casefold map to cache.
45 * get_propmap() - Get Unicode property map to cache.
46 * get_combmap() - Get Unicode combining map to cache.
47 * get_breakmap() - Get Unicode break map to cache.
48 * compare_compose() - Compare key for compose match.
49 * compare_decompose() - Compare key for decompose match.
50 * compare_foldchar() - Compare key for case fold match.
51 * compare_combchar() - Compare key for combining match.
52 * compare_breakchar() - Compare key for line break match.
53 * compare_propchar() - Compare key for property char match.
54 */
55
56 /*
57 * Include necessary headers...
58 */
59
60 #include "globals.h"
61 #include "debug.h"
62 #include <stdlib.h>
63 #include <errno.h>
64 #include <time.h>
65
66
67 typedef struct /**** General Category Index Struct****/
68 {
69 cups_gencat_t gencat; /* General Category Value */
70 const char *str; /* General Category String */
71 } gencat_t;
72
73 static const gencat_t gencat_index[] = /* General Category Index */
74 {
75 { CUPS_GENCAT_LU, "Lu" }, /* Letter, Uppercase */
76 { CUPS_GENCAT_LL, "Ll" }, /* Letter, Lowercase */
77 { CUPS_GENCAT_LT, "Lt" }, /* Letter, Titlecase */
78 { CUPS_GENCAT_LM, "Lm" }, /* Letter, Modifier */
79 { CUPS_GENCAT_LO, "Lo" }, /* Letter, Other */
80 { CUPS_GENCAT_MN, "Mn" }, /* Mark, Non-Spacing */
81 { CUPS_GENCAT_MC, "Mc" }, /* Mark, Spacing Combining */
82 { CUPS_GENCAT_ME, "Me" }, /* Mark, Enclosing */
83 { CUPS_GENCAT_ND, "Nd" }, /* Number, Decimal Digit */
84 { CUPS_GENCAT_NL, "Nl" }, /* Number, Letter */
85 { CUPS_GENCAT_NO, "No" }, /* Number, Other */
86 { CUPS_GENCAT_PC, "Pc" }, /* Punctuation, Connector */
87 { CUPS_GENCAT_PD, "Pd" }, /* Punctuation, Dash */
88 { CUPS_GENCAT_PS, "Ps" }, /* Punctuation, Open (start) */
89 { CUPS_GENCAT_PE, "Pe" }, /* Punctuation, Close (end) */
90 { CUPS_GENCAT_PI, "Pi" }, /* Punctuation, Initial Quote */
91 { CUPS_GENCAT_PF, "Pf" }, /* Punctuation, Final Quote */
92 { CUPS_GENCAT_PO, "Po" }, /* Punctuation, Other */
93 { CUPS_GENCAT_SM, "Sm" }, /* Symbol, Math */
94 { CUPS_GENCAT_SC, "Sc" }, /* Symbol, Currency */
95 { CUPS_GENCAT_SK, "Sk" }, /* Symbol, Modifier */
96 { CUPS_GENCAT_SO, "So" }, /* Symbol, Other */
97 { CUPS_GENCAT_ZS, "Zs" }, /* Separator, Space */
98 { CUPS_GENCAT_ZL, "Zl" }, /* Separator, Line */
99 { CUPS_GENCAT_ZP, "Zp" }, /* Separator, Paragraph */
100 { CUPS_GENCAT_CC, "Cc" }, /* Other, Control */
101 { CUPS_GENCAT_CF, "Cf" }, /* Other, Format */
102 { CUPS_GENCAT_CS, "Cs" }, /* Other, Surrogate */
103 { CUPS_GENCAT_CO, "Co" }, /* Other, Private Use */
104 { CUPS_GENCAT_CN, "Cn" }, /* Other, Not Assigned */
105 { 0, NULL }
106 };
107
/*
 * Bidi Category abbreviation strings, terminated by a NULL entry.
 */

static const char * const bidicat_index[] =
{
  /* Strong left-to-right types */
  "L",                  /* Left-to-right (alphabetic, syllabic, ideographic) */
  "LRE",                /* Left-to-right embedding (explicit) */
  "LRO",                /* Left-to-right override (explicit) */

  /* Strong right-to-left types */
  "R",                  /* Right-to-left (Hebrew alphabet, most punctuation) */
  "AL",                 /* Right-to-left Arabic (Arabic, Thaana, Syriac) */
  "RLE",                /* Right-to-left embedding (explicit) */
  "RLO",                /* Right-to-left override (explicit) */

  "PDF",                /* Pop directional format */

  /* Number-related types */
  "EN",                 /* European number (European, East Arabic-Indic) */
  "ES",                 /* European number separator (slash) */
  "ET",                 /* European number terminator (plus, minus, degree) */
  "AN",                 /* Arabic number (Arabic-Indic digits, separators) */
  "CS",                 /* Common number separator (colon, comma, dot) */

  /* Marks, separators, and neutrals */
  "NSM",                /* Non-spacing mark (category Mn / Me in UCD) */
  "BN",                 /* Boundary neutral (formatting / control chars) */
  "B",                  /* Paragraph separator */
  "S",                  /* Segment separator (tab) */
  "WS",                 /* Whitespace (space, line separator, ...) */
  "ON",                 /* Other neutrals */

  NULL                  /* End of list */
};
132
133 typedef struct /**** Line Break Class Index Struct****/
134 {
135 cups_break_class_t breakclass; /* Line Break Class Value */
136 const char *str; /* Line Break Class String */
137 } _cups_break_t;
138
139 static const _cups_break_t break_index[] = /* Line Break Class Index */
140 {
141 { CUPS_BREAK_AI, "AI" }, /* Ambiguous (Alphabetic or Ideograph) */
142 { CUPS_BREAK_AL, "AL" }, /* Ordinary Alpha/Symbol Chars (XP) */
143 { CUPS_BREAK_BA, "BA" }, /* Break Opportunity After Chars (A) */
144 { CUPS_BREAK_BB, "BB" }, /* Break Opportunities Before Chars (B) */
145 { CUPS_BREAK_B2, "B2" }, /* Break Opportunity Either (B/A/XP) */
146 { CUPS_BREAK_BK, "BK" }, /* Mandatory Break (A) (norm) */
147 { CUPS_BREAK_CB, "CB" }, /* Contingent Break (B/A) (norm) */
148 { CUPS_BREAK_CL, "CL" }, /* Closing Punctuation (XB) */
149 { CUPS_BREAK_CM, "CM" }, /* Attached/Combining (XB) (norm) */
150 { CUPS_BREAK_CR, "CR" }, /* Carriage Return (A) (norm) */
151 { CUPS_BREAK_EX, "EX" }, /* Exclamation / Interrogation (XB) */
152 { CUPS_BREAK_GL, "GL" }, /* Non-breaking ("Glue") (XB/XA) (norm) */
153 { CUPS_BREAK_HY, "HY" }, /* Hyphen (XA) */
154 { CUPS_BREAK_ID, "ID" }, /* Ideographic (B/A) */
155 { CUPS_BREAK_IN, "IN" }, /* Inseparable chars (XP) */
156 { CUPS_BREAK_IS, "IS" }, /* Numeric Separator (Infix) (XB) */
157 { CUPS_BREAK_LF, "LF" }, /* Line Feed (A) (norm) */
158 { CUPS_BREAK_NS, "NS" }, /* Non-starters (XB) */
159 { CUPS_BREAK_NU, "NU" }, /* Numeric (XP) */
160 { CUPS_BREAK_OP, "OP" }, /* Opening Punctuation (XA) */
161 { CUPS_BREAK_PO, "PO" }, /* Postfix (Numeric) (XB) */
162 { CUPS_BREAK_PR, "PR" }, /* Prefix (Numeric) (XA) */
163 { CUPS_BREAK_QU, "QU" }, /* Ambiguous Quotation (XB/XA) */
164 { CUPS_BREAK_SA, "SA" }, /* Context Dependent (SE Asian) (P) */
165 { CUPS_BREAK_SG, "SG" }, /* Surrogates (XP) (norm) */
166 { CUPS_BREAK_SP, "SP" }, /* Space (A) (norm) */
167 { CUPS_BREAK_SY, "SY" }, /* Symbols Allowing Break After (A) */
168 { CUPS_BREAK_XX, "XX" }, /* Unknown (XP) */
169 { CUPS_BREAK_ZW, "ZW" }, /* Zero Width Space (A) (norm) */
170 { 0, NULL }
171 };
172
173 /*
174 * Prototypes...
175 */
176
177 static int compare_breakchar(const void *k1, const void *k2);
178 static int compare_combchar(const void *k1, const void *k2);
179 static int compare_compose(const void *k1, const void *k2);
180 static int compare_decompose(const void *k1, const void *k2);
181 static int compare_foldchar(const void *k1, const void *k2);
182 static int compare_propchar(const void *k1, const void *k2);
183 static int get_bidi_category(const cups_utf32_t ch);
184 static int get_break_class(const cups_utf32_t ch);
185 static int get_breakmap(void);
186 static int get_combining_class(const cups_utf32_t ch);
187 static int get_combmap(void);
188 static int get_foldmap(const cups_folding_t fold);
189 static int get_general_category(const cups_utf32_t ch);
190 static int get_map_count(const char *filename);
191 static int get_normmap(const cups_normalize_t normalize);
192 static int get_propmap(void);
193
194
195 /*
196 * 'cupsNormalizeMapsGet()' - Get all normalization maps to cache.
197 */
198
199 int /* O - Zero or -1 on error */
200 cupsNormalizeMapsGet(void)
201 {
202 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
203 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
204 _cups_globals_t *cg = _cupsGlobals();
205 /* Pointer to library globals */
206
207
208 /*
209 * See if we already have normalization maps loaded...
210 */
211
212 if (cg->normmap_cache)
213 {
214 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
215 nmap->used ++;
216
217 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
218 fmap->used ++;
219
220 if (cg->combmap_cache)
221 cg->combmap_cache->used ++;
222
223 if (cg->propmap_cache)
224 cg->propmap_cache->used ++;
225
226 if (cg->breakmap_cache)
227 cg->breakmap_cache->used ++;
228
229 return (0);
230 }
231
232 /*
233 * Get normalization maps...
234 */
235
236 if (get_normmap(CUPS_NORM_NFD) < 0)
237 return (-1);
238
239 if (get_normmap(CUPS_NORM_NFKD) < 0)
240 return (-1);
241
242 if (get_normmap(CUPS_NORM_NFC) < 0)
243 return (-1);
244
245 /*
246 * Get case folding, combining class, character property maps...
247 */
248
249 if (get_foldmap(CUPS_FOLD_SIMPLE) < 0)
250 return (-1);
251
252 if (get_foldmap(CUPS_FOLD_FULL) < 0)
253 return (-1);
254
255 if (get_propmap() < 0)
256 return (-1);
257
258 if (get_combmap() < 0)
259 return (-1);
260
261 if (get_breakmap() < 0)
262 return (-1);
263
264 return (0);
265 }
266
267
268 /*
269 * 'cupsNormalizeMapsFree()' - Free all normalization maps in cache.
270 *
271 * This does not actually free; use 'cupsNormalizeMapsFlush()' for that.
272 */
273
274 int /* O - Zero or -1 on error */
275 cupsNormalizeMapsFree(void)
276 {
277 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
278 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
279 _cups_globals_t *cg = _cupsGlobals();
280 /* Pointer to library globals */
281
282
283 /*
284 * See if we already have normalization maps loaded...
285 */
286
287 if (cg->normmap_cache == NULL)
288 return (-1);
289
290 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
291 if (nmap->used > 0)
292 nmap->used --;
293
294 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
295 if (fmap->used > 0)
296 fmap->used --;
297
298 if (cg->propmap_cache && (cg->propmap_cache->used > 0))
299 cg->propmap_cache->used --;
300
301 if (cg->combmap_cache && (cg->combmap_cache->used > 0))
302 cg->combmap_cache->used --;
303
304 if (cg->breakmap_cache && (cg->breakmap_cache->used > 0))
305 cg->breakmap_cache->used --;
306
307 return (0);
308 }
309
310
311 /*
312 * 'cupsNormalizeMapsFlush()' - Flush all normalization maps in cache.
313 */
314
315 void
316 cupsNormalizeMapsFlush(void)
317 {
318 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
319 _cups_norm_map_t *nextnorm; /* Next Unicode Normalization Map */
320 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
321 _cups_fold_map_t *nextfold; /* Next Unicode Case Folding Map */
322 _cups_globals_t *cg = _cupsGlobals();
323 /* Pointer to library globals */
324
325
326 /*
327 * Flush all normalization maps...
328 */
329
330 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nextnorm)
331 {
332 free(nmap->uni2norm);
333 nextnorm = nmap->next;
334 free(nmap);
335 }
336
337 cg->normmap_cache = NULL;
338
339 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = nextfold)
340 {
341 free(fmap->uni2fold);
342 nextfold = fmap->next;
343 free(fmap);
344 }
345
346 cg->foldmap_cache = NULL;
347
348 if (cg->propmap_cache)
349 {
350 free(cg->propmap_cache->uni2prop);
351 free(cg->propmap_cache);
352 cg->propmap_cache = NULL;
353 }
354
355 if (cg->combmap_cache)
356 {
357 free(cg->combmap_cache->uni2comb);
358 free(cg->combmap_cache);
359 cg->combmap_cache = NULL;
360 }
361
362 if (cg->breakmap_cache)
363 {
364 free(cg->breakmap_cache->uni2break);
365 free(cg->breakmap_cache);
366 cg->breakmap_cache = NULL;
367 }
368 }
369
370
371 /*
372 * 'cupsUTF8Normalize()' - Normalize UTF-8 string.
373 *
374 * Normalize UTF-8 string to Unicode UAX-15 Normalization Form
375 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
376 * unsafe for subsequent transcoding to legacy charsets
377 */
378
379 int /* O - Count or -1 on error */
380 cupsUTF8Normalize(
381 cups_utf8_t *dest, /* O - Target string */
382 const cups_utf8_t *src, /* I - Source string */
383 const int maxout, /* I - Max output */
384 const cups_normalize_t normalize) /* I - Normalization */
385 {
386 int len; /* String length */
387 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
388 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
389
390
391 /*
392 * Check for valid arguments and clear output...
393 */
394
395 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
396 return (-1);
397
398 *dest = 0;
399
400 /*
401 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
402 */
403
404 len = cupsUTF8ToUTF32(work1, src, CUPS_MAX_USTRING);
405
406 if (len < 0)
407 return (-1);
408
409 /*
410 * Normalize internal UCS-4 to second internal UCS-4...
411 */
412
413 len = cupsUTF32Normalize(work2, work1, CUPS_MAX_USTRING, normalize);
414
415 if (len < 0)
416 return (-1);
417
418 /*
419 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
420 */
421
422 len = cupsUTF32ToUTF8(dest, work2, maxout);
423
424 return (len);
425 }
426
427
428 /*
429 * 'cupsUTF32Normalize()' - Normalize UTF-32 string.
430 *
431 * Normalize UTF-32 string to Unicode UAX-15 Normalization Form
432 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
433 * unsafe for subsequent transcoding to legacy charsets
434 */
435
436 int /* O - Count or -1 on error */
437 cupsUTF32Normalize(
438 cups_utf32_t *dest, /* O - Target string */
439 const cups_utf32_t *src, /* I - Source string */
440 const int maxout, /* I - Max output */
441 const cups_normalize_t normalize) /* I - Normalization */
442 {
443 int i; /* Looping variable */
444 int result; /* Result Value */
445 cups_ucs2_t *mp; /* Map char pointer */
446 int pass; /* Pass count for each transform */
447 int hit; /* Hit count from binary search */
448 cups_utf32_t unichar1; /* Unicode character value */
449 cups_utf32_t unichar2; /* Unicode character value */
450 _cups_comb_class_t class1; /* First Combining Class */
451 _cups_comb_class_t class2; /* Second Combining Class */
452 int len; /* String length */
453 cups_utf32_t work1[CUPS_MAX_USTRING];
454 /* First internal UCS-4 string */
455 cups_utf32_t work2[CUPS_MAX_USTRING];
456 /* Second internal UCS-4 string */
457 cups_utf32_t *p1; /* First UCS-4 string pointer */
458 cups_utf32_t *p2; /* Second UCS-4 string pointer */
459 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
460 cups_normalize_t decompose; /* Decomposition Type */
461 _cups_globals_t *cg = _cupsGlobals();
462 /* Pointer to library globals */
463
464
465 /*
466 * Check for valid arguments and clear output...
467 */
468
469 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
470 return (-1);
471
472 *dest = 0;
473
474 result = cupsNormalizeMapsGet();
475
476 if (result < 0)
477 return (-1);
478
479 /*
480 * Find decomposition map...
481 */
482
483 switch (normalize)
484 {
485 case CUPS_NORM_NFD:
486 case CUPS_NORM_NFC:
487 decompose = CUPS_NORM_NFD;
488 break;
489
490 case CUPS_NORM_NFKD:
491 case CUPS_NORM_NFKC:
492 decompose = CUPS_NORM_NFKD;
493 break;
494
495 default:
496 return (-1);
497 }
498
499 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
500 if (nmap->normalize == decompose)
501 break;
502
503 if (nmap == NULL)
504 return (-1);
505
506 /*
507 * Copy input to internal buffer...
508 */
509
510 p1 = &work1[0];
511
512 for (i = 0; i < CUPS_MAX_USTRING; i ++)
513 {
514 if (*src == 0)
515 break;
516
517 *p1 ++ = *src ++;
518 }
519
520 *p1 = 0;
521 len = i;
522
523 /*
524 * Decompose until no further decomposition...
525 */
526
527 for (pass = 0; pass < 20; pass ++)
528 {
529 p1 = &work1[0];
530 p2 = &work2[0];
531
532 for (hit = 0; *p1 != 0; p1 ++)
533 {
534 /*
535 * Check for decomposition defined...
536 */
537
538 mp = (cups_ucs2_t *)bsearch(p1, nmap->uni2norm, nmap->normcount,
539 (sizeof(cups_ucs2_t) * 3), compare_decompose);
540 if (mp == NULL)
541 {
542 *p2 ++ = *p1;
543 continue;
544 }
545
546 /*
547 * Decompose input character to one or two output characters...
548 */
549
550 hit ++;
551 mp ++;
552 *p2 ++ = (cups_utf32_t) *mp ++;
553
554 if (*mp != 0)
555 *p2 ++ = (cups_utf32_t) *mp;
556 }
557
558 *p2 = 0;
559 len = (int)(p2 - &work2[0]);
560
561 /*
562 * Check for decomposition finished...
563 */
564 if (hit == 0)
565 break;
566 memcpy (work1, work2, sizeof(cups_utf32_t) * (len + 1));
567 }
568
569 /*
570 * Canonical reorder until no further reordering...
571 */
572
573 for (pass = 0; pass < 20; pass ++)
574 {
575 p1 = &work1[0];
576
577 for (hit = 0; *p1 != 0; p1 ++)
578 {
579 /*
580 * Check for combining characters to reorder...
581 */
582
583 unichar1 = *p1;
584 unichar2 = *(p1 + 1);
585
586 if (unichar2 == 0)
587 break;
588
589 class1 = get_combining_class(unichar1);
590 class2 = get_combining_class(unichar2);
591
592 if ((class1 < 0) || (class2 < 0))
593 return (-1);
594
595 if ((class1 == 0) || (class2 == 0))
596 continue;
597
598 if (class1 <= class2)
599 continue;
600
601 /*
602 * Swap two combining characters...
603 */
604
605 *p1 = unichar2;
606 p1 ++;
607 *p1 = unichar1;
608 hit ++;
609 }
610
611 if (hit == 0)
612 break;
613 }
614
615 /*
616 * Check for decomposition only...
617 */
618
619 if (normalize == CUPS_NORM_NFD || normalize == CUPS_NORM_NFKD)
620 {
621 memcpy(dest, work1, sizeof(cups_utf32_t) * (len + 1));
622 return (len);
623 }
624
625 /*
626 * Find composition map...
627 */
628
629 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
630 if (nmap->normalize == CUPS_NORM_NFC)
631 break;
632
633 if (nmap == NULL)
634 return (-1);
635
636 /*
637 * Compose until no further composition...
638 */
639
640 for (pass = 0; pass < 20; pass ++)
641 {
642 p1 = &work1[0];
643 p2 = &work2[0];
644
645 for (hit = 0; *p1 != 0; p1 ++)
646 {
647 /*
648 * Check for composition defined...
649 */
650
651 unichar1 = *p1;
652 unichar2 = *(p1 + 1);
653
654 if (unichar2 == 0)
655 {
656 *p2 ++ = unichar1;
657 break;
658 }
659
660 mp = (cups_ucs2_t *)bsearch(p1, nmap->uni2norm, nmap->normcount,
661 (sizeof(cups_ucs2_t) * 3), compare_compose);
662 if (mp == NULL)
663 {
664 *p2 ++ = *p1;
665 continue;
666 }
667
668 /*
669 * Compose two input characters to one output character...
670 */
671
672 hit ++;
673 mp += 2;
674 *p2 ++ = (cups_utf32_t) *mp;
675 p1 ++;
676 }
677
678 *p2 = 0;
679 len = (int) (p2 - &work2[0]);
680
681 /*
682 * Check for composition finished...
683 */
684
685 if (hit == 0)
686 break;
687
688 memcpy (work1, work2, sizeof(cups_utf32_t) * (len + 1));
689 }
690
691 memcpy (dest, work1, sizeof(cups_utf32_t) * (len + 1));
692
693 cupsNormalizeMapsFree();
694
695 return (len);
696 }
697
698
699 /*
700 * 'cupsUTF8CaseFold()' - Case fold UTF-8 string.
701 *
702 * Case Fold UTF-8 string per Unicode UAX-21 Section 2.3
703 * Note - Case folding output is
704 * unsafe for subsequent transcoding to legacy charsets
705 */
706
707 int /* O - Count or -1 on error */
708 cupsUTF8CaseFold(
709 cups_utf8_t *dest, /* O - Target string */
710 const cups_utf8_t *src, /* I - Source string */
711 const int maxout, /* I - Max output */
712 const cups_folding_t fold) /* I - Fold Mode */
713 {
714 int len; /* String length */
715 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
716 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
717
718
719 /*
720 * Check for valid arguments and clear output...
721 */
722
723 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
724 return (-1);
725
726 *dest = 0;
727
728 if (fold != CUPS_FOLD_SIMPLE && fold != CUPS_FOLD_FULL)
729 return (-1);
730
731 /*
732 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
733 */
734
735 len = cupsUTF8ToUTF32(work1, src, CUPS_MAX_USTRING);
736
737 if (len < 0)
738 return (-1);
739
740 /*
741 * Case Fold internal UCS-4 to second internal UCS-4...
742 */
743
744 len = cupsUTF32CaseFold(work2, work1, CUPS_MAX_USTRING, fold);
745
746 if (len < 0)
747 return (-1);
748
749 /*
750 * Convert internal UCS-4 to output UTF-8 (and delete BOM)...
751 */
752
753 len = cupsUTF32ToUTF8(dest, work2, maxout);
754
755 return (len);
756 }
757
758
759 /*
760 * 'cupsUTF32CaseFold()' - Case fold UTF-32 string.
761 *
762 * Case Fold UTF-32 string per Unicode UAX-21 Section 2.3
763 * Note - Case folding output is
764 * unsafe for subsequent transcoding to legacy charsets
765 */
766
767 int /* O - Count or -1 on error */
768 cupsUTF32CaseFold(
769 cups_utf32_t *dest, /* O - Target string */
770 const cups_utf32_t *src, /* I - Source string */
771 const int maxout, /* I - Max output */
772 const cups_folding_t fold) /* I - Fold Mode */
773 {
774 cups_utf32_t *start = dest; /* Start of destination string */
775 int i; /* Looping variable */
776 int result; /* Result Value */
777 cups_ucs2_t *mp; /* Map char pointer */
778 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
779 _cups_globals_t *cg = _cupsGlobals();
780 /* Pointer to library globals */
781
782
783 /*
784 * Check for valid arguments and clear output...
785 */
786
787 if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
788 return (-1);
789
790 *dest = 0;
791
792 if (fold != CUPS_FOLD_SIMPLE && fold != CUPS_FOLD_FULL)
793 return (-1);
794
795 /*
796 * Find case folding map...
797 */
798
799 result = cupsNormalizeMapsGet();
800
801 if (result < 0)
802 return (-1);
803
804 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
805 if (fmap->fold == fold)
806 break;
807
808 if (fmap == NULL)
809 return (-1);
810
811 /*
812 * Case fold input string to output string...
813 */
814
815 for (i = 0; i < (maxout - 1); i ++, src ++)
816 {
817 /*
818 * Check for case folding defined...
819 */
820
821 mp = (cups_ucs2_t *)bsearch(src, fmap->uni2fold, fmap->foldcount,
822 (sizeof(cups_ucs2_t) * 4), compare_foldchar);
823 if (mp == NULL)
824 {
825 *dest ++ = *src;
826 continue;
827 }
828
829 /*
830 * Case fold input character to one or two output characters...
831 */
832
833 mp ++;
834 *dest ++ = (cups_utf32_t) *mp ++;
835
836 if (*mp != 0 && fold == CUPS_FOLD_FULL)
837 {
838 i ++;
839 if (i >= (maxout - 1))
840 break;
841
842 *dest ++ = (cups_utf32_t) *mp;
843 }
844 }
845
846 *dest = 0;
847
848 cupsNormalizeMapsFree();
849
850 return ((int)(dest - start));
851 }
852
853
854 /*
855 * 'cupsUTF8CompareCaseless()' - Compare case folded UTF-8 strings.
856 */
857
858 int /* O - Difference of strings */
859 cupsUTF8CompareCaseless(
860 const cups_utf8_t *s1, /* I - String1 */
861 const cups_utf8_t *s2) /* I - String2 */
862 {
863 int difference; /* Difference of two strings */
864 int len; /* String length */
865 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
866 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
867
868
869 /*
870 * Check for valid arguments...
871 */
872
873 if (!s1 || !s2)
874 return (-1);
875
876 /*
877 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
878 */
879
880 len = cupsUTF8ToUTF32(work1, s1, CUPS_MAX_USTRING);
881
882 if (len < 0)
883 return (-1);
884
885 len = cupsUTF8ToUTF32(work2, s2, CUPS_MAX_USTRING);
886
887 if (len < 0)
888 return (-1);
889
890 /*
891 * Compare first internal UCS-4 to second internal UCS-4...
892 */
893
894 difference = cupsUTF32CompareCaseless(work1, work2);
895
896 return (difference);
897 }
898
899
900 /*
901 * 'cupsUTF32CompareCaseless()' - Compare case folded UTF-32 strings.
902 */
903
904 int /* O - Difference of strings */
905 cupsUTF32CompareCaseless(
906 const cups_utf32_t *s1, /* I - String1 */
907 const cups_utf32_t *s2) /* I - String2 */
908 {
909 int difference; /* Difference of two strings */
910 int len; /* String length */
911 cups_folding_t fold = CUPS_FOLD_FULL;
912 /* Case folding mode */
913 cups_utf32_t fold1[CUPS_MAX_USTRING];
914 /* First UCS-4 folded string */
915 cups_utf32_t fold2[CUPS_MAX_USTRING];
916 /* Second UCS-4 folded string */
917 cups_utf32_t *p1; /* First UCS-4 string pointer */
918 cups_utf32_t *p2; /* Second UCS-4 string pointer */
919
920
921 /*
922 * Check for valid arguments...
923 */
924
925 if (!s1 || !s2)
926 return (-1);
927
928 /*
929 * Case Fold input UTF-32 strings to internal UCS-4 strings...
930 */
931
932 len = cupsUTF32CaseFold(fold1, s1, CUPS_MAX_USTRING, fold);
933
934 if (len < 0)
935 return (-1);
936
937 len = cupsUTF32CaseFold(fold2, s2, CUPS_MAX_USTRING, fold);
938
939 if (len < 0)
940 return (-1);
941
942 /*
943 * Compare first internal UCS-4 to second internal UCS-4...
944 */
945
946 p1 = &fold1[0];
947 p2 = &fold2[0];
948
949 for (;; p1 ++, p2 ++)
950 {
951 difference = (int) (*p1 - *p2);
952
953 if (difference != 0)
954 break;
955
956 if ((*p1 == 0) && (*p2 == 0))
957 break;
958 }
959
960 return (difference);
961 }
962
963
964 /*
965 * 'cupsUTF8CompareIdentifier()' - Compare folded NFKC UTF-8 strings.
966 */
967
968 int /* O - Result of comparison */
969 cupsUTF8CompareIdentifier(
970 const cups_utf8_t *s1, /* I - String1 */
971 const cups_utf8_t *s2) /* I - String2 */
972 {
973 int difference; /* Difference of two strings */
974 int len; /* String length */
975 cups_utf32_t work1[CUPS_MAX_USTRING];/* First internal UCS-4 string */
976 cups_utf32_t work2[CUPS_MAX_USTRING];/* Second internal UCS-4 string */
977
978
979 /*
980 * Check for valid arguments...
981 */
982
983 if (!s1 || !s2)
984 return (-1);
985
986 /*
987 * Convert input UTF-8 to internal UCS-4 (and insert BOM)...
988 */
989
990 len = cupsUTF8ToUTF32(work1, s1, CUPS_MAX_USTRING);
991
992 if (len < 0)
993 return (-1);
994
995 len = cupsUTF8ToUTF32(work2, s2, CUPS_MAX_USTRING);
996
997 if (len < 0)
998 return (-1);
999
1000 /*
1001 * Compare first internal UCS-4 to second internal UCS-4...
1002 */
1003
1004 difference = cupsUTF32CompareIdentifier(work1, work2);
1005
1006 return (difference);
1007 }
1008
1009
1010 /*
1011 * 'cupsUTF32CompareIdentifier()' - Compare folded NFKC UTF-32 strings.
1012 */
1013
1014 int /* O - Result of comparison */
1015 cupsUTF32CompareIdentifier(
1016 const cups_utf32_t *s1, /* I - String1 */
1017 const cups_utf32_t *s2) /* I - String2 */
1018 {
1019 int difference; /* Difference of two strings */
1020 int len; /* String length */
1021 cups_folding_t fold = CUPS_FOLD_FULL;
1022 /* Case folding mode */
1023 cups_utf32_t fold1[CUPS_MAX_USTRING];
1024 /* First UCS-4 folded string */
1025 cups_utf32_t fold2[CUPS_MAX_USTRING];
1026 /* Second UCS-4 folded string */
1027 cups_normalize_t normalize = CUPS_NORM_NFKC;
1028 /* Normalization form */
1029 cups_utf32_t norm1[CUPS_MAX_USTRING];
1030 /* First UCS-4 normalized string */
1031 cups_utf32_t norm2[CUPS_MAX_USTRING];
1032 /* Second UCS-4 normalized string */
1033 cups_utf32_t *p1; /* First UCS-4 string pointer */
1034 cups_utf32_t *p2; /* Second UCS-4 string pointer */
1035
1036
1037 /*
1038 * Check for valid arguments...
1039 */
1040
1041 if (!s1 || !s2)
1042 return (-1);
1043
1044 /*
1045 * Case Fold input UTF-32 strings to internal UCS-4 strings...
1046 */
1047
1048 len = cupsUTF32CaseFold(fold1, s1, CUPS_MAX_USTRING, fold);
1049
1050 if (len < 0)
1051 return (-1);
1052
1053 len = cupsUTF32CaseFold(fold2, s2, CUPS_MAX_USTRING, fold);
1054
1055 if (len < 0)
1056 return (-1);
1057
1058 /*
1059 * Normalize internal UCS-4 strings to NFKC...
1060 */
1061
1062 len = cupsUTF32Normalize(norm1, fold1, CUPS_MAX_USTRING, normalize);
1063
1064 if (len < 0)
1065 return (-1);
1066
1067 len = cupsUTF32Normalize(norm2, fold2, CUPS_MAX_USTRING, normalize);
1068
1069 if (len < 0)
1070 return (-1);
1071
1072 /*
1073 * Compare first internal UCS-4 to second internal UCS-4...
1074 */
1075
1076 p1 = &norm1[0];
1077 p2 = &norm2[0];
1078
1079 for (;; p1 ++, p2 ++)
1080 {
1081 difference = (int) (*p1 - *p2);
1082
1083 if (difference != 0)
1084 break;
1085
1086 if ((*p1 == 0) && (*p2 == 0))
1087 break;
1088 }
1089
1090 return (difference);
1091 }
1092
1093
1094 /*
1095 * 'cupsUTF32CharacterProperty()' - Get UTF-32 character property.
1096 */
1097
1098 int /* O - Result of comparison */
1099 cupsUTF32CharacterProperty(
1100 const cups_utf32_t ch, /* I - Source char */
1101 const cups_property_t prop) /* I - Char Property */
1102 {
1103 int result; /* Result Value */
1104
1105
1106 /*
1107 * Check for valid arguments...
1108 */
1109
1110 if (ch == 0)
1111 return (-1);
1112
1113 /*
1114 * Find character property...
1115 */
1116
1117 switch (prop)
1118 {
1119 case CUPS_PROP_GENERAL_CATEGORY:
1120 result = (get_general_category(ch));
1121 break;
1122
1123 case CUPS_PROP_BIDI_CATEGORY:
1124 result = (get_bidi_category(ch));
1125 break;
1126
1127 case CUPS_PROP_COMBINING_CLASS:
1128 result = (get_combining_class(ch));
1129 break;
1130 case CUPS_PROP_BREAK_CLASS:
1131 result = (get_break_class(ch));
1132 break;
1133
1134 default:
1135 return (-1);
1136 }
1137
1138 return (result);
1139 }
1140
1141
1142 /*
1143 * 'get_general_category()' - Get UTF-32 Character General Category.
1144 */
1145
1146 static int /* O - Class or -1 on error */
1147 get_general_category(
1148 const cups_utf32_t ch) /* I - Source char */
1149 {
1150 int result; /* Result Value */
1151 cups_gencat_t gencat; /* General Category Value */
1152 _cups_prop_map_t *pmap; /* Unicode Property Map */
1153 _cups_prop_t *uni2prop; /* Unicode Char -> Properties */
1154 _cups_globals_t *cg = _cupsGlobals();
1155 /* Pointer to library globals */
1156
1157
1158 /*
1159 * Check for valid argument...
1160 */
1161
1162 if (ch == 0)
1163 return (-1);
1164
1165 /*
1166 * Find property map...
1167 */
1168
1169 result = cupsNormalizeMapsGet();
1170
1171 if (result < 0)
1172 return (-1);
1173
1174 pmap = cg->propmap_cache;
1175
1176 if (pmap == NULL)
1177 return (-1);
1178
1179 /*
1180 * Find character in map...
1181 */
1182
1183 uni2prop = (_cups_prop_t *)bsearch(&ch, pmap->uni2prop, pmap->propcount,
1184 (sizeof(_cups_prop_t)), compare_propchar);
1185
1186 cupsNormalizeMapsFree();
1187
1188 if (uni2prop == NULL)
1189 gencat = CUPS_GENCAT_CN; /* Other, Not Assigned */
1190 else
1191 gencat = (cups_gencat_t)uni2prop->gencat;
1192
1193 result = (int)gencat;
1194
1195 return (result);
1196 }
1197
1198
1199 /*
1200 * 'get_bidi_category()' - Get UTF-32 Character Bidi Category.
1201 */
1202
1203 static int /* O - Class or -1 on error */
1204 get_bidi_category(const cups_utf32_t ch)/* I - Source char */
1205 {
1206 int result; /* Result Value */
1207 cups_bidi_t bidicat; /* Bidi Category Value */
1208 _cups_prop_map_t *pmap; /* Unicode Property Map */
1209 _cups_prop_t *uni2prop; /* Unicode Char -> Properties */
1210 _cups_globals_t *cg = _cupsGlobals();
1211 /* Pointer to library globals */
1212
1213
1214 /*
1215 * Check for valid argument...
1216 */
1217
1218 if (ch == 0)
1219 return (-1);
1220
1221 /*
1222 * Find property map...
1223 */
1224
1225 result = cupsNormalizeMapsGet();
1226
1227 if (result < 0)
1228 return (-1);
1229
1230 pmap = cg->propmap_cache;
1231
1232 if (pmap == NULL)
1233 return (-1);
1234
1235 /*
1236 * Find character in map...
1237 */
1238
1239 uni2prop = (_cups_prop_t *)bsearch(&ch, pmap->uni2prop, pmap->propcount,
1240 (sizeof(_cups_prop_t)), compare_propchar);
1241
1242 cupsNormalizeMapsFree();
1243
1244 if (uni2prop == NULL)
1245 bidicat = CUPS_BIDI_ON; /* Other Neutral */
1246 else
1247 bidicat = (cups_bidi_t)uni2prop->bidicat;
1248
1249 result = (int)bidicat;
1250
1251 return (result);
1252 }
1253
1254 /*
1255 * 'get_combining_class()' - Get UTF-32 Character Combining Class.
1256 *
1257 * Note - Zero is non-combining (base character)
1258 */
1259
1260 static int /* O - Class or -1 on error */
1261 get_combining_class(
1262 const cups_utf32_t ch) /* I - Source char */
1263 {
1264 int result; /* Result Value */
1265 _cups_comb_map_t *cmap; /* Unicode Combining Class Map */
1266 _cups_comb_class_t combclass; /* Unicode Combining Class */
1267 _cups_comb_t *uni2comb; /* Unicode Char -> Combining Class */
1268 _cups_globals_t *cg = _cupsGlobals();
1269 /* Pointer to library globals */
1270
1271
1272 /*
1273 * Check for valid argument...
1274 */
1275
1276 if (ch == 0)
1277 return (-1);
1278
1279 /*
1280 * Find combining class map...
1281 */
1282
1283 result = cupsNormalizeMapsGet();
1284
1285 if (result < 0)
1286 return (-1);
1287
1288 cmap = cg->combmap_cache;
1289
1290 if (cmap == NULL)
1291 return (-1);
1292
1293 /*
1294 * Find combining character in map...
1295 */
1296
1297 uni2comb = (_cups_comb_t *)bsearch(&ch, cmap->uni2comb, cmap->combcount,
1298 (sizeof(_cups_comb_t)), compare_combchar);
1299
1300 cupsNormalizeMapsFree();
1301
1302 if (uni2comb == NULL)
1303 combclass = 0;
1304 else
1305 combclass = (_cups_comb_class_t)uni2comb->combclass;
1306
1307 result = (int)combclass;
1308
1309 return (result);
1310 }
1311
1312
1313 /*
1314 * 'get_break_class()' - Get UTF-32 Character Line Break Class.
1315 */
1316
1317 static int /* O - Class or -1 on error */
1318 get_break_class(const cups_utf32_t ch) /* I - Source char */
1319 {
1320 int result; /* Result Value */
1321 _cups_break_map_t *bmap; /* Unicode Line Break Class Map */
1322 cups_break_class_t breakclass; /* Unicode Line Break Class */
1323 cups_ucs2_t *uni2break; /* Unicode -> Line Break Class */
1324 _cups_globals_t *cg = _cupsGlobals();
1325 /* Pointer to library globals */
1326
1327
1328 /*
1329 * Check for valid argument...
1330 */
1331
1332 if (ch == 0)
1333 return (-1);
1334
1335 /*
1336 * Find line break class map...
1337 */
1338
1339 result = cupsNormalizeMapsGet();
1340
1341 if (result < 0)
1342 return (-1);
1343
1344 bmap = cg->breakmap_cache;
1345
1346 if (bmap == NULL)
1347 return (-1);
1348
1349 /*
1350 * Find line break character in map...
1351 */
1352
1353 uni2break = (cups_ucs2_t *)bsearch(&ch, bmap->uni2break, bmap->breakcount,
1354 (sizeof(cups_ucs2_t) * 3),
1355 compare_breakchar);
1356
1357 cupsNormalizeMapsFree();
1358
1359 if (uni2break == NULL)
1360 breakclass = CUPS_BREAK_AI;
1361 else
1362 breakclass = (cups_break_class_t)*(uni2break + 2);
1363
1364 result = (int)breakclass;
1365
1366 return (result);
1367 }
1368
1369
1370 /*
1371 * 'get_map_count()' - Count lines in a map file.
1372 */
1373
1374 static int /* O - Count or -1 on error */
1375 get_map_count(const char *filename) /* I - Map Filename */
1376 {
1377 int i; /* Looping variable */
1378 cups_file_t *fp; /* Map input file pointer */
1379 char *s; /* Line parsing pointer */
1380 char line[256]; /* Line from input map file */
1381 cups_utf32_t unichar; /* Unicode character value */
1382
1383
1384 /*
1385 * Open map input file...
1386 */
1387
1388 if (!filename || !*filename)
1389 return (-1);
1390
1391 fp = cupsFileOpen(filename, "r");
1392 if (fp == NULL)
1393 return (-1);
1394
1395 /*
1396 * Count lines in map input file...
1397 */
1398
1399 for (i = 0; i < 50000;)
1400 {
1401 s = cupsFileGets(fp, line, sizeof(line));
1402 if (s == NULL)
1403 break;
1404 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1405 continue;
1406 if (strncmp (s, "0x", 2) == 0)
1407 s += 2;
1408 if (sscanf(s, "%lx", &unichar) != 1)
1409 break;
1410 if (unichar > 0xffff)
1411 break;
1412 i ++;
1413 }
1414 if (i == 0)
1415 i = -1;
1416
1417 /*
1418 * Close file and return map count (non-comment line count)...
1419 */
1420
1421 cupsFileClose(fp);
1422
1423 return (i);
1424 }
1425
1426
1427 /*
1428 * 'get_normmap()' - Get Unicode normalization map to cache.
1429 */
1430
1431 static int /* O - Zero or -1 on error */
1432 get_normmap(
1433 const cups_normalize_t normalize) /* I - Normalization Form */
1434 {
1435 int i; /* Looping variable */
1436 cups_utf32_t unichar1; /* Unicode character value */
1437 cups_utf32_t unichar2; /* Unicode character value */
1438 cups_utf32_t unichar3; /* Unicode character value */
1439 _cups_norm_map_t *nmap; /* Unicode Normalization Map */
1440 int normcount; /* Count of Unicode Source Chars */
1441 cups_ucs2_t *uni2norm; /* Unicode Char -> Normalization */
1442 char *mapname; /* Normalization map name */
1443 char filename[1024]; /* Filename for charset map file */
1444 cups_file_t *fp; /* Normalization map file pointer */
1445 char *s; /* Line parsing pointer */
1446 char line[256]; /* Line from input map file */
1447 _cups_globals_t *cg = _cupsGlobals();
1448 /* Pointer to library globals */
1449
1450
1451 /*
1452 * See if we already have this normalization map loaded...
1453 */
1454
1455 for (nmap = cg->normmap_cache; nmap != NULL; nmap = nmap->next)
1456 if (nmap->normalize == normalize)
1457 return (0);
1458
1459 /*
1460 * Get the mapping name...
1461 */
1462
1463 switch (normalize)
1464 {
1465 case CUPS_NORM_NFD: /* Canonical Decomposition */
1466 mapname = "uni-nfd.txt";
1467 break;
1468
1469 case CUPS_NORM_NFKD: /* Compatibility Decomposition */
1470 mapname = "uni-nfkd.txt";
1471 break;
1472
1473 case CUPS_NORM_NFC: /* Canonical Composition */
1474 mapname = "uni-nfc.txt";
1475 break;
1476
1477 case CUPS_NORM_NFKC: /* no such map file... */
1478 default:
1479 return (-1);
1480 }
1481
1482 /*
1483 * Open normalization map input file...
1484 */
1485
1486 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1487 cg->cups_datadir, mapname);
1488 if ((normcount = get_map_count(filename)) <= 0)
1489 return (-1);
1490
1491 fp = cupsFileOpen(filename, "r");
1492 if (fp == NULL)
1493 return (-1);
1494
1495 /*
1496 * Allocate memory for normalization map and add to cache...
1497 */
1498
1499 nmap = (_cups_norm_map_t *)calloc(1, sizeof(_cups_norm_map_t));
1500 if (nmap == NULL)
1501 {
1502 cupsFileClose(fp);
1503 return (-1);
1504 }
1505
1506 uni2norm = (cups_ucs2_t *)calloc(1, sizeof(cups_ucs2_t) * 3 * normcount);
1507 if (uni2norm == NULL)
1508 {
1509 free(nmap);
1510 cupsFileClose(fp);
1511 return (-1);
1512 }
1513 nmap->next = cg->normmap_cache;
1514 cg->normmap_cache = nmap;
1515 nmap->used ++;
1516 nmap->normalize = normalize;
1517 nmap->normcount = normcount;
1518 nmap->uni2norm = uni2norm;
1519
1520 /*
1521 * Save normalization map into memory for later use...
1522 */
1523 for (i = 0; i < normcount; )
1524 {
1525 s = cupsFileGets(fp, line, sizeof(line));
1526 if (s == NULL)
1527 break;
1528 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1529 continue;
1530 if (sscanf(s, "%lx %lx %lx", &unichar1, &unichar2, &unichar3) != 3)
1531 break;
1532 if ((unichar1 > 0xffff)
1533 || (unichar2 > 0xffff)
1534 || (unichar3 > 0xffff))
1535 break;
1536 *uni2norm ++ = (cups_ucs2_t) unichar1;
1537 *uni2norm ++ = (cups_ucs2_t) unichar2;
1538 *uni2norm ++ = (cups_ucs2_t) unichar3;
1539 i ++;
1540 }
1541 if (i < normcount)
1542 nmap->normcount = i;
1543 cupsFileClose(fp);
1544 return (0);
1545 }
1546
1547
1548 /*
1549 * 'get_foldmap()' - Get Unicode case folding map to cache.
1550 */
1551
1552 static int /* O - Zero or -1 on error */
1553 get_foldmap(const cups_folding_t fold) /* I - Case folding type */
1554 {
1555 int i; /* Looping variable */
1556 cups_utf32_t unichar1; /* Unicode character value */
1557 cups_utf32_t unichar2; /* Unicode character value */
1558 cups_utf32_t unichar3; /* Unicode character value */
1559 cups_utf32_t unichar4; /* Unicode character value */
1560 _cups_fold_map_t *fmap; /* Unicode Case Folding Map */
1561 int foldcount; /* Count of Unicode Source Chars */
1562 cups_ucs2_t *uni2fold; /* Unicode -> Folded Char(s) */
1563 char *mapname; /* Case Folding map name */
1564 char filename[1024]; /* Filename for charset map file */
1565 cups_file_t *fp; /* Case Folding map file pointer */
1566 char *s; /* Line parsing pointer */
1567 char line[256]; /* Line from input map file */
1568 _cups_globals_t *cg = _cupsGlobals();
1569 /* Pointer to library globals */
1570
1571
1572 /*
1573 * See if we already have this case folding map loaded...
1574 */
1575
1576 for (fmap = cg->foldmap_cache; fmap != NULL; fmap = fmap->next)
1577 if (fmap->fold == fold)
1578 return (0);
1579
1580 /*
1581 * Get the mapping name...
1582 */
1583
1584 switch (fold)
1585 {
1586 case CUPS_FOLD_SIMPLE: /* Simple case folding */
1587 mapname = "uni-fold.txt";
1588 break;
1589 case CUPS_FOLD_FULL: /* Full case folding */
1590 mapname = "uni-full.txt";
1591 break;
1592 default:
1593 return (-1);
1594 }
1595
1596 /*
1597 * Open case folding map input file...
1598 */
1599
1600 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1601 cg->cups_datadir, mapname);
1602 if ((foldcount = get_map_count(filename)) <= 0)
1603 return (-1);
1604 fp = cupsFileOpen(filename, "r");
1605 if (fp == NULL)
1606 return (-1);
1607
1608 /*
1609 * Allocate memory for case folding map and add to cache...
1610 */
1611 fmap = (_cups_fold_map_t *)calloc(1, sizeof(_cups_fold_map_t));
1612 if (fmap == NULL)
1613 {
1614 cupsFileClose(fp);
1615 return (-1);
1616 }
1617 uni2fold = (cups_ucs2_t *)calloc(1, sizeof(cups_ucs2_t) * 4 * foldcount);
1618 if (uni2fold == NULL)
1619 {
1620 free(fmap);
1621 cupsFileClose(fp);
1622 return (-1);
1623 }
1624 fmap->next = cg->foldmap_cache;
1625 cg->foldmap_cache = fmap;
1626 fmap->used ++;
1627 fmap->fold = fold;
1628 fmap->foldcount = foldcount;
1629 fmap->uni2fold = uni2fold;
1630
1631 /*
1632 * Save case folding map into memory for later use...
1633 */
1634
1635 for (i = 0; i < foldcount; )
1636 {
1637 s = cupsFileGets(fp, line, sizeof(line));
1638 if (s == NULL)
1639 break;
1640 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1641 continue;
1642 unichar1 = unichar2 = unichar3 = unichar4 = 0;
1643 if ((fold == CUPS_FOLD_SIMPLE)
1644 && (sscanf(s, "%lx %lx", &unichar1, &unichar2) != 2))
1645 break;
1646 if ((fold == CUPS_FOLD_FULL)
1647 && (sscanf(s, "%lx %lx %lx %lx",
1648 &unichar1, &unichar2, &unichar3, &unichar4) != 4))
1649 break;
1650 if ((unichar1 > 0xffff)
1651 || (unichar2 > 0xffff)
1652 || (unichar3 > 0xffff)
1653 || (unichar4 > 0xffff))
1654 break;
1655 *uni2fold ++ = (cups_ucs2_t) unichar1;
1656 *uni2fold ++ = (cups_ucs2_t) unichar2;
1657 *uni2fold ++ = (cups_ucs2_t) unichar3;
1658 *uni2fold ++ = (cups_ucs2_t) unichar4;
1659 i ++;
1660 }
1661 if (i < foldcount)
1662 fmap->foldcount = i;
1663 cupsFileClose(fp);
1664 return (0);
1665 }
1666
1667 /*
1668 * 'get_propmap()' - Get Unicode character property map to cache.
1669 */
1670
1671 static int /* O - Zero or -1 on error */
1672 get_propmap(void)
1673 {
1674 int i, j; /* Looping variables */
1675 size_t len; /* String length */
1676 cups_utf32_t unichar; /* Unicode character value */
1677 cups_gencat_t gencat; /* General Category Value */
1678 cups_bidi_t bidicat; /* Bidi Category Value */
1679 _cups_prop_map_t *pmap; /* Unicode Char Property Map */
1680 int propcount; /* Count of Unicode Source Chars */
1681 _cups_prop_t *uni2prop; /* Unicode Char -> Properties */
1682 char *mapname; /* Char Property map name */
1683 char filename[1024]; /* Filename for charset map file */
1684 cups_file_t *fp; /* Char Property map file pointer */
1685 char *s; /* Line parsing pointer */
1686 char line[256]; /* Line from input map file */
1687 _cups_globals_t *cg = _cupsGlobals();
1688 /* Pointer to library globals */
1689
1690
1691 /*
1692 * See if we already have this char properties map loaded...
1693 */
1694
1695 if ((pmap = cg->propmap_cache) != NULL)
1696 return (0);
1697
1698 /*
1699 * Get the mapping name...
1700 */
1701
1702 mapname = "uni-prop.txt";
1703
1704 /*
1705 * Open char properties map input file...
1706 */
1707 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1708 cg->cups_datadir, mapname);
1709 if ((propcount = get_map_count(filename)) <= 0)
1710 return (-1);
1711 fp = cupsFileOpen(filename, "r");
1712 if (fp == NULL)
1713 return (-1);
1714
1715 /*
1716 * Allocate memory for char properties map and add to cache...
1717 */
1718 pmap = (_cups_prop_map_t *)calloc(1, sizeof(_cups_prop_map_t));
1719 if (pmap == NULL)
1720 {
1721 cupsFileClose(fp);
1722 return (-1);
1723 }
1724 uni2prop = (_cups_prop_t *)calloc(1, sizeof(_cups_prop_t) * propcount);
1725 if (uni2prop == NULL)
1726 {
1727 free(pmap);
1728 cupsFileClose(fp);
1729 return (-1);
1730 }
1731 cg->propmap_cache = pmap;
1732 pmap->used ++;
1733 pmap->propcount = propcount;
1734 pmap->uni2prop = uni2prop;
1735
1736 /*
1737 * Save char properties map into memory for later use...
1738 */
1739 for (i = 0; i < propcount; )
1740 {
1741 s = cupsFileGets(fp, line, sizeof(line));
1742 if (s == NULL)
1743 break;
1744 if (strlen(s) > 0)
1745 *(s + strlen(s) - 1) = '\0';
1746 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1747 continue;
1748 if (sscanf(s, "%lx", &unichar) != 1)
1749 break;
1750 if (unichar > 0xffff)
1751 break;
1752 while ((*s != '\0') && (*s != ';'))
1753 s ++;
1754 if (*s != ';')
1755 break;
1756 s ++;
1757 for (j = 0; gencat_index[j].str != NULL; j ++)
1758 {
1759 len = strlen(gencat_index[j].str);
1760 if (strncmp (s, gencat_index[j].str, len) == 0)
1761 break;
1762 }
1763 if (gencat_index[j].str == NULL)
1764 return (-1);
1765 gencat = gencat_index[j].gencat;
1766 while ((*s != '\0') && (*s != ';'))
1767 s ++;
1768 if (*s != ';')
1769 break;
1770 s ++;
1771 for (j = 0; bidicat_index[j] != NULL; j ++)
1772 {
1773 len = strlen(bidicat_index[j]);
1774 if (strncmp (s, bidicat_index[j], len) == 0)
1775 break;
1776 }
1777 if (bidicat_index[j] == NULL)
1778 return (-1);
1779 bidicat = (cups_bidi_t) j;
1780 uni2prop->ch = (cups_ucs2_t) unichar;
1781 uni2prop->gencat = (unsigned char) gencat;
1782 uni2prop->bidicat = (unsigned char) bidicat;
1783 uni2prop ++;
1784 i ++;
1785 }
1786 if (i < propcount)
1787 pmap->propcount = i;
1788 cupsFileClose(fp);
1789 return (0);
1790 }
1791
1792
1793 /*
1794 * 'get_combmap()' - Get Unicode combining class map to cache.
1795 */
1796
1797 static int /* O - Zero or -1 on error */
1798 get_combmap(void)
1799 {
1800 int i; /* Looping variable */
1801 cups_utf32_t unichar; /* Unicode character value */
1802 int combclass; /* Unicode char combining class */
1803 _cups_comb_map_t *cmap; /* Unicode Comb Class Map */
1804 int combcount; /* Count of Unicode Source Chars */
1805 _cups_comb_t *uni2comb; /* Unicode Char -> Combining Class */
1806 char *mapname; /* Comb Class map name */
1807 char filename[1024]; /* Filename for charset map file */
1808 cups_file_t *fp; /* Comb Class map file pointer */
1809 char *s; /* Line parsing pointer */
1810 char line[256]; /* Line from input map file */
1811 _cups_globals_t *cg = _cupsGlobals();
1812 /* Pointer to library globals */
1813
1814
1815 /*
1816 * See if we already have this combining class map loaded...
1817 */
1818
1819 if ((cmap = cg->combmap_cache) != NULL)
1820 return (0);
1821
1822 /*
1823 * Get the mapping name...
1824 */
1825
1826 mapname = "uni-comb.txt";
1827
1828 /*
1829 * Open combining class map input file...
1830 */
1831
1832 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1833 cg->cups_datadir, mapname);
1834 if ((combcount = get_map_count(filename)) <= 0)
1835 return (-1);
1836 fp = cupsFileOpen(filename, "r");
1837 if (fp == NULL)
1838 return (-1);
1839
1840 /*
1841 * Allocate memory for combining class map and add to cache...
1842 */
1843
1844 cmap = (_cups_comb_map_t *)calloc(1, sizeof(_cups_comb_map_t));
1845 if (cmap == NULL)
1846 {
1847 cupsFileClose(fp);
1848 return (-1);
1849 }
1850
1851 uni2comb = (_cups_comb_t *)calloc(1, sizeof(_cups_comb_t) * combcount);
1852 if (uni2comb == NULL)
1853 {
1854 free(cmap);
1855 cupsFileClose(fp);
1856 return (-1);
1857 }
1858 cg->combmap_cache = cmap;
1859 cmap->used ++;
1860 cmap->combcount = combcount;
1861 cmap->uni2comb = uni2comb;
1862
1863 /*
1864 * Save combining class map into memory for later use...
1865 */
1866 for (i = 0; i < combcount; )
1867 {
1868 s = cupsFileGets(fp, line, sizeof(line));
1869 if (s == NULL)
1870 break;
1871 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1872 continue;
1873 if (sscanf(s, "%lx", &unichar) != 1)
1874 break;
1875 if (unichar > 0xffff)
1876 break;
1877 while ((*s != '\0') && (*s != ';'))
1878 s ++;
1879 if (*s != ';')
1880 break;
1881 s ++;
1882 if (sscanf(s, "%d", &combclass) != 1)
1883 break;
1884 uni2comb->ch = (cups_ucs2_t) unichar;
1885 uni2comb->combclass = (unsigned char) combclass;
1886 uni2comb ++;
1887 i ++;
1888 }
1889 if (i < combcount)
1890 cmap->combcount = i;
1891 cupsFileClose(fp);
1892 return (0);
1893 }
1894
1895
1896 /*
1897 * 'get_breakmap()' - Get Unicode line break class map to cache.
1898 */
1899
1900 static int /* O - Zero or -1 on error */
1901 get_breakmap(void)
1902 {
1903 int i, j; /* Looping variables */
1904 int len; /* String length */
1905 cups_utf32_t unichar1; /* Unicode character value */
1906 cups_utf32_t unichar2; /* Unicode character value */
1907 cups_break_class_t breakclass; /* Unicode char line break class */
1908 _cups_break_map_t *bmap; /* Unicode Line Break Class Map */
1909 int breakcount; /* Count of Unicode Source Chars */
1910 cups_ucs2_t *uni2break; /* Unicode -> Line Break Class */
1911 char *mapname; /* Comb Class map name */
1912 char filename[1024]; /* Filename for charset map file */
1913 cups_file_t *fp; /* Comb Class map file pointer */
1914 char *s; /* Line parsing pointer */
1915 char line[256]; /* Line from input map file */
1916 _cups_globals_t *cg = _cupsGlobals();
1917 /* Pointer to library globals */
1918
1919
1920 /*
1921 * See if we already have this line break class map loaded...
1922 */
1923
1924 if ((bmap = cg->breakmap_cache) != NULL)
1925 return (0);
1926
1927 /*
1928 * Get the mapping name...
1929 */
1930
1931 mapname = "uni-line.txt";
1932
1933 /*
1934 * Open line break class map input file...
1935 */
1936
1937 snprintf(filename, sizeof(filename), "%s/charmaps/%s",
1938 cg->cups_datadir, mapname);
1939 if ((breakcount = get_map_count(filename)) <= 0)
1940 return (-1);
1941 fp = cupsFileOpen(filename, "r");
1942 if (fp == NULL)
1943 return (-1);
1944
1945 /*
1946 * Allocate memory for line break class map and add to cache...
1947 */
1948
1949 bmap = (_cups_break_map_t *)calloc(1, sizeof(_cups_break_map_t));
1950 if (bmap == NULL)
1951 {
1952 cupsFileClose(fp);
1953 return (-1);
1954 }
1955
1956 uni2break = (cups_ucs2_t *)calloc(1, sizeof(cups_ucs2_t) * 3 * breakcount);
1957 if (uni2break == NULL)
1958 {
1959 free(bmap);
1960 cupsFileClose(fp);
1961 return (-1);
1962 }
1963 cg->breakmap_cache = bmap;
1964 bmap->used ++;
1965 bmap->breakcount = breakcount;
1966 bmap->uni2break = uni2break;
1967
1968 /*
1969 * Save line break class map into memory for later use...
1970 */
1971 for (i = 0; i < breakcount; )
1972 {
1973 s = cupsFileGets(fp, line, sizeof(line));
1974 if (s == NULL)
1975 break;
1976 if (strlen(s) > 0)
1977 *(s + strlen(s) - 1) = '\0';
1978 if ((*s == '#') || (*s == '\n') || (*s == '\0'))
1979 continue;
1980 if (sscanf(s, "%lx %lx", &unichar1, &unichar2) != 2)
1981 break;
1982 if ((unichar1 > 0xffff)
1983 || (unichar2 > 0xffff))
1984 break;
1985 while ((*s != '\0') && (*s != ';'))
1986 s ++;
1987 if (*s != ';')
1988 break;
1989 s ++;
1990 for (j = 0; break_index[j].str != NULL; j ++)
1991 {
1992 len = strlen (break_index[j].str);
1993 if (strncmp (s, break_index[j].str, len) == 0)
1994 break;
1995 }
1996 if (break_index[j].str == NULL)
1997 return (-1);
1998 breakclass = break_index[j].breakclass;
1999 *uni2break ++ = (cups_ucs2_t) unichar1;
2000 *uni2break ++ = (cups_ucs2_t) unichar2;
2001 *uni2break ++ = (cups_ucs2_t) breakclass;
2002 i ++;
2003 }
2004 if (i < breakcount)
2005 bmap->breakcount = i;
2006 cupsFileClose(fp);
2007 return (0);
2008 }
2009
2010
2011 /*
2012 * 'compare_compose()' - Compare key for compose match.
2013 *
2014 * Note - This function cannot be easily modified for 32-bit Unicode.
2015 */
2016
2017 static int /* O - Result of comparison */
2018 compare_compose(const void *k1, /* I - Key char */
2019 const void *k2) /* I - Map char */
2020 {
2021 cups_utf32_t *kp = (cups_utf32_t *)k1;
2022 /* Key char pointer */
2023 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2024 unsigned long key; /* Pair of key characters */
2025 unsigned long map; /* Pair of map characters */
2026 int result; /* Result Value */
2027
2028
2029 key = (*kp << 16);
2030 key |= *(kp + 1);
2031 map = (unsigned long) (*mp << 16);
2032 map |= (unsigned long) *(mp + 1);
2033
2034 if (key >= map)
2035 result = (int) (key - map);
2036 else
2037 result = -1 * ((int) (map - key));
2038
2039 return (result);
2040 }
2041
2042
2043 /*
2044 * 'compare_decompose()' - Compare key for decompose match.
2045 */
2046
2047 static int /* O - Result of comparison */
2048 compare_decompose(const void *k1, /* I - Key char */
2049 const void *k2) /* I - Map char */
2050 {
2051 cups_utf32_t *kp = (cups_utf32_t *)k1;
2052 /* Key char pointer */
2053 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2054 cups_ucs2_t ch; /* Key char as UCS-2 */
2055 int result; /* Result Value */
2056
2057
2058 ch = (cups_ucs2_t) *kp;
2059
2060 if (ch >= *mp)
2061 result = (int) (ch - *mp);
2062 else
2063 result = -1 * ((int) (*mp - ch));
2064
2065 return (result);
2066 }
2067
2068
2069 /*
2070 * 'compare_foldchar()' - Compare key for case fold match.
2071 */
2072
2073 static int /* O - Result of comparison */
2074 compare_foldchar(const void *k1, /* I - Key char */
2075 const void *k2) /* I - Map char */
2076 {
2077 cups_utf32_t *kp = (cups_utf32_t *)k1;
2078 /* Key char pointer */
2079 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2080 cups_ucs2_t ch; /* Key char as UCS-2 */
2081 int result; /* Result Value */
2082
2083
2084 ch = (cups_ucs2_t) *kp;
2085
2086 if (ch >= *mp)
2087 result = (int) (ch - *mp);
2088 else
2089 result = -1 * ((int) (*mp - ch));
2090
2091 return (result);
2092 }
2093
2094
2095 /*
2096 * 'compare_combchar()' - Compare key for combining char match.
2097 */
2098
2099 static int /* O - Result of comparison */
2100 compare_combchar(const void *k1, /* I - Key char */
2101 const void *k2) /* I - Map char */
2102 {
2103 cups_utf32_t *kp = (cups_utf32_t *)k1;
2104 /* Key char pointer */
2105 _cups_comb_t *cp = (_cups_comb_t *)k2;/* Combining map row pointer */
2106 cups_ucs2_t ch; /* Key char as UCS-2 */
2107 int result; /* Result Value */
2108
2109
2110 ch = (cups_ucs2_t) *kp;
2111
2112 if (ch >= cp->ch)
2113 result = (int) (ch - cp->ch);
2114 else
2115 result = -1 * ((int) (cp->ch - ch));
2116
2117 return (result);
2118 }
2119
2120
2121 /*
2122 * 'compare_breakchar()' - Compare key for line break char match.
2123 */
2124
2125 static int /* O - Result of comparison */
2126 compare_breakchar(const void *k1, /* I - Key char */
2127 const void *k2) /* I - Map char */
2128 {
2129 cups_utf32_t *kp = (cups_utf32_t *)k1;
2130 /* Key char pointer */
2131 cups_ucs2_t *mp = (cups_ucs2_t *)k2;/* Map char pointer */
2132 cups_ucs2_t ch; /* Key char as UCS-2 */
2133 int result; /* Result Value */
2134
2135
2136 ch = (cups_ucs2_t) *kp;
2137
2138 if (ch < *mp)
2139 result = -1 * (int) (*mp - ch);
2140 else if (ch > *(mp + 1))
2141 result = (int) (ch - *(mp + 1));
2142 else
2143 result = 0;
2144
2145 return (result);
2146 }
2147
2148
2149 /*
2150 * 'compare_propchar()' - Compare key for property char match.
2151 */
2152
2153 static int /* O - Result of comparison */
2154 compare_propchar(const void *k1, /* I - Key char */
2155 const void *k2) /* I - Map char */
2156 {
2157 cups_utf32_t *kp = (cups_utf32_t *)k1;
2158 /* Key char pointer */
2159 _cups_prop_t *pp = (_cups_prop_t *)k2;/* Property map row pointer */
2160 cups_ucs2_t ch; /* Key char as UCS-2 */
2161 int result; /* Result Value */
2162
2163
2164 ch = (cups_ucs2_t) *kp;
2165
2166 if (ch >= pp->ch)
2167 result = (int) (ch - pp->ch);
2168 else
2169 result = -1 * ((int) (pp->ch - ch));
2170
2171 return (result);
2172 }
2173
2174
2175 /*
2176 * End of "$Id: normalize.c 4903 2006-01-10 20:02:46Z mike $"
2177 */