]>
Commit | Line | Data |
---|---|---|
ef416fc2 | 1 | /* |
2 | * "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $" | |
3 | * | |
4 | * Unicode normalization for the Common UNIX Printing System (CUPS). | |
5 | * | |
6 | * Copyright 1997-2005 by Easy Software Products. | |
7 | * | |
8 | * These coded instructions, statements, and computer programs are | |
9 | * the property of Easy Software Products and are protected by Federal | |
10 | * copyright law. Distribution and use rights are outlined in the | |
11 | * file "LICENSE.txt" which should have been included with this file. | |
12 | * If this file is missing or damaged please contact Easy Software | |
13 | * Products at: | |
14 | * | |
15 | * Attn: CUPS Licensing Information | |
16 | * Easy Software Products | |
17 | * 44141 Airport View Drive, Suite 204 | |
18 | * Hollywood, Maryland 20636 USA | |
19 | * | |
20 | * Voice: (301) 373-9600 | |
21 | * EMail: cups-info@cups.org | |
22 | * WWW: http://www.cups.org | |
23 | */ | |
24 | ||
25 | #ifndef _CUPS_NORMALIZE_H_ | |
26 | # define _CUPS_NORMALIZE_H_ | |
27 | ||
28 | /* | |
29 | * Include necessary headers... | |
30 | */ | |
31 | ||
32 | #include "transcode.h" | |
33 | ||
34 | # ifdef __cplusplus | |
35 | extern "C" { | |
36 | # endif /* __cplusplus */ | |
37 | ||
38 | ||
39 | /* | |
40 | * Types... | |
41 | */ | |
42 | ||
43 | typedef enum /**** Normalization Types (Unicode UAX-15 forms) ****/ | |
44 | { | |
45 | CUPS_NORM_NFD, /* Canonical Decomposition */ | |
46 | CUPS_NORM_NFKD, /* Compatibility Decomposition */ | |
47 | CUPS_NORM_NFC, /* NFD, then Canonical Composition */ | |
48 | CUPS_NORM_NFKC /* NFKD, then Canonical Composition */ | |
49 | } cups_normalize_t; | |
50 | ||
51 | typedef enum /**** Case Folding Types ('fold' argument of the CaseFold functions) ****/ | |
52 | { | |
53 | CUPS_FOLD_SIMPLE, /* Simple - folded text never expands in size */ | |
54 | CUPS_FOLD_FULL /* Full - folded text may expand in size */ | |
55 | } cups_folding_t; | |
56 | ||
57 | typedef enum /**** Unicode Char Property Types ('prop' argument of cupsUTF32CharacterProperty) ****/ | |
58 | { | |
59 | CUPS_PROP_GENERAL_CATEGORY, /* General category - see 'cups_gencat_t' enum */ | |
60 | CUPS_PROP_BIDI_CATEGORY, /* Bidi category - see 'cups_bidi_t' enum */ | |
61 | CUPS_PROP_COMBINING_CLASS, /* Combining class - see '_cups_comb_class_t' type */ | |
62 | CUPS_PROP_BREAK_CLASS /* Line break class - see 'cups_break_class_t' enum */ | |
63 | } cups_property_t; | |
64 | ||
65 | ||
66 | /* | |
67 | * Note - The high nibble of each value below is the major class, so a major class can be tested by mask (e.g. value & 0xF0). | |
68 | */ | |
69 | ||
70 | typedef enum /**** Unicode General Category (high nibble = major class, low nibble = subclass) ****/ | |
71 | { | |
72 | CUPS_GENCAT_L = 0x10, /* Letter major class */ | |
73 | CUPS_GENCAT_LU = 0x11, /* Lu Letter, Uppercase */ | |
74 | CUPS_GENCAT_LL = 0x12, /* Ll Letter, Lowercase */ | |
75 | CUPS_GENCAT_LT = 0x13, /* Lt Letter, Titlecase */ | |
76 | CUPS_GENCAT_LM = 0x14, /* Lm Letter, Modifier */ | |
77 | CUPS_GENCAT_LO = 0x15, /* Lo Letter, Other */ | |
78 | CUPS_GENCAT_M = 0x20, /* Mark major class */ | |
79 | CUPS_GENCAT_MN = 0x21, /* Mn Mark, Non-Spacing */ | |
80 | CUPS_GENCAT_MC = 0x22, /* Mc Mark, Spacing Combining */ | |
81 | CUPS_GENCAT_ME = 0x23, /* Me Mark, Enclosing */ | |
82 | CUPS_GENCAT_N = 0x30, /* Number major class */ | |
83 | CUPS_GENCAT_ND = 0x31, /* Nd Number, Decimal Digit */ | |
84 | CUPS_GENCAT_NL = 0x32, /* Nl Number, Letter */ | |
85 | CUPS_GENCAT_NO = 0x33, /* No Number, Other */ | |
86 | CUPS_GENCAT_P = 0x40, /* Punctuation major class */ | |
87 | CUPS_GENCAT_PC = 0x41, /* Pc Punctuation, Connector */ | |
88 | CUPS_GENCAT_PD = 0x42, /* Pd Punctuation, Dash */ | |
89 | CUPS_GENCAT_PS = 0x43, /* Ps Punctuation, Open (start) */ | |
90 | CUPS_GENCAT_PE = 0x44, /* Pe Punctuation, Close (end) */ | |
91 | CUPS_GENCAT_PI = 0x45, /* Pi Punctuation, Initial Quote */ | |
92 | CUPS_GENCAT_PF = 0x46, /* Pf Punctuation, Final Quote */ | |
93 | CUPS_GENCAT_PO = 0x47, /* Po Punctuation, Other */ | |
94 | CUPS_GENCAT_S = 0x50, /* Symbol major class */ | |
95 | CUPS_GENCAT_SM = 0x51, /* Sm Symbol, Math */ | |
96 | CUPS_GENCAT_SC = 0x52, /* Sc Symbol, Currency */ | |
97 | CUPS_GENCAT_SK = 0x53, /* Sk Symbol, Modifier */ | |
98 | CUPS_GENCAT_SO = 0x54, /* So Symbol, Other */ | |
99 | CUPS_GENCAT_Z = 0x60, /* Separator major class */ | |
100 | CUPS_GENCAT_ZS = 0x61, /* Zs Separator, Space */ | |
101 | CUPS_GENCAT_ZL = 0x62, /* Zl Separator, Line */ | |
102 | CUPS_GENCAT_ZP = 0x63, /* Zp Separator, Paragraph */ | |
103 | CUPS_GENCAT_C = 0x70, /* Other (miscellaneous) major class */ | |
104 | CUPS_GENCAT_CC = 0x71, /* Cc Other, Control */ | |
105 | CUPS_GENCAT_CF = 0x72, /* Cf Other, Format */ | |
106 | CUPS_GENCAT_CS = 0x73, /* Cs Other, Surrogate */ | |
107 | CUPS_GENCAT_CO = 0x74, /* Co Other, Private Use */ | |
108 | CUPS_GENCAT_CN = 0x75 /* Cn Other, Not Assigned */ | |
109 | } cups_gencat_t; | |
110 | ||
111 | typedef enum /**** Unicode Bidi (Bidirectional) Category ****/ | |
112 | { | |
113 | CUPS_BIDI_L, /* Left-to-Right (Alpha, Ideographic) */ | |
114 | CUPS_BIDI_LRE, /* Left-to-Right Embedding (explicit) */ | |
115 | CUPS_BIDI_LRO, /* Left-to-Right Override (explicit) */ | |
116 | CUPS_BIDI_R, /* Right-to-Left (Hebrew alpha/punct) */ | |
117 | CUPS_BIDI_AL, /* Right-to-Left Arabic (Arabic, etc) */ | |
118 | CUPS_BIDI_RLE, /* Right-to-Left Embedding (explicit) */ | |
119 | CUPS_BIDI_RLO, /* Right-to-Left Override (explicit) */ | |
120 | CUPS_BIDI_PDF, /* Pop Directional Format */ | |
121 | CUPS_BIDI_EN, /* European Number (European & Indic digits) */ | |
122 | CUPS_BIDI_ES, /* European Number Separator (Slash) */ | |
123 | CUPS_BIDI_ET, /* European Number Terminator */ | |
124 | CUPS_BIDI_AN, /* Arabic Number (digits, separators) */ | |
125 | CUPS_BIDI_CS, /* Common Number Separator */ | |
126 | CUPS_BIDI_NSM, /* Non-Spacing Mark (Mn/Me in UCD) */ | |
127 | CUPS_BIDI_BN, /* Boundary Neutral (formatting, etc) */ | |
128 | CUPS_BIDI_B, /* Paragraph Separator */ | |
129 | CUPS_BIDI_S, /* Segment Separator (Tab) */ | |
130 | CUPS_BIDI_WS, /* Whitespace (Space, etc) */ | |
131 | CUPS_BIDI_ON /* Other Neutrals */ | |
132 | } cups_bidi_t; | |
133 | ||
134 | /* | |
135 | * Note - add state table from UAX-14, section 7.3. | |
136 | * Remember to do BK and SP in outer loop (not in state table). | |
137 | * Consider optimization for CM (combining mark). | |
138 | * See 'LineBreak.txt' (12,875) and 'DerivedLineBreak.txt' (1,350). | |
139 | */ | |
140 | ||
141 | typedef enum /**** Unicode Line Break Class (UAX-14) ****/ | |
142 | { | |
143 | /* | |
144 | * Legend for the tags used in the member comments below: (A) - Allow Break AFTER | |
145 | * (XA) - Prevent Break AFTER | |
146 | * (B) - Allow Break BEFORE | |
147 | * (XB) - Prevent Break BEFORE | |
148 | * (P) - Allow Break For Pair | |
149 | * (XP) - Prevent Break For Pair; "(norm)" is not defined in this file - presumably "normative" per UAX-14 (TODO confirm) | |
150 | */ | |
151 | CUPS_BREAK_AI, /* Ambiguous Alphabetic or Ideograph */ | |
152 | CUPS_BREAK_AL, /* Ordinary Alpha/Symbol Chars (XP) */ | |
153 | CUPS_BREAK_BA, /* Break Opportunity After Chars (A) */ | |
154 | CUPS_BREAK_BB, /* Break Opportunity Before Chars (B) */ | |
155 | CUPS_BREAK_B2, /* Break Opportunity Either (B/A/XP) */ | |
156 | CUPS_BREAK_BK, /* Mandatory Break (A) (norm) */ | |
157 | CUPS_BREAK_CB, /* Contingent Break (B/A) (norm) */ | |
158 | CUPS_BREAK_CL, /* Closing Punctuation (XB) */ | |
159 | CUPS_BREAK_CM, /* Attached/Combining (XB) (norm) */ | |
160 | CUPS_BREAK_CR, /* Carriage Return (A) (norm) */ | |
161 | CUPS_BREAK_EX, /* Exclamation/Interrogation (XB) */ | |
162 | CUPS_BREAK_GL, /* Non-breaking "Glue" (XB/XA) (norm) */ | |
163 | CUPS_BREAK_HY, /* Hyphen (XA) */ | |
164 | CUPS_BREAK_ID, /* Ideographic (B/A) */ | |
165 | CUPS_BREAK_IN, /* Inseparable chars (XP) */ | |
166 | CUPS_BREAK_IS, /* Numeric Separator (Infix) (XB) */ | |
167 | CUPS_BREAK_LF, /* Line Feed (A) (norm) */ | |
168 | CUPS_BREAK_NS, /* Non-starters (XB) */ | |
169 | CUPS_BREAK_NU, /* Numeric (XP) */ | |
170 | CUPS_BREAK_OP, /* Opening Punctuation (XA) */ | |
171 | CUPS_BREAK_PO, /* Postfix (Numeric) (XB) */ | |
172 | CUPS_BREAK_PR, /* Prefix (Numeric) (XA) */ | |
173 | CUPS_BREAK_QU, /* Ambiguous Quotation (XB/XA) */ | |
174 | CUPS_BREAK_SA, /* Context Dependent (SE Asian) (P) */ | |
175 | CUPS_BREAK_SG, /* Surrogates (XP) (norm) */ | |
176 | CUPS_BREAK_SP, /* Space (A) (norm) */ | |
177 | CUPS_BREAK_SY, /* Symbols Allowing Break After (A) */ | |
178 | CUPS_BREAK_XX, /* Unknown (XP) */ | |
179 | CUPS_BREAK_ZW /* Zero Width Space (A) (norm) */ | |
180 | } cups_break_class_t; | |
181 | ||
182 | typedef int _cups_comb_class_t; /**** Unicode Combining Class (plain integer, not an enum) ****/ | |
183 | /* Value: 0 = base char, 1..254 = combining char */ | |
184 | ||
185 | /* | |
186 | * Structures... | |
187 | */ | |
188 | ||
189 | typedef struct _cups_normmap_s /**** Normalize Map Cache Struct (note: tag is '_cups_normmap_s', type is '_cups_norm_map_t') ****/ | |
190 | { | |
191 | struct _cups_normmap_s *next; /* Next normalize map in cache list */ | |
192 | int used; /* Number of times entry used */ | |
193 | cups_normalize_t normalize; /* Normalization type (see 'cups_normalize_t') */ | |
194 | int normcount; /* Count of Source Chars */ | |
195 | cups_ucs2_t *uni2norm; /* Char -> Normalization table */ | |
196 | /* ...only supports UCS-2 ('cups_ucs2_t' entries) */ | |
197 | } _cups_norm_map_t; | |
198 | ||
199 | typedef struct _cups_foldmap_s /**** Case Fold Map Cache Struct (note: tag is '_cups_foldmap_s', type is '_cups_fold_map_t') ****/ | |
200 | { | |
201 | struct _cups_foldmap_s *next; /* Next case fold map in cache list */ | |
202 | int used; /* Number of times entry used */ | |
203 | cups_folding_t fold; /* Case folding type (see 'cups_folding_t') */ | |
204 | int foldcount; /* Count of Source Chars */ | |
205 | cups_ucs2_t *uni2fold; /* Char -> Folded Char(s) table */ | |
206 | /* ...only supports UCS-2 ('cups_ucs2_t' entries) */ | |
207 | } _cups_fold_map_t; | |
208 | ||
209 | typedef struct _cups_prop_s /**** Char Property Struct (one entry per character) ****/ | |
210 | { | |
211 | cups_ucs2_t ch; /* Unicode Char as UCS-2 */ | |
212 | unsigned char gencat; /* General Category (presumably a 'cups_gencat_t' value - confirm) */ | |
213 | unsigned char bidicat; /* Bidirectional Category (presumably a 'cups_bidi_t' value - confirm) */ | |
214 | } _cups_prop_t; | |
215 | ||
216 | typedef struct _cups_prop_map_s /**** Char Property Map Struct (no 'next' link, unlike the normalize/fold maps) ****/ | |
217 | { | |
218 | int used; /* Number of times entry used */ | |
219 | int propcount; /* Count of Source Chars */ | |
220 | _cups_prop_t *uni2prop; /* Char -> Properties ('_cups_prop_t' entries) */ | |
221 | } _cups_prop_map_t; | |
222 | ||
223 | typedef struct _cups_break_map_s /**** Line Break Class Map Struct (no 'next' link, unlike the normalize/fold maps) ****/ | |
224 | { | |
225 | int used; /* Number of times entry used */ | |
226 | int breakcount; /* Count of Source Chars */ | |
227 | cups_ucs2_t *uni2break; /* Char -> Line Break Class (stored as 'cups_ucs2_t' values) */ | |
228 | } _cups_break_map_t; | |
229 | ||
230 | typedef struct _cups_comb_s /**** Char Combining Class Struct (one entry per character) ****/ | |
231 | { | |
232 | cups_ucs2_t ch; /* Unicode Char as UCS-2 */ | |
233 | unsigned char combclass; /* Combining Class (value range documented at '_cups_comb_class_t') */ | |
234 | unsigned char reserved; /* Reserved for alignment (carries no data) */ | |
235 | } _cups_comb_t; | |
236 | ||
237 | typedef struct _cups_comb_map_s /**** Combining Class Map Struct (no 'next' link, unlike the normalize/fold maps) ****/ | |
238 | { | |
239 | int used; /* Number of times entry used */ | |
240 | int combcount; /* Count of Source Chars */ | |
241 | _cups_comb_t *uni2comb; /* Char -> Combining Class ('_cups_comb_t' entries) */ | |
242 | } _cups_comb_map_t; | |
243 | ||
244 | /* | |
245 | * Prototypes... | |
246 | */ | |
247 | ||
248 | /* | |
249 | * Utility functions for the normalization module's maps (get, free, flush). NOTE(review): the meaning of the 'int' results is not documented in this header - confirm in the implementation. | |
250 | */ | |
251 | extern int cupsNormalizeMapsGet(void); | |
252 | extern int cupsNormalizeMapsFree(void); | |
253 | extern void cupsNormalizeMapsFlush(void); | |
254 | ||
255 | /* | |
256 | * Normalize UTF-8 string 'src' to the Unicode UAX-15 Normalization Form selected by 'normalize' | |
257 | * Note - Compatibility Normalization Forms (NFKD/NFKC) are unsafe for subsequent transcoding to legacy charsets | |
258 | * NOTE(review): output presumably goes to 'dest', bounded by 'maxout'; the 'int' result is not documented here - confirm in the implementation | |
259 | */ | |
260 | extern int cupsUTF8Normalize(cups_utf8_t *dest, | |
261 | const cups_utf8_t *src, | |
262 | const int maxout, | |
263 | const cups_normalize_t normalize); | |
264 | ||
265 | /* | |
266 | * Normalize UTF-32 string 'src' to the Unicode UAX-15 Normalization Form selected by 'normalize' | |
267 | * Note - Compatibility Normalization Forms (NFKD/NFKC) are unsafe for subsequent transcoding to legacy charsets | |
268 | * NOTE(review): output presumably goes to 'dest', bounded by 'maxout'; the 'int' result is not documented here - confirm in the implementation | |
269 | */ | |
270 | extern int cupsUTF32Normalize(cups_utf32_t *dest, | |
271 | const cups_utf32_t *src, | |
272 | const int maxout, | |
273 | const cups_normalize_t normalize); | |
274 | ||
275 | /* | |
276 | * Case Fold UTF-8 string 'src' per Unicode UAX-21 Section 2.3, using the folding type selected by 'fold' | |
277 | * Note - Case folding output is unsafe for subsequent transcoding to legacy charsets | |
278 | * NOTE(review): output presumably goes to 'dest', bounded by 'maxout'; the 'int' result is not documented here - confirm in the implementation | |
279 | */ | |
280 | extern int cupsUTF8CaseFold(cups_utf8_t *dest, | |
281 | const cups_utf8_t *src, | |
282 | const int maxout, | |
283 | const cups_folding_t fold); | |
284 | ||
285 | /* | |
286 | * Case Fold UTF-32 string 'src' per Unicode UAX-21 Section 2.3, using the folding type selected by 'fold' | |
287 | * Note - Case folding output is unsafe for subsequent transcoding to legacy charsets | |
288 | * NOTE(review): output presumably goes to 'dest', bounded by 'maxout'; the 'int' result is not documented here - confirm in the implementation | |
289 | */ | |
290 | extern int cupsUTF32CaseFold(cups_utf32_t *dest, | |
291 | const cups_utf32_t *src, | |
292 | const int maxout, | |
293 | const cups_folding_t fold); | |
294 | ||
295 | /* | |
296 | * Compare UTF-8 strings 's1' and 's2' after case folding. NOTE(review): the 'int' result is presumably a strcmp-style ordering - confirm in the implementation. | |
297 | */ | |
298 | extern int cupsUTF8CompareCaseless(const cups_utf8_t *s1, | |
299 | const cups_utf8_t *s2); | |
300 | ||
301 | /* | |
302 | * Compare UTF-32 strings 's1' and 's2' after case folding. NOTE(review): the 'int' result is presumably a strcmp-style ordering - confirm in the implementation. | |
303 | */ | |
304 | extern int cupsUTF32CompareCaseless(const cups_utf32_t *s1, | |
305 | const cups_utf32_t *s2); | |
306 | ||
307 | /* | |
308 | * Compare UTF-8 strings 's1' and 's2' after case folding and NFKC normalization. NOTE(review): the 'int' result is presumably a strcmp-style ordering - confirm in the implementation. | |
309 | */ | |
310 | extern int cupsUTF8CompareIdentifier(const cups_utf8_t *s1, | |
311 | const cups_utf8_t *s2); | |
312 | ||
313 | /* | |
314 | * Compare UTF-32 strings 's1' and 's2' after case folding and NFKC normalization. NOTE(review): the 'int' result is presumably a strcmp-style ordering - confirm in the implementation. | |
315 | */ | |
316 | extern int cupsUTF32CompareIdentifier(const cups_utf32_t *s1, | |
317 | const cups_utf32_t *s2); | |
318 | ||
319 | /* | |
320 | * Get the property selected by 'prop' for UTF-32 character 'ch'; 'cups_property_t' names the enum/type that matches each property. NOTE(review): error result is not documented here - confirm in the implementation. | |
321 | */ | |
322 | extern int cupsUTF32CharacterProperty(const cups_utf32_t ch, | |
323 | const cups_property_t prop); | |
324 | ||
325 | # ifdef __cplusplus | |
326 | } | |
327 | # endif /* __cplusplus */ | |
328 | ||
329 | #endif /* !_CUPS_NORMALIZE_H_ */ | |
330 | ||
331 | ||
332 | /* | |
333 | * End of "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $" | |
334 | */ |