]> git.ipfire.org Git - thirdparty/cups.git/blob - cups/normalize.h
Load cups into easysw/current.
[thirdparty/cups.git] / cups / normalize.h
1 /*
2 * "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $"
3 *
4 * Unicode normalization for the Common UNIX Printing System (CUPS).
5 *
6 * Copyright 1997-2005 by Easy Software Products.
7 *
8 * These coded instructions, statements, and computer programs are
9 * the property of Easy Software Products and are protected by Federal
10 * copyright law. Distribution and use rights are outlined in the
11 * file "LICENSE.txt" which should have been included with this file.
12 * If this file is missing or damaged please contact Easy Software
13 * Products at:
14 *
15 * Attn: CUPS Licensing Information
16 * Easy Software Products
17 * 44141 Airport View Drive, Suite 204
18 * Hollywood, Maryland 20636 USA
19 *
20 * Voice: (301) 373-9600
21 * EMail: cups-info@cups.org
22 * WWW: http://www.cups.org
23 */
24
25 #ifndef _CUPS_NORMALIZE_H_
26 # define _CUPS_NORMALIZE_H_
27
28 /*
29 * Include necessary headers...
30 */
31
32 #include "transcode.h"
33
34 # ifdef __cplusplus
35 extern "C" {
36 # endif /* __cplusplus */
37
38
39 /*
40 * Types...
41 */
42
/*
 * Normalization Types - selects the normalization form to produce.
 */
typedef enum
{
  CUPS_NORM_NFD  = 0,                   /* Canonical Decomposition */
  CUPS_NORM_NFKD = 1,                   /* Compatibility Decomposition */
  CUPS_NORM_NFC  = 2,                   /* NFD, then Canonical Composition */
  CUPS_NORM_NFKC = 3                    /* NFKD, then Canonical Composition */
} cups_normalize_t;
50
/*
 * Case Folding Types - selects how case folding is performed.
 */
typedef enum
{
  CUPS_FOLD_SIMPLE = 0,                 /* Simple - no expansion in size */
  CUPS_FOLD_FULL   = 1                  /* Full - possible expansion in size */
} cups_folding_t;
56
/*
 * Unicode Char Property Types - names the character property being
 * queried; each one refers to the type that describes its values.
 */
typedef enum
{
  CUPS_PROP_GENERAL_CATEGORY = 0,       /* Values: 'cups_gencat_t' enum */
  CUPS_PROP_BIDI_CATEGORY    = 1,       /* Values: 'cups_bidi_t' enum */
  CUPS_PROP_COMBINING_CLASS  = 2,       /* Values: '_cups_comb_class_t' type */
  CUPS_PROP_BREAK_CLASS      = 3        /* Values: 'cups_break_class_t' enum */
} cups_property_t;
64
65
66 /*
67 * Note - Use major classes for logic optimizations (by mask).
68 */
69
/*
 * Unicode General Category.
 *
 * Every subcategory value shares its upper four bits with the value of
 * its major class (e.g. 0x11..0x15 all belong to CUPS_GENCAT_L = 0x10),
 * so a major class can be tested with a mask on the upper nibble.
 */
typedef enum
{
  /* Letters */
  CUPS_GENCAT_L  = 0x10,                /* Major class: Letter */
  CUPS_GENCAT_LU = 0x11,                /* Lu - Letter, Uppercase */
  CUPS_GENCAT_LL = 0x12,                /* Ll - Letter, Lowercase */
  CUPS_GENCAT_LT = 0x13,                /* Lt - Letter, Titlecase */
  CUPS_GENCAT_LM = 0x14,                /* Lm - Letter, Modifier */
  CUPS_GENCAT_LO = 0x15,                /* Lo - Letter, Other */

  /* Marks */
  CUPS_GENCAT_M  = 0x20,                /* Major class: Mark */
  CUPS_GENCAT_MN = 0x21,                /* Mn - Mark, Non-Spacing */
  CUPS_GENCAT_MC = 0x22,                /* Mc - Mark, Spacing Combining */
  CUPS_GENCAT_ME = 0x23,                /* Me - Mark, Enclosing */

  /* Numbers */
  CUPS_GENCAT_N  = 0x30,                /* Major class: Number */
  CUPS_GENCAT_ND = 0x31,                /* Nd - Number, Decimal Digit */
  CUPS_GENCAT_NL = 0x32,                /* Nl - Number, Letter */
  CUPS_GENCAT_NO = 0x33,                /* No - Number, Other */

  /* Punctuation */
  CUPS_GENCAT_P  = 0x40,                /* Major class: Punctuation */
  CUPS_GENCAT_PC = 0x41,                /* Pc - Punctuation, Connector */
  CUPS_GENCAT_PD = 0x42,                /* Pd - Punctuation, Dash */
  CUPS_GENCAT_PS = 0x43,                /* Ps - Punctuation, Open (start) */
  CUPS_GENCAT_PE = 0x44,                /* Pe - Punctuation, Close (end) */
  CUPS_GENCAT_PI = 0x45,                /* Pi - Punctuation, Initial Quote */
  CUPS_GENCAT_PF = 0x46,                /* Pf - Punctuation, Final Quote */
  CUPS_GENCAT_PO = 0x47,                /* Po - Punctuation, Other */

  /* Symbols */
  CUPS_GENCAT_S  = 0x50,                /* Major class: Symbol */
  CUPS_GENCAT_SM = 0x51,                /* Sm - Symbol, Math */
  CUPS_GENCAT_SC = 0x52,                /* Sc - Symbol, Currency */
  CUPS_GENCAT_SK = 0x53,                /* Sk - Symbol, Modifier */
  CUPS_GENCAT_SO = 0x54,                /* So - Symbol, Other */

  /* Separators */
  CUPS_GENCAT_Z  = 0x60,                /* Major class: Separator */
  CUPS_GENCAT_ZS = 0x61,                /* Zs - Separator, Space */
  CUPS_GENCAT_ZL = 0x62,                /* Zl - Separator, Line */
  CUPS_GENCAT_ZP = 0x63,                /* Zp - Separator, Paragraph */

  /* Other (miscellaneous) */
  CUPS_GENCAT_C  = 0x70,                /* Major class: Other (miscellaneous) */
  CUPS_GENCAT_CC = 0x71,                /* Cc - Other, Control */
  CUPS_GENCAT_CF = 0x72,                /* Cf - Other, Format */
  CUPS_GENCAT_CS = 0x73,                /* Cs - Other, Surrogate */
  CUPS_GENCAT_CO = 0x74,                /* Co - Other, Private Use */
  CUPS_GENCAT_CN = 0x75                 /* Cn - Other, Not Assigned */
} cups_gencat_t;
110
/*
 * Unicode Bidi Category.
 */
typedef enum
{
  CUPS_BIDI_L   = 0,                    /* Left-to-Right (Alpha, Ideographic) */
  CUPS_BIDI_LRE = 1,                    /* Left-to-Right Embedding (explicit) */
  CUPS_BIDI_LRO = 2,                    /* Left-to-Right Override (explicit) */
  CUPS_BIDI_R   = 3,                    /* Right-to-Left (Hebrew alpha/punct) */
  CUPS_BIDI_AL  = 4,                    /* Right-to-Left Arabic (Arabic, etc) */
  CUPS_BIDI_RLE = 5,                    /* Right-to-Left Embedding (explicit) */
  CUPS_BIDI_RLO = 6,                    /* Right-to-Left Override (explicit) */
  CUPS_BIDI_PDF = 7,                    /* Pop Directional Format */
  CUPS_BIDI_EN  = 8,                    /* Euro Number (Euro & Indic digits) */
  CUPS_BIDI_ES  = 9,                    /* Euro Number Separator (Slash) */
  CUPS_BIDI_ET  = 10,                   /* Euro Number Terminator */
  CUPS_BIDI_AN  = 11,                   /* Arabic Number (digits, separators) */
  CUPS_BIDI_CS  = 12,                   /* Common Number Separator */
  CUPS_BIDI_NSM = 13,                   /* Non-Spacing Mark (Mn/Me in UCD) */
  CUPS_BIDI_BN  = 14,                   /* Boundary Neutral (formatting, etc) */
  CUPS_BIDI_B   = 15,                   /* Paragraph Separator */
  CUPS_BIDI_S   = 16,                   /* Segment Separator (Tab) */
  CUPS_BIDI_WS  = 17,                   /* Whitespace Space (Space, etc) */
  CUPS_BIDI_ON  = 18                    /* Other Neutrals */
} cups_bidi_t;
133
134 /*
135 * Note - add state table from UAX-14, section 7.3.
136 * Remember to do BK and SP in outer loop (not in state table).
137 * Consider optimization for CM (combining mark).
138 * See 'LineBreak.txt' (12,875) and 'DerivedLineBreak.txt' (1,350).
139 */
140
/*
 * Unicode Line Break Class.
 *
 * Legend for the markers used in the per-value comments:
 *
 *   (A)  - Allow Break AFTER
 *   (XA) - Prevent Break AFTER
 *   (B)  - Allow Break BEFORE
 *   (XB) - Prevent Break BEFORE
 *   (P)  - Allow Break For Pair
 *   (XP) - Prevent Break For Pair
 */
typedef enum
{
  CUPS_BREAK_AI = 0,                    /* Ambiguous Alphabetic or Ideograph */
  CUPS_BREAK_AL = 1,                    /* Ordinary Alpha/Symbol Chars (XP) */
  CUPS_BREAK_BA = 2,                    /* Break Opportunity After Chars (A) */
  CUPS_BREAK_BB = 3,                    /* Break Opportunity Before Chars (B) */
  CUPS_BREAK_B2 = 4,                    /* Break Opportunity Either (B/A/XP) */
  CUPS_BREAK_BK = 5,                    /* Mandatory Break (A) (norm) */
  CUPS_BREAK_CB = 6,                    /* Contingent Break (B/A) (norm) */
  CUPS_BREAK_CL = 7,                    /* Closing Punctuation (XB) */
  CUPS_BREAK_CM = 8,                    /* Attached/Combining (XB) (norm) */
  CUPS_BREAK_CR = 9,                    /* Carriage Return (A) (norm) */
  CUPS_BREAK_EX = 10,                   /* Exclamation/Interrogation (XB) */
  CUPS_BREAK_GL = 11,                   /* Non-breaking "Glue" (XB/XA) (norm) */
  CUPS_BREAK_HY = 12,                   /* Hyphen (XA) */
  CUPS_BREAK_ID = 13,                   /* Ideographic (B/A) */
  CUPS_BREAK_IN = 14,                   /* Inseparable chars (XP) */
  CUPS_BREAK_IS = 15,                   /* Numeric Separator (Infix) (XB) */
  CUPS_BREAK_LF = 16,                   /* Line Feed (A) (norm) */
  CUPS_BREAK_NS = 17,                   /* Non-starters (XB) */
  CUPS_BREAK_NU = 18,                   /* Numeric (XP) */
  CUPS_BREAK_OP = 19,                   /* Opening Punctuation (XA) */
  CUPS_BREAK_PO = 20,                   /* Postfix (Numeric) (XB) */
  CUPS_BREAK_PR = 21,                   /* Prefix (Numeric) (XA) */
  CUPS_BREAK_QU = 22,                   /* Ambiguous Quotation (XB/XA) */
  CUPS_BREAK_SA = 23,                   /* Context Dependent (SE Asian) (P) */
  CUPS_BREAK_SG = 24,                   /* Surrogates (XP) (norm) */
  CUPS_BREAK_SP = 25,                   /* Space (A) (norm) */
  CUPS_BREAK_SY = 26,                   /* Symbols Allowing Break After (A) */
  CUPS_BREAK_XX = 27,                   /* Unknown (XP) */
  CUPS_BREAK_ZW = 28                    /* Zero Width Space (A) (norm) */
} cups_break_class_t;
181
/*
 * Unicode Combining Class:
 *
 *   0      = base character
 *   1..254 = combining character
 */
typedef int _cups_comb_class_t;
184
185 /*
186 * Structures...
187 */
188
189 typedef struct _cups_normmap_s /**** Normalize Map Cache Struct ****/
190 {
191 struct _cups_normmap_s *next; /* Next normalize in cache */
192 int used; /* Number of times entry used */
193 cups_normalize_t normalize; /* Normalization type */
194 int normcount; /* Count of Source Chars */
195 cups_ucs2_t *uni2norm; /* Char -> Normalization */
196 /* ...only supports UCS-2 */
197 } _cups_norm_map_t;
198
199 typedef struct _cups_foldmap_s /**** Case Fold Map Cache Struct ****/
200 {
201 struct _cups_foldmap_s *next; /* Next case fold in cache */
202 int used; /* Number of times entry used */
203 cups_folding_t fold; /* Case folding type */
204 int foldcount; /* Count of Source Chars */
205 cups_ucs2_t *uni2fold; /* Char -> Folded Char(s) */
206 /* ...only supports UCS-2 */
207 } _cups_fold_map_t;
208
209 typedef struct _cups_prop_s /**** Char Property Struct ****/
210 {
211 cups_ucs2_t ch; /* Unicode Char as UCS-2 */
212 unsigned char gencat; /* General Category */
213 unsigned char bidicat; /* Bidirectional Category */
214 } _cups_prop_t;
215
216 typedef struct _cups_prop_map_s /**** Char Property Map Struct ****/
217 {
218 int used; /* Number of times entry used */
219 int propcount; /* Count of Source Chars */
220 _cups_prop_t *uni2prop; /* Char -> Properties */
221 } _cups_prop_map_t;
222
223 typedef struct _cups_break_map_s /**** Line Break Class Map Struct ****/
224 {
225 int used; /* Number of times entry used */
226 int breakcount; /* Count of Source Chars */
227 cups_ucs2_t *uni2break; /* Char -> Line Break Class */
228 } _cups_break_map_t;
229
230 typedef struct _cups_comb_s /**** Char Combining Class Struct ****/
231 {
232 cups_ucs2_t ch; /* Unicode Char as UCS-2 */
233 unsigned char combclass; /* Combining Class */
234 unsigned char reserved; /* Reserved for alignment */
235 } _cups_comb_t;
236
237 typedef struct _cups_comb_map_s /**** Combining Class Map Struct ****/
238 {
239 int used; /* Number of times entry used */
240 int combcount; /* Count of Source Chars */
241 _cups_comb_t *uni2comb; /* Char -> Combining Class */
242 } _cups_comb_map_t;
243
244 /*
245 * Prototypes...
246 */
247
/*
 * Utility functions for the normalization module.
 *
 * NOTE(review): judging by the names these presumably load, release and
 * flush the module's maps; the exact behavior and the meaning of the
 * int return values are not visible in this header - confirm against
 * the implementation.
 */

extern int
cupsNormalizeMapsGet(void);

extern int
cupsNormalizeMapsFree(void);

extern void
cupsNormalizeMapsFlush(void);
254
255 /*
256 * Normalize UTF-8 string to Unicode UAX-15 Normalization Form
257 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
258 * unsafe for subsequent transcoding to legacy charsets
259 */
260 extern int cupsUTF8Normalize(cups_utf8_t *dest,
261 const cups_utf8_t *src,
262 const int maxout,
263 const cups_normalize_t normalize);
264
265 /*
266 * Normalize UTF-32 string to Unicode UAX-15 Normalization Form
267 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
268 * unsafe for subsequent transcoding to legacy charsets
269 */
270 extern int cupsUTF32Normalize(cups_utf32_t *dest,
271 const cups_utf32_t *src,
272 const int maxout,
273 const cups_normalize_t normalize);
274
275 /*
276 * Case Fold UTF-8 string per Unicode UAX-21 Section 2.3
277 * Note - Case folding output is
278 * unsafe for subsequent transcoding to legacy charsets
279 */
280 extern int cupsUTF8CaseFold(cups_utf8_t *dest,
281 const cups_utf8_t *src,
282 const int maxout,
283 const cups_folding_t fold);
284
285 /*
286 * Case Fold UTF-32 string per Unicode UAX-21 Section 2.3
287 * Note - Case folding output is
288 * unsafe for subsequent transcoding to legacy charsets
289 */
290 extern int cupsUTF32CaseFold(cups_utf32_t *dest,
291 const cups_utf32_t *src,
292 const int maxout,
293 const cups_folding_t fold);
294
295 /*
296 * Compare UTF-8 strings after case folding
297 */
298 extern int cupsUTF8CompareCaseless(const cups_utf8_t *s1,
299 const cups_utf8_t *s2);
300
301 /*
302 * Compare UTF-32 strings after case folding
303 */
304 extern int cupsUTF32CompareCaseless(const cups_utf32_t *s1,
305 const cups_utf32_t *s2);
306
307 /*
308 * Compare UTF-8 strings after case folding and NFKC normalization
309 */
310 extern int cupsUTF8CompareIdentifier(const cups_utf8_t *s1,
311 const cups_utf8_t *s2);
312
313 /*
314 * Compare UTF-32 strings after case folding and NFKC normalization
315 */
316 extern int cupsUTF32CompareIdentifier(const cups_utf32_t *s1,
317 const cups_utf32_t *s2);
318
319 /*
320 * Get UTF-32 character property
321 */
322 extern int cupsUTF32CharacterProperty(const cups_utf32_t ch,
323 const cups_property_t prop);
324
325 # ifdef __cplusplus
326 }
327 # endif /* __cplusplus */
328
329 #endif /* !_CUPS_NORMALIZE_H_ */
330
331
332 /*
333 * End of "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $"
334 */