/*
 * "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $"
 *
 *   Unicode normalization for the Common UNIX Printing System (CUPS).
 *
 *   Copyright 1997-2005 by Easy Software Products.
 *
 *   These coded instructions, statements, and computer programs are
 *   the property of Easy Software Products and are protected by Federal
 *   copyright law.  Distribution and use rights are outlined in the
 *   file "LICENSE.txt" which should have been included with this file.
 *   If this file is missing or damaged please contact Easy Software
 *   Products at:
 *
 *       Attn: CUPS Licensing Information
 *       Easy Software Products
 *       44141 Airport View Drive, Suite 204
 *       Hollywood, Maryland 20636 USA
 *
 *       Voice: (301) 373-9600
 *       EMail: cups-info@cups.org
 *         WWW: http://www.cups.org
 */
25 #ifndef _CUPS_NORMALIZE_H_
26 # define _CUPS_NORMALIZE_H_
/*
 * Include necessary headers...
 */
32 #include "transcode.h"
36 # endif /* __cplusplus */
/*
 * Unicode normalization forms.  Values of this type are passed as the
 * 'normalize' argument of the cups*Normalize() prototypes declared in this
 * header and stored in the normalize map cache struct.
 *
 * Enumerators carry no explicit initializers, so they take the values 0..3
 * in declaration order.
 */

typedef enum                            /**** Normalization Types ****/
{
  CUPS_NORM_NFD,                        /* Canonical Decomposition */
  CUPS_NORM_NFKD,                       /* Compatibility Decomposition */
  CUPS_NORM_NFC,                        /* NFD, then Canonical Composition */
  CUPS_NORM_NFKC                        /* NFKD, then Canonical Composition */
} cups_normalize_t;
/*
 * Case folding modes.  Values of this type are passed as the 'fold'
 * argument of the cups*CaseFold() prototypes declared in this header.
 *
 * Enumerators take the implicit values 0 and 1 in declaration order.
 */

typedef enum                            /**** Case Folding Types ****/
{
  CUPS_FOLD_SIMPLE,                     /* Simple - no expansion in size */
  CUPS_FOLD_FULL                        /* Full - possible expansion in size */
} cups_folding_t;
/*
 * Selector for the kind of Unicode character property being requested;
 * used as the 'prop' argument of cupsUTF32CharacterProperty().  Each
 * enumerator's comment names the type that describes the property's values.
 *
 * Enumerators take the implicit values 0..3 in declaration order.
 */

typedef enum                            /**** Unicode Char Property Types ****/
{
  CUPS_PROP_GENERAL_CATEGORY,           /* See 'cups_gencat_t' enum */
  CUPS_PROP_BIDI_CATEGORY,              /* See 'cups_bidi_t' enum */
  CUPS_PROP_COMBINING_CLASS,            /* See '_cups_comb_class_t' type */
  CUPS_PROP_BREAK_CLASS                 /* See 'cups_break_class_t' enum */
} cups_property_t;
/*
 * Note - Use major classes for logic optimizations (by mask).
 *
 * Every minor category shares its upper nibble with its major class, so
 * (value & 0xF0) yields the corresponding major class enumerator
 * (for example, CUPS_GENCAT_LU & 0xF0 == CUPS_GENCAT_L).
 */

typedef enum                            /**** Unicode General Category ****/
{
  CUPS_GENCAT_L  = 0x10,                /* Letter major class */
  CUPS_GENCAT_LU = 0x11,                /* Lu Letter, Uppercase */
  CUPS_GENCAT_LL = 0x12,                /* Ll Letter, Lowercase */
  CUPS_GENCAT_LT = 0x13,                /* Lt Letter, Titlecase */
  CUPS_GENCAT_LM = 0x14,                /* Lm Letter, Modifier */
  CUPS_GENCAT_LO = 0x15,                /* Lo Letter, Other */
  CUPS_GENCAT_M  = 0x20,                /* Mark major class */
  CUPS_GENCAT_MN = 0x21,                /* Mn Mark, Non-Spacing */
  CUPS_GENCAT_MC = 0x22,                /* Mc Mark, Spacing Combining */
  CUPS_GENCAT_ME = 0x23,                /* Me Mark, Enclosing */
  CUPS_GENCAT_N  = 0x30,                /* Number major class */
  CUPS_GENCAT_ND = 0x31,                /* Nd Number, Decimal Digit */
  CUPS_GENCAT_NL = 0x32,                /* Nl Number, Letter */
  CUPS_GENCAT_NO = 0x33,                /* No Number, Other */
  CUPS_GENCAT_P  = 0x40,                /* Punctuation major class */
  CUPS_GENCAT_PC = 0x41,                /* Pc Punctuation, Connector */
  CUPS_GENCAT_PD = 0x42,                /* Pd Punctuation, Dash */
  CUPS_GENCAT_PS = 0x43,                /* Ps Punctuation, Open (start) */
  CUPS_GENCAT_PE = 0x44,                /* Pe Punctuation, Close (end) */
  CUPS_GENCAT_PI = 0x45,                /* Pi Punctuation, Initial Quote */
  CUPS_GENCAT_PF = 0x46,                /* Pf Punctuation, Final Quote */
  CUPS_GENCAT_PO = 0x47,                /* Po Punctuation, Other */
  CUPS_GENCAT_S  = 0x50,                /* Symbol major class */
  CUPS_GENCAT_SM = 0x51,                /* Sm Symbol, Math */
  CUPS_GENCAT_SC = 0x52,                /* Sc Symbol, Currency */
  CUPS_GENCAT_SK = 0x53,                /* Sk Symbol, Modifier */
  CUPS_GENCAT_SO = 0x54,                /* So Symbol, Other */
  CUPS_GENCAT_Z  = 0x60,                /* Separator major class */
  CUPS_GENCAT_ZS = 0x61,                /* Zs Separator, Space */
  CUPS_GENCAT_ZL = 0x62,                /* Zl Separator, Line */
  CUPS_GENCAT_ZP = 0x63,                /* Zp Separator, Paragraph */
  CUPS_GENCAT_C  = 0x70,                /* Other (miscellaneous) major class */
  CUPS_GENCAT_CC = 0x71,                /* Cc Other, Control */
  CUPS_GENCAT_CF = 0x72,                /* Cf Other, Format */
  CUPS_GENCAT_CS = 0x73,                /* Cs Other, Surrogate */
  CUPS_GENCAT_CO = 0x74,                /* Co Other, Private Use */
  CUPS_GENCAT_CN = 0x75                 /* Cn Other, Not Assigned */
} cups_gencat_t;
/*
 * Unicode bidirectional categories.  Enumerators carry no explicit
 * initializers, so they take the values 0..18 in declaration order.
 */

typedef enum                            /**** Unicode Bidi Category ****/
{
  CUPS_BIDI_L,                          /* Left-to-Right (Alpha, Ideographic) */
  CUPS_BIDI_LRE,                        /* Left-to-Right Embedding (explicit) */
  CUPS_BIDI_LRO,                        /* Left-to-Right Override (explicit) */
  CUPS_BIDI_R,                          /* Right-to-Left (Hebrew alpha/punct) */
  CUPS_BIDI_AL,                         /* Right-to-Left Arabic (Arabic, etc) */
  CUPS_BIDI_RLE,                        /* Right-to-Left Embedding (explicit) */
  CUPS_BIDI_RLO,                        /* Right-to-Left Override (explicit) */
  CUPS_BIDI_PDF,                        /* Pop Directional Format */
  CUPS_BIDI_EN,                         /* Euro Number (Euro & Indic digits) */
  CUPS_BIDI_ES,                         /* Euro Number Separator (Slash) */
  CUPS_BIDI_ET,                         /* Euro Number Terminator */
  CUPS_BIDI_AN,                         /* Arabic Number (digits, separators) */
  CUPS_BIDI_CS,                         /* Common Number Separator */
  CUPS_BIDI_NSM,                        /* Non-Spacing Mark (Mn/Me in UCD) */
  CUPS_BIDI_BN,                         /* Boundary Neutral (formatting, etc) */
  CUPS_BIDI_B,                          /* Paragraph Separator */
  CUPS_BIDI_S,                          /* Segment Separator (Tab) */
  CUPS_BIDI_WS,                         /* Whitespace Space (Space, etc) */
  CUPS_BIDI_ON                          /* Other Neutrals */
} cups_bidi_t;
/*
 * Note - add state table from UAX-14, section 7.3.
 *        Remember to do BK and SP in outer loop (not in state table).
 *        Consider optimization for CM (combining mark).
 *        See 'LineBreak.txt' (12,875) and 'DerivedLineBreak.txt' (1,350).
 */

typedef enum                            /**** Unicode Line Break Class ****/
{
 /*
  * Legend for the markers used in the enumerator comments below:
  *
  * (A)  - Allow Break AFTER
  * (XA) - Prevent Break AFTER
  * (B)  - Allow Break BEFORE
  * (XB) - Prevent Break BEFORE
  * (P)  - Allow Break For Pair
  * (XP) - Prevent Break For Pair
  *
  * Enumerators take the implicit values 0..28 in declaration order.
  */

  CUPS_BREAK_AI,                        /* Ambiguous Alphabetic or Ideograph */
  CUPS_BREAK_AL,                        /* Ordinary Alpha/Symbol Chars (XP) */
  CUPS_BREAK_BA,                        /* Break Opportunity After Chars (A) */
  CUPS_BREAK_BB,                        /* Break Opportunity Before Chars (B) */
  CUPS_BREAK_B2,                        /* Break Opportunity Either (B/A/XP) */
  CUPS_BREAK_BK,                        /* Mandatory Break (A) (norm) */
  CUPS_BREAK_CB,                        /* Contingent Break (B/A) (norm) */
  CUPS_BREAK_CL,                        /* Closing Punctuation (XB) */
  CUPS_BREAK_CM,                        /* Attached/Combining (XB) (norm) */
  CUPS_BREAK_CR,                        /* Carriage Return (A) (norm) */
  CUPS_BREAK_EX,                        /* Exclamation/Interrogation (XB) */
  CUPS_BREAK_GL,                        /* Non-breaking "Glue" (XB/XA) (norm) */
  CUPS_BREAK_HY,                        /* Hyphen (XA) */
  CUPS_BREAK_ID,                        /* Ideographic (B/A) */
  CUPS_BREAK_IN,                        /* Inseparable chars (XP) */
  CUPS_BREAK_IS,                        /* Numeric Separator (Infix) (XB) */
  CUPS_BREAK_LF,                        /* Line Feed (A) (norm) */
  CUPS_BREAK_NS,                        /* Non-starters (XB) */
  CUPS_BREAK_NU,                        /* Numeric (XP) */
  CUPS_BREAK_OP,                        /* Opening Punctuation (XA) */
  CUPS_BREAK_PO,                        /* Postfix (Numeric) (XB) */
  CUPS_BREAK_PR,                        /* Prefix (Numeric) (XA) */
  CUPS_BREAK_QU,                        /* Ambiguous Quotation (XB/XA) */
  CUPS_BREAK_SA,                        /* Context Dependent (SE Asian) (P) */
  CUPS_BREAK_SG,                        /* Surrogates (XP) (norm) */
  CUPS_BREAK_SP,                        /* Space (A) (norm) */
  CUPS_BREAK_SY,                        /* Symbols Allowing Break After (A) */
  CUPS_BREAK_XX,                        /* Unknown (XP) */
  CUPS_BREAK_ZW                         /* Zero Width Space (A) (norm) */
} cups_break_class_t;
/*
 * Integer type holding a Unicode combining class value.
 */

typedef int _cups_comb_class_t;         /**** Unicode Combining Class ****/
                                        /* 0=base, 1..254=combining char */
189 typedef struct _cups_normmap_s
/**** Normalize Map Cache Struct ****/
191 struct _cups_normmap_s
*next
; /* Next normalize in cache */
192 int used
; /* Number of times entry used */
193 cups_normalize_t normalize
; /* Normalization type */
194 int normcount
; /* Count of Source Chars */
195 cups_ucs2_t
*uni2norm
; /* Char -> Normalization */
196 /* ...only supports UCS-2 */
199 typedef struct _cups_foldmap_s
/**** Case Fold Map Cache Struct ****/
201 struct _cups_foldmap_s
*next
; /* Next case fold in cache */
202 int used
; /* Number of times entry used */
203 cups_folding_t fold
; /* Case folding type */
204 int foldcount
; /* Count of Source Chars */
205 cups_ucs2_t
*uni2fold
; /* Char -> Folded Char(s) */
206 /* ...only supports UCS-2 */
209 typedef struct _cups_prop_s
/**** Char Property Struct ****/
211 cups_ucs2_t ch
; /* Unicode Char as UCS-2 */
212 unsigned char gencat
; /* General Category */
213 unsigned char bidicat
; /* Bidirectional Category */
216 typedef struct _cups_prop_map_s
/**** Char Property Map Struct ****/
218 int used
; /* Number of times entry used */
219 int propcount
; /* Count of Source Chars */
220 _cups_prop_t
*uni2prop
; /* Char -> Properties */
223 typedef struct _cups_break_map_s
/**** Line Break Class Map Struct ****/
225 int used
; /* Number of times entry used */
226 int breakcount
; /* Count of Source Chars */
227 cups_ucs2_t
*uni2break
; /* Char -> Line Break Class */
230 typedef struct _cups_comb_s
/**** Char Combining Class Struct ****/
232 cups_ucs2_t ch
; /* Unicode Char as UCS-2 */
233 unsigned char combclass
; /* Combining Class */
234 unsigned char reserved
; /* Reserved for alignment */
237 typedef struct _cups_comb_map_s
/**** Combining Class Map Struct ****/
239 int used
; /* Number of times entry used */
240 int combcount
; /* Count of Source Chars */
241 _cups_comb_t
*uni2comb
; /* Char -> Combining Class */
/*
 * Utility functions for normalization module
 *
 * All three take no arguments.  The Get and Free variants return an int
 * whose meaning is not documented in this header; Flush returns nothing.
 */

extern int      cupsNormalizeMapsGet(void);
extern int      cupsNormalizeMapsFree(void);
extern void     cupsNormalizeMapsFlush(void);
256 * Normalize UTF-8 string to Unicode UAX-15 Normalization Form
257 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
258 * unsafe for subsequent transcoding to legacy charsets
260 extern int cupsUTF8Normalize(cups_utf8_t
*dest
,
261 const cups_utf8_t
*src
,
263 const cups_normalize_t normalize
);
266 * Normalize UTF-32 string to Unicode UAX-15 Normalization Form
267 * Note - Compatibility Normalization Forms (NFKD/NFKC) are
268 * unsafe for subsequent transcoding to legacy charsets
270 extern int cupsUTF32Normalize(cups_utf32_t
*dest
,
271 const cups_utf32_t
*src
,
273 const cups_normalize_t normalize
);
276 * Case Fold UTF-8 string per Unicode UAX-21 Section 2.3
277 * Note - Case folding output is
278 * unsafe for subsequent transcoding to legacy charsets
280 extern int cupsUTF8CaseFold(cups_utf8_t
*dest
,
281 const cups_utf8_t
*src
,
283 const cups_folding_t fold
);
286 * Case Fold UTF-32 string per Unicode UAX-21 Section 2.3
287 * Note - Case folding output is
288 * unsafe for subsequent transcoding to legacy charsets
290 extern int cupsUTF32CaseFold(cups_utf32_t
*dest
,
291 const cups_utf32_t
*src
,
293 const cups_folding_t fold
);
296 * Compare UTF-8 strings after case folding
298 extern int cupsUTF8CompareCaseless(const cups_utf8_t
*s1
,
299 const cups_utf8_t
*s2
);
302 * Compare UTF-32 strings after case folding
304 extern int cupsUTF32CompareCaseless(const cups_utf32_t
*s1
,
305 const cups_utf32_t
*s2
);
308 * Compare UTF-8 strings after case folding and NFKC normalization
310 extern int cupsUTF8CompareIdentifier(const cups_utf8_t
*s1
,
311 const cups_utf8_t
*s2
);
314 * Compare UTF-32 strings after case folding and NFKC normalization
316 extern int cupsUTF32CompareIdentifier(const cups_utf32_t
*s1
,
317 const cups_utf32_t
*s2
);
320 * Get UTF-32 character property
322 extern int cupsUTF32CharacterProperty(const cups_utf32_t ch
,
323 const cups_property_t prop
);
327 # endif /* __cplusplus */
329 #endif /* !_CUPS_NORMALIZE_H_ */
/*
 * End of "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $"
 */