cups/normalize.h

   1 /*
   2  * "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $"
   3  *
   4  *   Unicode normalization for the Common UNIX Printing System (CUPS).
   5  *
   6  *   Copyright 1997-2005 by Easy Software Products.
   7  *
   8  *   These coded instructions, statements, and computer programs are
   9  *   the property of Easy Software Products and are protected by Federal
  10  *   copyright law.  Distribution and use rights are outlined in the
  11  *   file "LICENSE.txt" which should have been included with this file.
  12  *   If this file is missing or damaged please contact Easy Software
  13  *   Products at:
  14  *
  15  *       Attn: CUPS Licensing Information
  16  *       Easy Software Products
  17  *       44141 Airport View Drive, Suite 204
  18  *       Hollywood, Maryland 20636 USA
  19  *
  20  *       Voice: (301) 373-9600
  21  *       EMail: cups-info@cups.org
  22  *         WWW: http://www.cups.org
  23  */
  24
  25 #ifndef _CUPS_NORMALIZE_H_
  26 #  define _CUPS_NORMALIZE_H_
  27
  28 /*
  29  * Include necessary headers...
  30  */
  31
  32 #include "transcode.h"
  33
  34 #  ifdef __cplusplus
  35 extern "C" {
  36 #  endif /* __cplusplus */
  37
  38
  39 /*
  40  * Types...
  41  */
  42
  43 typedef enum                    /**** Normalization Types ****/
  44 {
  45   CUPS_NORM_NFD,                /* Canonical Decomposition */
  46   CUPS_NORM_NFKD,               /* Compatibility Decomposition */
  47   CUPS_NORM_NFC,                /* NFD, then Canonical Composition */
  48   CUPS_NORM_NFKC                /* NFKD, then Canonical Composition */
  49 } cups_normalize_t;
  50
  51 typedef enum                    /**** Case Folding Types ****/
  52 {
  53   CUPS_FOLD_SIMPLE,             /* Simple - output never expands in size */
  54   CUPS_FOLD_FULL                /* Full - output may expand in size */
  55 } cups_folding_t;
  56
  57 typedef enum                    /**** Unicode Char Property Types ****/
  58 {
  59   CUPS_PROP_GENERAL_CATEGORY,   /* General category - see 'cups_gencat_t' */
  60   CUPS_PROP_BIDI_CATEGORY,      /* Bidi category - see 'cups_bidi_t' */
  61   CUPS_PROP_COMBINING_CLASS,    /* Combining class - see '_cups_comb_class_t' */
  62   CUPS_PROP_BREAK_CLASS         /* Line break class - see 'cups_break_class_t' */
  63 } cups_property_t;
  64
  65
  66 /*
  67  * Note - Major class is the high nibble; test class membership by mask (0xF0).
  68  */
  69
  70 typedef enum                    /**** Unicode General Category ****/
  71 {
  72   CUPS_GENCAT_L  = 0x10,        /* Letter major class */
  73   CUPS_GENCAT_LU = 0x11,        /* Lu Letter, Uppercase */
  74   CUPS_GENCAT_LL = 0x12,        /* Ll Letter, Lowercase */
  75   CUPS_GENCAT_LT = 0x13,        /* Lt Letter, Titlecase */
  76   CUPS_GENCAT_LM = 0x14,        /* Lm Letter, Modifier */
  77   CUPS_GENCAT_LO = 0x15,        /* Lo Letter, Other */
  78   CUPS_GENCAT_M  = 0x20,        /* Mark major class */
  79   CUPS_GENCAT_MN = 0x21,        /* Mn Mark, Non-Spacing */
  80   CUPS_GENCAT_MC = 0x22,        /* Mc Mark, Spacing Combining */
  81   CUPS_GENCAT_ME = 0x23,        /* Me Mark, Enclosing */
  82   CUPS_GENCAT_N  = 0x30,        /* Number major class */
  83   CUPS_GENCAT_ND = 0x31,        /* Nd Number, Decimal Digit */
  84   CUPS_GENCAT_NL = 0x32,        /* Nl Number, Letter */
  85   CUPS_GENCAT_NO = 0x33,        /* No Number, Other */
  86   CUPS_GENCAT_P  = 0x40,        /* Punctuation major class */
  87   CUPS_GENCAT_PC = 0x41,        /* Pc Punctuation, Connector */
  88   CUPS_GENCAT_PD = 0x42,        /* Pd Punctuation, Dash */
  89   CUPS_GENCAT_PS = 0x43,        /* Ps Punctuation, Open (start) */
  90   CUPS_GENCAT_PE = 0x44,        /* Pe Punctuation, Close (end) */
  91   CUPS_GENCAT_PI = 0x45,        /* Pi Punctuation, Initial Quote */
  92   CUPS_GENCAT_PF = 0x46,        /* Pf Punctuation, Final Quote */
  93   CUPS_GENCAT_PO = 0x47,        /* Po Punctuation, Other */
  94   CUPS_GENCAT_S  = 0x50,        /* Symbol major class */
  95   CUPS_GENCAT_SM = 0x51,        /* Sm Symbol, Math */
  96   CUPS_GENCAT_SC = 0x52,        /* Sc Symbol, Currency */
  97   CUPS_GENCAT_SK = 0x53,        /* Sk Symbol, Modifier */
  98   CUPS_GENCAT_SO = 0x54,        /* So Symbol, Other */
  99   CUPS_GENCAT_Z  = 0x60,        /* Separator major class */
 100   CUPS_GENCAT_ZS = 0x61,        /* Zs Separator, Space */
 101   CUPS_GENCAT_ZL = 0x62,        /* Zl Separator, Line */
 102   CUPS_GENCAT_ZP = 0x63,        /* Zp Separator, Paragraph */
 103   CUPS_GENCAT_C  = 0x70,        /* Other (miscellaneous) major class */
 104   CUPS_GENCAT_CC = 0x71,        /* Cc Other, Control */
 105   CUPS_GENCAT_CF = 0x72,        /* Cf Other, Format */
 106   CUPS_GENCAT_CS = 0x73,        /* Cs Other, Surrogate */
 107   CUPS_GENCAT_CO = 0x74,        /* Co Other, Private Use */
 108   CUPS_GENCAT_CN = 0x75         /* Cn Other, Not Assigned */
 109 } cups_gencat_t;
 110
 111 typedef enum                    /**** Unicode Bidi Category ****/
 112 {
 113   CUPS_BIDI_L,                  /* Left-to-Right (Alpha, Ideographic) */
 114   CUPS_BIDI_LRE,                /* Left-to-Right Embedding (explicit) */
 115   CUPS_BIDI_LRO,                /* Left-to-Right Override (explicit) */
 116   CUPS_BIDI_R,                  /* Right-to-Left (Hebrew alpha/punct) */
 117   CUPS_BIDI_AL,                 /* Right-to-Left Arabic (Arabic, etc) */
 118   CUPS_BIDI_RLE,                /* Right-to-Left Embedding (explicit) */
 119   CUPS_BIDI_RLO,                /* Right-to-Left Override (explicit) */
 120   CUPS_BIDI_PDF,                /* Pop Directional Format */
 121   CUPS_BIDI_EN,                 /* European Number (Euro & Indic digits) */
 122   CUPS_BIDI_ES,                 /* European Number Separator (Slash) */
 123   CUPS_BIDI_ET,                 /* European Number Terminator */
 124   CUPS_BIDI_AN,                 /* Arabic Number (digits, separators) */
 125   CUPS_BIDI_CS,                 /* Common Number Separator */
 126   CUPS_BIDI_NSM,                /* Non-Spacing Mark (Mn/Me in UCD) */
 127   CUPS_BIDI_BN,                 /* Boundary Neutral (formatting, etc) */
 128   CUPS_BIDI_B,                  /* Paragraph Separator */
 129   CUPS_BIDI_S,                  /* Segment Separator (Tab) */
 130   CUPS_BIDI_WS,                 /* Whitespace (Space, etc) */
 131   CUPS_BIDI_ON                  /* Other Neutrals */
 132 } cups_bidi_t;
 133
 134 /*
 135  * TODO - add state table from UAX-14, section 7.3.
 136  * Remember to handle BK and SP in the outer loop (not in the state table).
 137  * Consider optimization for CM (combining mark).
 138  * See 'LineBreak.txt' (12,875) and 'DerivedLineBreak.txt' (1,350).
 139  */
 140
 141 typedef enum                    /**** Unicode Line Break Class ****/
 142 {
 143  /* Action codes used below; '(norm)' presumably = normative class (TODO confirm)
 144   * (A) - Allow Break AFTER
 145   * (XA) - Prevent Break AFTER
 146   * (B) - Allow Break BEFORE
 147   * (XB) - Prevent Break BEFORE
 148   * (P) - Allow Break For Pair
 149   * (XP) - Prevent Break For Pair
 150   */
 151   CUPS_BREAK_AI,                /* Ambiguous Alphabetic or Ideograph */
 152   CUPS_BREAK_AL,                /* Ordinary Alpha/Symbol Chars (XP) */
 153   CUPS_BREAK_BA,                /* Break Opportunity After Chars (A) */
 154   CUPS_BREAK_BB,                /* Break Opportunity Before Chars (B) */
 155   CUPS_BREAK_B2,                /* Break Opportunity Either (B/A/XP) */
 156   CUPS_BREAK_BK,                /* Mandatory Break (A) (norm) */
 157   CUPS_BREAK_CB,                /* Contingent Break (B/A) (norm) */
 158   CUPS_BREAK_CL,                /* Closing Punctuation (XB) */
 159   CUPS_BREAK_CM,                /* Attached/Combining (XB) (norm) */
 160   CUPS_BREAK_CR,                /* Carriage Return (A) (norm) */
 161   CUPS_BREAK_EX,                /* Exclamation/Interrogation (XB) */
 162   CUPS_BREAK_GL,                /* Non-breaking "Glue" (XB/XA) (norm) */
 163   CUPS_BREAK_HY,                /* Hyphen (XA) */
 164   CUPS_BREAK_ID,                /* Ideographic (B/A) */
 165   CUPS_BREAK_IN,                /* Inseparable chars (XP) */
 166   CUPS_BREAK_IS,                /* Numeric Separator (Infix) (XB) */
 167   CUPS_BREAK_LF,                /* Line Feed (A) (norm) */
 168   CUPS_BREAK_NS,                /* Non-starters (XB) */
 169   CUPS_BREAK_NU,                /* Numeric (XP) */
 170   CUPS_BREAK_OP,                /* Opening Punctuation (XA) */
 171   CUPS_BREAK_PO,                /* Postfix (Numeric) (XB) */
 172   CUPS_BREAK_PR,                /* Prefix (Numeric) (XA) */
 173   CUPS_BREAK_QU,                /* Ambiguous Quotation (XB/XA) */
 174   CUPS_BREAK_SA,                /* Context Dependent (SE Asian) (P) */
 175   CUPS_BREAK_SG,                /* Surrogates (XP) (norm) */
 176   CUPS_BREAK_SP,                /* Space (A) (norm) */
 177   CUPS_BREAK_SY,                /* Symbols Allowing Break After (A) */
 178   CUPS_BREAK_XX,                /* Unknown (XP) */
 179   CUPS_BREAK_ZW                 /* Zero Width Space (A) (norm) */
 180 } cups_break_class_t;
 181
 182 typedef int _cups_comb_class_t;   /**** Unicode Combining Class ****/
 183                                 /* 0=base char, 1..254=combining char */
 184
 185 /*
 186  * Structures...
 187  */
 188
 189 typedef struct _cups_normmap_s          /**** Normalize Map Cache Struct ****/
 190 {
 191   struct _cups_normmap_s *next;        /* Next normalize map in cache */
 192   int                   used;           /* Number of times entry used */
 193   cups_normalize_t      normalize;      /* Normalization type (NFD/NFKD/NFC/NFKC) */
 194   int                   normcount;      /* Count of Source Chars */
 195   cups_ucs2_t           *uni2norm;      /* Char -> Normalization */
 196                                         /* ...only supports UCS-2 */
 197 } _cups_norm_map_t;
 198
 199 typedef struct _cups_foldmap_s          /**** Case Fold Map Cache Struct ****/
 200 {
 201   struct _cups_foldmap_s *next;        /* Next case fold map in cache */
 202   int                   used;           /* Number of times entry used */
 203   cups_folding_t        fold;           /* Case folding type (simple or full) */
 204   int                   foldcount;      /* Count of Source Chars */
 205   cups_ucs2_t           *uni2fold;      /* Char -> Folded Char(s) */
 206                                         /* ...only supports UCS-2 */
 207 } _cups_fold_map_t;
 208
 209 typedef struct _cups_prop_s             /**** Char Property Struct ****/
 210 {
 211   cups_ucs2_t           ch;             /* Unicode Char as UCS-2 */
 212   unsigned char         gencat;         /* General Category (see 'cups_gencat_t') */
 213   unsigned char         bidicat;        /* Bidirectional Category (see 'cups_bidi_t') */
 214 } _cups_prop_t;
 215
 216 typedef struct _cups_prop_map_s         /**** Char Property Map Struct ****/
 217 {
 218   int                   used;           /* Number of times entry used */
 219   int                   propcount;      /* Count of Source Chars */
 220   _cups_prop_t           *uni2prop;      /* Char -> Properties (gencat, bidicat) */
 221 } _cups_prop_map_t;
 222
 223 typedef struct _cups_break_map_s        /**** Line Break Class Map Struct ****/
 224 {
 225   int                   used;           /* Number of times entry used */
 226   int                   breakcount;     /* Count of Source Chars */
 227   cups_ucs2_t           *uni2break;     /* Char -> Line Break Class (UCS-2 data) */
 228 } _cups_break_map_t;
 229
 230 typedef struct _cups_comb_s             /**** Char Combining Class Struct ****/
 231 {
 232   cups_ucs2_t           ch;             /* Unicode Char as UCS-2 */
 233   unsigned char         combclass;      /* Combining Class (see '_cups_comb_class_t') */
 234   unsigned char         reserved;       /* Reserved (padding for alignment) */
 235 } _cups_comb_t;
 236
 237 typedef struct _cups_comb_map_s         /**** Combining Class Map Struct ****/
 238 {
 239   int                   used;           /* Number of times entry used */
 240   int                   combcount;      /* Count of Source Chars */
 241   _cups_comb_t           *uni2comb;      /* Char -> Combining Class (ch, combclass) */
 242 } _cups_comb_map_t;
 243
 244 /*
 245  * Prototypes...
 246  */
 247
 248 /*
 249  * Utility functions for normalization module (presumably get/free/flush its maps)
 250  */
 251 extern int      cupsNormalizeMapsGet(void);
 252 extern int      cupsNormalizeMapsFree(void);
 253 extern void     cupsNormalizeMapsFlush(void);
 254
 255 /*
 256  * Normalize UTF-8 string 'src' into 'dest' using UAX-15 form 'normalize'.
 257  * Note - Compatibility Normalization Forms (NFKD/NFKC) are unsafe for
 258  * subsequent transcoding to legacy charsets ('maxout' units: TODO confirm).
 259  */
 260 extern int      cupsUTF8Normalize(cups_utf8_t *dest,
 261                                   const cups_utf8_t *src,
 262                                   const int maxout,
 263                                   const cups_normalize_t normalize);
 264
 265 /*
 266  * Normalize UTF-32 string 'src' into 'dest' using UAX-15 form 'normalize'.
 267  * Note - Compatibility Normalization Forms (NFKD/NFKC) are unsafe for
 268  * subsequent transcoding to legacy charsets ('maxout' units: TODO confirm).
 269  */
 270 extern int      cupsUTF32Normalize(cups_utf32_t *dest,
 271                                    const cups_utf32_t *src,
 272                                    const int maxout,
 273                                    const cups_normalize_t normalize);
 274
 275 /*
 276  * Case fold UTF-8 string 'src' into 'dest' per Unicode UAX-21 Section 2.3,
 277  * using folding type 'fold'. Note - Case folding output is
 278  * unsafe for subsequent transcoding to legacy charsets
 279  */
 280 extern int      cupsUTF8CaseFold(cups_utf8_t *dest,
 281                                  const cups_utf8_t *src,
 282                                  const int maxout,
 283                                  const cups_folding_t fold);
 284
 285 /*
 286  * Case fold UTF-32 string 'src' into 'dest' per Unicode UAX-21 Section 2.3,
 287  * using folding type 'fold'. Note - Case folding output is
 288  * unsafe for subsequent transcoding to legacy charsets
 289  */
 290 extern int      cupsUTF32CaseFold(cups_utf32_t *dest,
 291                                   const cups_utf32_t *src,
 292                                   const int maxout,
 293                                   const cups_folding_t fold);
 294
 295 /*
 296  * Compare UTF-8 strings 's1' and 's2' after case folding
 297  */
 298 extern int      cupsUTF8CompareCaseless(const cups_utf8_t *s1,
 299                                         const cups_utf8_t *s2);
 300
 301 /*
 302  * Compare UTF-32 strings 's1' and 's2' after case folding
 303  */
 304 extern int      cupsUTF32CompareCaseless(const cups_utf32_t *s1,
 305                                          const cups_utf32_t *s2);
 306
 307 /*
 308  * Compare UTF-8 strings 's1' and 's2' after case folding and NFKC normalization
 309  */
 310 extern int      cupsUTF8CompareIdentifier(const cups_utf8_t *s1,
 311                                           const cups_utf8_t *s2);
 312
 313 /*
 314  * Compare UTF-32 strings 's1' and 's2' after case folding and NFKC normalization
 315  */
 316 extern int      cupsUTF32CompareIdentifier(const cups_utf32_t *s1,
 317                                            const cups_utf32_t *s2);
 318
 319 /*
 320  * Get property 'prop' (see 'cups_property_t') of UTF-32 character 'ch'
 321  */
 322 extern int      cupsUTF32CharacterProperty(const cups_utf32_t ch,
 323                                            const cups_property_t prop);
 324
 325 #  ifdef __cplusplus
 326 }
 327 #  endif /* __cplusplus */
 328
 329 #endif /* !_CUPS_NORMALIZE_H_ */
 330
 331
 332 /*
 333  * End of "$Id: normalize.h 4684 2005-09-22 02:15:56Z mike $"
 334  */