From: Bruno Haible Date: Sun, 11 May 2008 14:30:54 +0000 (+0000) Subject: Move gen-lbrkprop to gnulib. X-Git-Tag: v0.18~425 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e7242c31597b3333399a7a1d0b3737063dcb17a3;p=thirdparty%2Fgettext.git Move gen-lbrkprop to gnulib. --- diff --git a/gnulib-local/ChangeLog b/gnulib-local/ChangeLog index 1c4cd28e9..d8abc0fab 100644 --- a/gnulib-local/ChangeLog +++ b/gnulib-local/ChangeLog @@ -1,3 +1,11 @@ +2008-05-11 Bruno Haible + + * lib/gen-lbrkprop.c: Move to gnulib as lib/unilbrk/gen-lbrk.c. + * lib/3level.h: Remove file. + * modules/gen-lbrkprop: Move to gnulib as modules/unilbrk/gen-lbrk. + * Makefile.am (EXTRA_DIST): Remove lib/gen-lbrkprop.c, lib/3level.h, + modules/gen-lbrkprop. + 2008-05-11 Bruno Haible * lib/gen-lbrkprop.c (output_lbp): Output to two different streams. diff --git a/gnulib-local/Makefile.am b/gnulib-local/Makefile.am index a723fa667..298269cf4 100644 --- a/gnulib-local/Makefile.am +++ b/gnulib-local/Makefile.am @@ -21,7 +21,6 @@ EXTRA_DIST = \ ChangeLog \ build-aux/moopp \ -lib/3level.h \ lib/addext.c \ lib/alloca.in.h \ lib/argmatch.h.diff \ @@ -43,7 +42,6 @@ lib/file-ostream.oo.c \ lib/file-ostream.oo.h \ lib/fnmatch.c.diff \ lib/fnmatch_loop.c.diff \ -lib/gen-lbrkprop.c \ lib/getopt.in.h.diff \ lib/gettext.h \ lib/gl_array_list.h.diff \ @@ -273,7 +271,6 @@ modules/fd-ostream \ modules/file-ostream \ modules/fnmatch.diff \ modules/gcj \ -modules/gen-lbrkprop \ modules/gettext-runtime-misc \ modules/gettext-tools-misc \ modules/hash \ diff --git a/gnulib-local/lib/3level.h b/gnulib-local/lib/3level.h deleted file mode 100644 index 2d777677a..000000000 --- a/gnulib-local/lib/3level.h +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (C) 2000-2001 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Bruno Haible , 2000. - - - NOTE: The canonical source of this file is maintained with the GNU C Library. - Bugs can be reported to bug-glibc@gnu.org. - - This program is free software: you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by the - Free Software Foundation; either version 3 of the License, or any - later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -/* Construction of sparse 3-level tables. - See wchar-lookup.h or coll-lookup.h for their structure and the - meaning of p and q. - - Before including this file, set - TABLE to the name of the structure to be defined - ELEMENT to the type of every entry - DEFAULT to the default value for empty entries - ITERATE if you want the TABLE_iterate function to be defined - NO_FINALIZE if you don't want the TABLE_finalize function to be defined - - This will define - - struct TABLE; - void TABLE_init (struct TABLE *t); - ELEMENT TABLE_get (struct TABLE *t, uint32_t wc); - void TABLE_add (struct TABLE *t, uint32_t wc, ELEMENT value); - void TABLE_iterate (struct TABLE *t, - void (*fn) (uint32_t wc, ELEMENT value)); - void TABLE_finalize (struct TABLE *t); -*/ - -#define CONCAT(a,b) CONCAT1(a,b) -#define CONCAT1(a,b) a##b - -struct TABLE -{ - /* Parameters. */ - unsigned int p; - unsigned int q; - /* Working representation. */ - size_t level1_alloc; - size_t level1_size; - uint32_t *level1; - size_t level2_alloc; - size_t level2_size; - uint32_t *level2; - size_t level3_alloc; - size_t level3_size; - ELEMENT *level3; - /* Compressed representation. */ - size_t result_size; - char *result; -}; - -/* Initialize. Assumes t->p and t->q have already been set. */ -static inline void -CONCAT(TABLE,_init) (struct TABLE *t) -{ - t->level1 = NULL; - t->level1_alloc = t->level1_size = 0; - t->level2 = NULL; - t->level2_alloc = t->level2_size = 0; - t->level3 = NULL; - t->level3_alloc = t->level3_size = 0; -} - -/* Marker for an empty slot. This has the value 0xFFFFFFFF, regardless - whether 'int' is 16 bit, 32 bit, or 64 bit. */ -#define EMPTY ((uint32_t) ~0) - -/* Retrieve an entry. */ -static inline ELEMENT -CONCAT(TABLE,_get) (struct TABLE *t, uint32_t wc) -{ - uint32_t index1 = wc >> (t->q + t->p); - if (index1 < t->level1_size) - { - uint32_t lookup1 = t->level1[index1]; - if (lookup1 != EMPTY) - { - uint32_t index2 = ((wc >> t->p) & ((1 << t->q) - 1)) - + (lookup1 << t->q); - uint32_t lookup2 = t->level2[index2]; - if (lookup2 != EMPTY) - { - uint32_t index3 = (wc & ((1 << t->p) - 1)) - + (lookup2 << t->p); - ELEMENT lookup3 = t->level3[index3]; - - return lookup3; - } - } - } - return DEFAULT; -} - -/* Add one entry. */ -static void -CONCAT(TABLE,_add) (struct TABLE *t, uint32_t wc, ELEMENT value) -{ - uint32_t index1 = wc >> (t->q + t->p); - uint32_t index2 = (wc >> t->p) & ((1 << t->q) - 1); - uint32_t index3 = wc & ((1 << t->p) - 1); - size_t i, i1, i2; - - if (value == CONCAT(TABLE,_get) (t, wc)) - return; - - if (index1 >= t->level1_size) - { - if (index1 >= t->level1_alloc) - { - size_t alloc = 2 * t->level1_alloc; - if (alloc <= index1) - alloc = index1 + 1; - t->level1 = (uint32_t *) xrealloc ((char *) t->level1, - alloc * sizeof (uint32_t)); - t->level1_alloc = alloc; - } - while (index1 >= t->level1_size) - t->level1[t->level1_size++] = EMPTY; - } - - if (t->level1[index1] == EMPTY) - { - if (t->level2_size == t->level2_alloc) - { - size_t alloc = 2 * t->level2_alloc + 1; - t->level2 = (uint32_t *) xrealloc ((char *) t->level2, - (alloc << t->q) * sizeof (uint32_t)); - t->level2_alloc = alloc; - } - i1 = t->level2_size << t->q; - i2 = (t->level2_size + 1) << t->q; - for (i = i1; i < i2; i++) - t->level2[i] = EMPTY; - t->level1[index1] = t->level2_size++; - } - - index2 += t->level1[index1] << t->q; - - if (t->level2[index2] == EMPTY) - { - if (t->level3_size == t->level3_alloc) - { - size_t alloc = 2 * t->level3_alloc + 1; - t->level3 = (ELEMENT *) xrealloc ((char *) t->level3, - (alloc << t->p) * sizeof (ELEMENT)); - t->level3_alloc = alloc; - } - i1 = t->level3_size << t->p; - i2 = (t->level3_size + 1) << t->p; - for (i = i1; i < i2; i++) - t->level3[i] = DEFAULT; - t->level2[index2] = t->level3_size++; - } - - index3 += t->level2[index2] << t->p; - - t->level3[index3] = value; -} - -#ifdef ITERATE -/* Apply a function to all entries in the table. */ -static void -CONCAT(TABLE,_iterate) (struct TABLE *t, - void (*fn) (uint32_t wc, ELEMENT value)) -{ - uint32_t index1; - for (index1 = 0; index1 < t->level1_size; index1++) - { - uint32_t lookup1 = t->level1[index1]; - if (lookup1 != EMPTY) - { - uint32_t lookup1_shifted = lookup1 << t->q; - uint32_t index2; - for (index2 = 0; index2 < (1 << t->q); index2++) - { - uint32_t lookup2 = t->level2[index2 + lookup1_shifted]; - if (lookup2 != EMPTY) - { - uint32_t lookup2_shifted = lookup2 << t->p; - uint32_t index3; - for (index3 = 0; index3 < (1 << t->p); index3++) - { - ELEMENT lookup3 = t->level3[index3 + lookup2_shifted]; - if (lookup3 != DEFAULT) - fn ((((index1 << t->q) + index2) << t->p) + index3, - lookup3); - } - } - } - } - } -} -#endif - -#ifndef NO_FINALIZE -/* Finalize and shrink. */ -static void -CONCAT(TABLE,_finalize) (struct TABLE *t) -{ - size_t i, j, k; - uint32_t reorder3[t->level3_size]; - uint32_t reorder2[t->level2_size]; - uint32_t level1_offset, level2_offset, level3_offset, last_offset; - - /* Uniquify level3 blocks. */ - k = 0; - for (j = 0; j < t->level3_size; j++) - { - for (i = 0; i < k; i++) - if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p], - (1 << t->p) * sizeof (ELEMENT)) == 0) - break; - /* Relocate block j to block i. */ - reorder3[j] = i; - if (i == k) - { - if (i != j) - memcpy (&t->level3[i << t->p], &t->level3[j << t->p], - (1 << t->p) * sizeof (ELEMENT)); - k++; - } - } - t->level3_size = k; - - for (i = 0; i < (t->level2_size << t->q); i++) - if (t->level2[i] != EMPTY) - t->level2[i] = reorder3[t->level2[i]]; - - /* Uniquify level2 blocks. */ - k = 0; - for (j = 0; j < t->level2_size; j++) - { - for (i = 0; i < k; i++) - if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q], - (1 << t->q) * sizeof (uint32_t)) == 0) - break; - /* Relocate block j to block i. */ - reorder2[j] = i; - if (i == k) - { - if (i != j) - memcpy (&t->level2[i << t->q], &t->level2[j << t->q], - (1 << t->q) * sizeof (uint32_t)); - k++; - } - } - t->level2_size = k; - - for (i = 0; i < t->level1_size; i++) - if (t->level1[i] != EMPTY) - t->level1[i] = reorder2[t->level1[i]]; - - /* Create and fill the resulting compressed representation. */ - last_offset = - 5 * sizeof (uint32_t) - + t->level1_size * sizeof (uint32_t) - + (t->level2_size << t->q) * sizeof (uint32_t) - + (t->level3_size << t->p) * sizeof (ELEMENT); - t->result_size = (last_offset + 3) & ~3ul; - t->result = (char *) xmalloc (t->result_size); - - level1_offset = - 5 * sizeof (uint32_t); - level2_offset = - 5 * sizeof (uint32_t) - + t->level1_size * sizeof (uint32_t); - level3_offset = - 5 * sizeof (uint32_t) - + t->level1_size * sizeof (uint32_t) - + (t->level2_size << t->q) * sizeof (uint32_t); - - ((uint32_t *) t->result)[0] = t->q + t->p; - ((uint32_t *) t->result)[1] = t->level1_size; - ((uint32_t *) t->result)[2] = t->p; - ((uint32_t *) t->result)[3] = (1 << t->q) - 1; - ((uint32_t *) t->result)[4] = (1 << t->p) - 1; - - for (i = 0; i < t->level1_size; i++) - ((uint32_t *) (t->result + level1_offset))[i] = - (t->level1[i] == EMPTY - ? 0 - : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset); - - for (i = 0; i < (t->level2_size << t->q); i++) - ((uint32_t *) (t->result + level2_offset))[i] = - (t->level2[i] == EMPTY - ? 0 - : (t->level2[i] << t->p) * sizeof (ELEMENT) + level3_offset); - - for (i = 0; i < (t->level3_size << t->p); i++) - ((ELEMENT *) (t->result + level3_offset))[i] = t->level3[i]; - - if (last_offset < t->result_size) - memset (t->result + last_offset, 0, t->result_size - last_offset); - - if (t->level1_alloc > 0) - free (t->level1); - if (t->level2_alloc > 0) - free (t->level2); - if (t->level3_alloc > 0) - free (t->level3); -} -#endif - -#undef EMPTY -#undef TABLE -#undef ELEMENT -#undef DEFAULT -#undef ITERATE -#undef NO_FINALIZE diff --git a/gnulib-local/lib/gen-lbrkprop.c b/gnulib-local/lib/gen-lbrkprop.c deleted file mode 100644 index 5e118a5b9..000000000 --- a/gnulib-local/lib/gen-lbrkprop.c +++ /dev/null @@ -1,1341 +0,0 @@ -/* Generate a Unicode conforming Line Break Properties tables from a - UnicodeData file. - Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc. - Written by Bruno Haible , 2000-2002. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -/* Usage example: - $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \ - /usr/local/share/Unidata/EastAsianWidth.txt \ - /usr/local/share/Unidata/LineBreak.txt \ - 3.1.0 - */ - -#include -#include -#include -#include -#include -#include - -/* This structure represents one line in the UnicodeData.txt file. */ -struct unicode_attribute -{ - const char *name; /* Character name */ - const char *category; /* General category */ - const char *combining; /* Canonical combining classes */ - const char *bidi; /* Bidirectional category */ - const char *decomposition; /* Character decomposition mapping */ - const char *decdigit; /* Decimal digit value */ - const char *digit; /* Digit value */ - const char *numeric; /* Numeric value */ - int mirrored; /* mirrored */ - const char *oldname; /* Old Unicode 1.0 name */ - const char *comment; /* Comment */ - unsigned int upper; /* Uppercase mapping */ - unsigned int lower; /* Lowercase mapping */ - unsigned int title; /* Titlecase mapping */ -}; - -/* Missing fields are represented with "" for strings, and NONE for - characters. */ -#define NONE (~(unsigned int)0) - -/* The entire contents of the UnicodeData.txt file. */ -struct unicode_attribute unicode_attributes [0x110000]; - -/* Stores in unicode_attributes[i] the values from the given fields. */ -static void -fill_attribute (unsigned int i, - const char *field1, const char *field2, - const char *field3, const char *field4, - const char *field5, const char *field6, - const char *field7, const char *field8, - const char *field9, const char *field10, - const char *field11, const char *field12, - const char *field13, const char *field14) -{ - struct unicode_attribute * uni; - - if (i >= 0x110000) - { - fprintf (stderr, "index too large\n"); - exit (1); - } - uni = &unicode_attributes[i]; - /* Copy the strings. */ - uni->name = strdup (field1); - uni->category = (field2[0] == '\0' ? "" : strdup (field2)); - uni->combining = (field3[0] == '\0' ? "" : strdup (field3)); - uni->bidi = (field4[0] == '\0' ? "" : strdup (field4)); - uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5)); - uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6)); - uni->digit = (field7[0] == '\0' ? "" : strdup (field7)); - uni->numeric = (field8[0] == '\0' ? "" : strdup (field8)); - uni->mirrored = (field9[0] == 'Y'); - uni->oldname = (field10[0] == '\0' ? "" : strdup (field10)); - uni->comment = (field11[0] == '\0' ? "" : strdup (field11)); - uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16)); - uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16)); - uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16)); -} - -/* Maximum length of a field in the UnicodeData.txt file. */ -#define FIELDLEN 120 - -/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN. - Reads up to (but excluding) DELIM. - Returns 1 when a field was successfully read, otherwise 0. */ -static int -getfield (FILE *stream, char *buffer, int delim) -{ - int count = 0; - int c; - - for (; (c = getc (stream)), (c != EOF && c != delim); ) - { - /* The original unicode.org UnicodeData.txt file happens to have - CR/LF line terminators. Silently convert to LF. */ - if (c == '\r') - continue; - - /* Put c into the buffer. */ - if (++count >= FIELDLEN - 1) - { - fprintf (stderr, "field too long\n"); - exit (1); - } - *buffer++ = c; - } - - if (c == EOF) - return 0; - - *buffer = '\0'; - return 1; -} - -/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt - file. */ -static void -fill_attributes (const char *unicodedata_filename) -{ - unsigned int i, j; - FILE *stream; - char field0[FIELDLEN]; - char field1[FIELDLEN]; - char field2[FIELDLEN]; - char field3[FIELDLEN]; - char field4[FIELDLEN]; - char field5[FIELDLEN]; - char field6[FIELDLEN]; - char field7[FIELDLEN]; - char field8[FIELDLEN]; - char field9[FIELDLEN]; - char field10[FIELDLEN]; - char field11[FIELDLEN]; - char field12[FIELDLEN]; - char field13[FIELDLEN]; - char field14[FIELDLEN]; - int lineno = 0; - - for (i = 0; i < 0x110000; i++) - unicode_attributes[i].name = NULL; - - stream = fopen (unicodedata_filename, "r"); - if (stream == NULL) - { - fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename); - exit (1); - } - - for (;;) - { - int n; - - lineno++; - n = getfield (stream, field0, ';'); - n += getfield (stream, field1, ';'); - n += getfield (stream, field2, ';'); - n += getfield (stream, field3, ';'); - n += getfield (stream, field4, ';'); - n += getfield (stream, field5, ';'); - n += getfield (stream, field6, ';'); - n += getfield (stream, field7, ';'); - n += getfield (stream, field8, ';'); - n += getfield (stream, field9, ';'); - n += getfield (stream, field10, ';'); - n += getfield (stream, field11, ';'); - n += getfield (stream, field12, ';'); - n += getfield (stream, field13, ';'); - n += getfield (stream, field14, '\n'); - if (n == 0) - break; - if (n != 15) - { - fprintf (stderr, "short line in'%s':%d\n", - unicodedata_filename, lineno); - exit (1); - } - i = strtoul (field0, NULL, 16); - if (field1[0] == '<' - && strlen (field1) >= 9 - && !strcmp (field1 + strlen(field1) - 8, ", First>")) - { - /* Deal with a range. */ - lineno++; - n = getfield (stream, field0, ';'); - n += getfield (stream, field1, ';'); - n += getfield (stream, field2, ';'); - n += getfield (stream, field3, ';'); - n += getfield (stream, field4, ';'); - n += getfield (stream, field5, ';'); - n += getfield (stream, field6, ';'); - n += getfield (stream, field7, ';'); - n += getfield (stream, field8, ';'); - n += getfield (stream, field9, ';'); - n += getfield (stream, field10, ';'); - n += getfield (stream, field11, ';'); - n += getfield (stream, field12, ';'); - n += getfield (stream, field13, ';'); - n += getfield (stream, field14, '\n'); - if (n != 15) - { - fprintf (stderr, "missing end range in '%s':%d\n", - unicodedata_filename, lineno); - exit (1); - } - if (!(field1[0] == '<' - && strlen (field1) >= 8 - && !strcmp (field1 + strlen (field1) - 7, ", Last>"))) - { - fprintf (stderr, "missing end range in '%s':%d\n", - unicodedata_filename, lineno); - exit (1); - } - field1[strlen (field1) - 7] = '\0'; - j = strtoul (field0, NULL, 16); - for (; i <= j; i++) - fill_attribute (i, field1+1, field2, field3, field4, field5, - field6, field7, field8, field9, field10, - field11, field12, field13, field14); - } - else - { - /* Single character line */ - fill_attribute (i, field1, field2, field3, field4, field5, - field6, field7, field8, field9, field10, - field11, field12, field13, field14); - } - } - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); - exit (1); - } -} - -/* The width property from the EastAsianWidth.txt file. - Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ -const char * unicode_width[0x110000]; - -/* Stores in unicode_width[] the width property from the EastAsianWidth.txt - file. */ -static void -fill_width (const char *width_filename) -{ - unsigned int i, j; - FILE *stream; - char field0[FIELDLEN]; - char field1[FIELDLEN]; - char field2[FIELDLEN]; - int lineno = 0; - - for (i = 0; i < 0x110000; i++) - unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); - - stream = fopen (width_filename, "r"); - if (stream == NULL) - { - fprintf (stderr, "error during fopen of '%s'\n", width_filename); - exit (1); - } - - for (;;) - { - int n; - int c; - - lineno++; - c = getc (stream); - if (c == EOF) - break; - if (c == '#') - { - do c = getc (stream); while (c != EOF && c != '\n'); - continue; - } - ungetc (c, stream); - n = getfield (stream, field0, ';'); - n += getfield (stream, field1, ' '); - n += getfield (stream, field2, '\n'); - if (n == 0) - break; - if (n != 3) - { - fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); - exit (1); - } - i = strtoul (field0, NULL, 16); - if (strstr (field0, "..") != NULL) - { - /* Deal with a range. */ - j = strtoul (strstr (field0, "..") + 2, NULL, 16); - for (; i <= j; i++) - unicode_width[i] = strdup (field1); - } - else - { - /* Single character line. */ - unicode_width[i] = strdup (field1); - } - } - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error reading from '%s'\n", width_filename); - exit (1); - } -} - -/* Line breaking classification. */ - -enum -{ - /* Values >= 20 are resolved at run time. */ - LBP_BK = 0, /* mandatory break */ -/*LBP_CR, carriage return - not used here because it's a DOSism */ -/*LBP_LF, line feed - not used here because it's a DOSism */ - LBP_CM = 20, /* attached characters and combining marks */ -/*LBP_SG, surrogates - not used here because they are not characters */ - LBP_ZW = 1, /* zero width space */ - LBP_IN = 2, /* inseparable */ - LBP_GL = 3, /* non-breaking (glue) */ - LBP_CB = 22, /* contingent break opportunity */ - LBP_SP = 21, /* space */ - LBP_BA = 4, /* break opportunity after */ - LBP_BB = 5, /* break opportunity before */ - LBP_B2 = 6, /* break opportunity before and after */ - LBP_HY = 7, /* hyphen */ - LBP_NS = 8, /* non starter */ - LBP_OP = 9, /* opening punctuation */ - LBP_CL = 10, /* closing punctuation */ - LBP_QU = 11, /* ambiguous quotation */ - LBP_EX = 12, /* exclamation/interrogation */ - LBP_ID = 13, /* ideographic */ - LBP_NU = 14, /* numeric */ - LBP_IS = 15, /* infix separator (numeric) */ - LBP_SY = 16, /* symbols allowing breaks */ - LBP_AL = 17, /* ordinary alphabetic and symbol characters */ - LBP_PR = 18, /* prefix (numeric) */ - LBP_PO = 19, /* postfix (numeric) */ - LBP_SA = 23, /* complex context (South East Asian) */ - LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ - LBP_XX = 25 /* unknown */ -}; - -/* Returns the line breaking classification for ch, as a bit mask. */ -static int -get_lbp (unsigned int ch) -{ - int attr = 0; - - if (unicode_attributes[ch].name != NULL) - { - /* mandatory break */ - if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ - || ch == 0x000C /* form feed */ - || ch == 0x2028 /* LINE SEPARATOR */ - || ch == 0x2029 /* PARAGRAPH SEPARATOR */) - attr |= 1 << LBP_BK; - - /* zero width space */ - if (ch == 0x200B /* ZERO WIDTH SPACE */) - attr |= 1 << LBP_ZW; - - /* inseparable */ - if (ch == 0x2024 /* ONE DOT LEADER */ - || ch == 0x2025 /* TWO DOT LEADER */ - || ch == 0x2026 /* HORIZONTAL ELLIPSIS */) - attr |= 1 << LBP_IN; - - /* non-breaking (glue) */ - if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */ - || ch == 0x00A0 /* NO-BREAK SPACE */ - || ch == 0x202F /* NARROW NO-BREAK SPACE */ - || ch == 0x2007 /* FIGURE SPACE */ - || ch == 0x2011 /* NON-BREAKING HYPHEN */ - || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */) - attr |= 1 << LBP_GL; - - /* contingent break opportunity */ - if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) - attr |= 1 << LBP_CB; - - /* space */ - if (ch == 0x0020 /* SPACE */) - attr |= 1 << LBP_SP; - - /* break opportunity after */ - if (ch == 0x2000 /* EN QUAD */ - || ch == 0x2001 /* EM QUAD */ - || ch == 0x2002 /* EN SPACE */ - || ch == 0x2003 /* EM SPACE */ - || ch == 0x2004 /* THREE-PER-EM SPACE */ - || ch == 0x2005 /* FOUR-PER-EM SPACE */ - || ch == 0x2006 /* SIX-PER-EM SPACE */ - || ch == 0x2008 /* PUNCTUATION SPACE */ - || ch == 0x2009 /* THIN SPACE */ - || ch == 0x200A /* HAIR SPACE */ - || ch == 0x0009 /* tab */ - || ch == 0x058A /* ARMENIAN HYPHEN */ - || ch == 0x2010 /* HYPHEN */ - || ch == 0x2012 /* FIGURE DASH */ - || ch == 0x2013 /* EN DASH */ - || ch == 0x00AD /* SOFT HYPHEN */ - || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ - || ch == 0x1361 /* ETHIOPIC WORDSPACE */ - || ch == 0x1680 /* OGHAM SPACE MARK */ - || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ - || ch == 0x2027 /* HYPHENATION POINT */ - || ch == 0x007C /* VERTICAL LINE */) - attr |= 1 << LBP_BA; - - /* break opportunity before */ - if (ch == 0x00B4 /* ACUTE ACCENT */ - || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ - || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ - || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) - attr |= 1 << LBP_BB; - - /* break opportunity before and after */ - if (ch == 0x2014 /* EM DASH */) - attr |= 1 << LBP_B2; - - /* hyphen */ - if (ch == 0x002D /* HYPHEN-MINUS */) - attr |= 1 << LBP_HY; - - /* exclamation/interrogation */ - if (ch == 0x0021 /* EXCLAMATION MARK */ - || ch == 0x003F /* QUESTION MARK */ - || ch == 0xFE56 /* SMALL QUESTION MARK */ - || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ - || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ - || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) - attr |= 1 << LBP_EX; - - /* opening punctuation */ - if (unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 's') - attr |= 1 << LBP_OP; - - /* closing punctuation */ - if (ch == 0x3001 /* IDEOGRAPHIC COMMA */ - || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ - || ch == 0xFE50 /* SMALL COMMA */ - || ch == 0xFE52 /* SMALL FULL STOP */ - || ch == 0xFF0C /* FULLWIDTH COMMA */ - || ch == 0xFF0E /* FULLWIDTH FULL STOP */ - || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ - || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ - || (unicode_attributes[ch].category[0] == 'P' - && unicode_attributes[ch].category[1] == 'e')) - attr |= 1 << LBP_CL; - - /* ambiguous quotation */ - if (ch == 0x0022 /* QUOTATION MARK */ - || ch == 0x0027 /* APOSTROPHE */ - || (unicode_attributes[ch].category[0] == 'P' - && (unicode_attributes[ch].category[1] == 'f' - || unicode_attributes[ch].category[1] == 'i'))) - attr |= 1 << LBP_QU; - - /* attached characters and combining marks */ - if ((unicode_attributes[ch].category[0] == 'M' - && (unicode_attributes[ch].category[1] == 'n' - || unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'e')) - || (ch >= 0x1160 && ch <= 0x11F9) - || (unicode_attributes[ch].category[0] == 'C' - && (unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'f'))) - if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL)))) - attr |= 1 << LBP_CM; - - /* non starter */ - if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ - || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ - || ch == 0x17D4 /* KHMER SIGN KHAN */ - || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ - || ch == 0x17D7 /* KHMER SIGN LEK TOO */ - || ch == 0x17D8 /* KHMER SIGN BEYYAL */ - || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */ - || ch == 0x17DA /* KHMER SIGN KOOMUUT */ - || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ - || ch == 0x2044 /* FRACTION SLASH */ - || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ - || ch == 0x301C /* WAVE DASH */ - || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ - || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ - || ch == 0x309D /* HIRAGANA ITERATION MARK */ - || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ - || ch == 0x30FB /* KATAKANA MIDDLE DOT */ - || ch == 0x30FD /* KATAKANA ITERATION MARK */ - || ch == 0xFE54 /* SMALL SEMICOLON */ - || ch == 0xFE55 /* SMALL COLON */ - || ch == 0xFF1A /* FULLWIDTH COLON */ - || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ - || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ - || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ - || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ - || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ - || (unicode_attributes[ch].category[0] == 'L' - && unicode_attributes[ch].category[1] == 'm' - && (unicode_width[ch][0] == 'W' - || unicode_width[ch][0] == 'H')) - || (unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'k' - && unicode_width[ch][0] == 'W') - || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL - || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) - attr |= 1 << LBP_NS; - - /* numeric */ - if (unicode_attributes[ch].category[0] == 'N' - && unicode_attributes[ch].category[1] == 'd' - && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) - attr |= 1 << LBP_NU; - - /* infix separator (numeric) */ - if (ch == 0x002C /* COMMA */ - || ch == 0x002E /* FULL STOP */ - || ch == 0x003A /* COLON */ - || ch == 0x003B /* SEMICOLON */ - || ch == 0x0589 /* ARMENIAN FULL STOP */) - attr |= 1 << LBP_IS; - - /* symbols allowing breaks */ - if (ch == 0x002F /* SOLIDUS */) - attr |= 1 << LBP_SY; - - /* postfix (numeric) */ - if (ch == 0x0025 /* PERCENT SIGN */ - || ch == 0x00A2 /* CENT SIGN */ - || ch == 0x00B0 /* DEGREE SIGN */ - || ch == 0x2030 /* PER MILLE SIGN */ - || ch == 0x2031 /* PER TEN THOUSAND SIGN */ - || ch == 0x2032 /* PRIME */ - || ch == 0x2033 /* DOUBLE PRIME */ - || ch == 0x2034 /* TRIPLE PRIME */ - || ch == 0x2035 /* REVERSED PRIME */ - || ch == 0x2036 /* REVERSED DOUBLE PRIME */ - || ch == 0x2037 /* REVERSED TRIPLE PRIME */ - || ch == 0x20A7 /* PESETA SIGN */ - || ch == 0x2103 /* DEGREE CELSIUS */ - || ch == 0x2109 /* DEGREE FAHRENHEIT */ - || ch == 0x2126 /* OHM SIGN */ - || ch == 0xFE6A /* SMALL PERCENT SIGN */ - || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ - || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */) - attr |= 1 << LBP_PO; - - /* prefix (numeric) */ - if (ch == 0x002B /* PLUS SIGN */ - || ch == 0x005C /* REVERSE SOLIDUS */ - || ch == 0x00B1 /* PLUS-MINUS SIGN */ - || ch == 0x2116 /* NUMERO SIGN */ - || ch == 0x2212 /* MINUS SIGN */ - || ch == 0x2213 /* MINUS-OR-PLUS SIGN */ - || (unicode_attributes[ch].category[0] == 'S' - && unicode_attributes[ch].category[1] == 'c')) - if (!(attr & (1 << LBP_PO))) - attr |= 1 << LBP_PR; - - /* complex context (South East Asian) */ - if (((ch >= 0x0E00 && ch <= 0x0EFF) - || (ch >= 0x1000 && ch <= 0x109F) - || (ch >= 0x1780 && ch <= 0x17FF)) - && unicode_attributes[ch].category[0] == 'L' - && (unicode_attributes[ch].category[1] == 'm' - || unicode_attributes[ch].category[1] == 'o')) - if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR)))) - attr |= 1 << LBP_SA; - - /* ideographic */ - if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */ - || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ - || ch == 0x3000 /* IDEOGRAPHIC SPACE */ - || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */ - || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ - || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */ - || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */ - || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */ - || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */ - || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */ - || ch == 0xFE62 /* SMALL PLUS SIGN */ - || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ - || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ - || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ - || ch == 0xFE66 /* SMALL EQUALS SIGN */ - || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ - || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ - || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ - || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL - || (ch >= 0x3000 && ch <= 0x33FF - && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ - || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ - || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ - || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ - || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ - || ch == 0xFE49 /* DASHED OVERLINE */ - || ch == 0xFE4A /* CENTRELINE OVERLINE */ - || ch == 0xFE4B /* WAVY OVERLINE */ - || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ - || ch == 0xFE4D /* DASHED LOW LINE */ - || ch == 0xFE4E /* CENTRELINE LOW LINE */ - || ch == 0xFE4F /* WAVY LOW LINE */ - || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ - || ch == 0xFE58 /* SMALL EM DASH */ - || ch == 0xFE5F /* SMALL NUMBER SIGN */ - || ch == 0xFE60 /* SMALL AMPERSAND */ - || ch == 0xFE61 /* SMALL ASTERISK */ - || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ - || ch == 0xFE6B /* SMALL COMMERCIAL AT */ - || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ - || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ - || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ - || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ - || ch == 0xFF0A /* FULLWIDTH ASTERISK */ - || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ - || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ - || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ - || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ - || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ - || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ - || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ - || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ - || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ - || ch == 0xFF3F /* FULLWIDTH LOW LINE */ - || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ - || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ - || ch == 0xFF5E /* FULLWIDTH TILDE */ - || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ - || ch == 0xFFE3 /* FULLWIDTH MACRON */ - || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */ - { - /* ambiguous (ideograph) ? */ - if (unicode_width[ch] != NULL - && unicode_width[ch][0] == 'A') - attr |= 1 << LBP_AI; - else - attr |= 1 << LBP_ID; - } - - /* ordinary alphabetic and symbol characters */ - if ((unicode_attributes[ch].category[0] == 'L' - && (unicode_attributes[ch].category[1] == 'u' - || unicode_attributes[ch].category[1] == 'l' - || unicode_attributes[ch].category[1] == 't' - || unicode_attributes[ch].category[1] == 'm' - || unicode_attributes[ch].category[1] == 'o')) - || (unicode_attributes[ch].category[0] == 'S' - && (unicode_attributes[ch].category[1] == 'm' - || unicode_attributes[ch].category[1] == 'c' - || unicode_attributes[ch].category[1] == 'k' - || unicode_attributes[ch].category[1] == 'o')) - /* Extra characters for compatibility with Unicode LineBreak.txt. */ - || ch == 0x0023 /* NUMBER SIGN */ - || ch == 0x0026 /* AMPERSAND */ - || ch == 0x002A /* ASTERISK */ - || ch == 0x0040 /* COMMERCIAL AT */ - || ch == 0x005F /* LOW LINE */ - || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ - || ch == 0x00B2 /* SUPERSCRIPT TWO */ - || ch == 0x00B3 /* SUPERSCRIPT THREE */ - || ch == 0x00B7 /* MIDDLE DOT */ - || ch == 0x00B9 /* SUPERSCRIPT ONE */ - || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ - || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ - || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ - || ch == 0x00BF /* INVERTED QUESTION MARK */ - || ch == 0x037E /* GREEK QUESTION MARK */ - || ch == 0x0387 /* GREEK ANO TELEIA */ - || ch == 0x055A /* ARMENIAN APOSTROPHE */ - || ch == 0x055B /* ARMENIAN EMPHASIS MARK */ - || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */ - || ch == 0x055D /* ARMENIAN COMMA */ - || ch == 0x055E /* ARMENIAN QUESTION MARK */ - || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */ - || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ - || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */ - || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */ - || ch == 0x05F3 /* HEBREW PUNCTUATION GERESH */ - || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */ - || ch == 0x060C /* ARABIC COMMA */ - || ch == 0x061B /* ARABIC SEMICOLON */ - || ch == 0x061F /* ARABIC QUESTION MARK */ - || ch == 0x066A /* ARABIC PERCENT SIGN */ - || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ - || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */ - || ch == 0x066D /* ARABIC FIVE POINTED STAR */ - || ch == 0x06D4 /* ARABIC FULL STOP */ - || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */ - || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */ - || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */ - || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */ - || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */ - || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */ - || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */ - || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */ - || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */ - || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */ - || ch == 0x070A /* SYRIAC CONTRACTION */ - || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */ - || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */ - || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */ - || ch == 0x0964 /* DEVANAGARI DANDA */ - || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ - || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */ - || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */ - || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */ - || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */ - || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */ - || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ - || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ - || ch == 0x0BF0 /* TAMIL NUMBER TEN */ - || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */ - || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */ - || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */ - || ch == 0x0E4F /* THAI CHARACTER FONGMAN */ - || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */ - || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */ - || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ - || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ - || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ - || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ - || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ - || ch == 0x0F0D /* TIBETAN MARK SHAD */ - || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ - || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ - || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ - || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ - || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ - || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */ - || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */ - || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */ - || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */ - || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */ - || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */ - || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */ - || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */ - || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */ - || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */ - || ch == 0x0F85 /* TIBETAN MARK PALUTA */ - || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ - || ch == 0x104B /* MYANMAR SIGN SECTION */ - || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */ - || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */ - || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */ - || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */ - || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */ - || ch == 0x1362 /* ETHIOPIC FULL STOP */ - || ch == 0x1363 /* ETHIOPIC COMMA */ - || ch == 0x1364 /* ETHIOPIC SEMICOLON */ - || ch == 0x1365 /* ETHIOPIC COLON */ - || ch == 0x1366 /* ETHIOPIC PREFACE COLON */ - || ch == 0x1367 /* ETHIOPIC QUESTION MARK */ - || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */ - || ch == 0x1372 /* ETHIOPIC NUMBER TEN */ - || ch == 0x1373 /* ETHIOPIC NUMBER TWENTY */ - || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */ - || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */ - || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */ - || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */ - || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */ - || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */ - || ch == 0x137A /* ETHIOPIC NUMBER NINETY */ - || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */ - || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */ - || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */ - || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */ - || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ - || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ - || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ - || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */ - || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */ - || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */ - || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */ - || ch == 0x1800 /* MONGOLIAN BIRGA */ - || ch == 0x1801 /* MONGOLIAN ELLIPSIS */ - || ch == 0x1802 /* MONGOLIAN COMMA */ - || ch == 0x1803 /* MONGOLIAN FULL STOP */ - || ch == 0x1804 /* MONGOLIAN COLON */ - || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ - || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */ - || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ - || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ - || ch == 0x180A /* MONGOLIAN NIRUGU */ - || ch == 0x2015 /* HORIZONTAL BAR */ - || ch == 0x2016 /* DOUBLE VERTICAL LINE */ - || ch == 0x2017 /* DOUBLE LOW LINE */ - || ch == 0x2020 /* DAGGER */ - || ch == 0x2021 /* DOUBLE DAGGER */ - || ch == 0x2022 /* BULLET */ - || ch == 0x2023 /* TRIANGULAR BULLET */ - || ch == 0x2038 /* CARET */ - || ch == 0x203B /* REFERENCE MARK */ - || ch == 0x203D /* INTERROBANG */ - || ch == 0x203E /* OVERLINE */ - || ch == 0x203F /* UNDERTIE */ - || ch == 0x2040 /* CHARACTER TIE */ - || ch == 0x2041 /* CARET INSERTION POINT */ - || ch == 0x2042 /* ASTERISM */ - || ch == 0x2043 /* HYPHEN BULLET */ - || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ - || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ - || ch == 0x204A /* TIRONIAN SIGN ET */ - || ch == 0x204B /* REVERSED PILCROW SIGN */ - || ch == 0x204C /* BLACK LEFTWARDS BULLET */ - || ch == 0x204D /* BLACK RIGHTWARDS BULLET */ - || ch == 0x2070 /* SUPERSCRIPT ZERO */ - || ch == 0x2074 /* SUPERSCRIPT FOUR */ - || ch == 0x2075 /* SUPERSCRIPT FIVE */ - || ch == 0x2076 /* SUPERSCRIPT SIX */ - || ch == 0x2077 /* SUPERSCRIPT SEVEN */ - || ch == 0x2078 /* SUPERSCRIPT EIGHT */ - || ch == 0x2079 /* SUPERSCRIPT NINE */ - || ch == 0x2080 /* SUBSCRIPT ZERO */ - || ch == 0x2081 /* SUBSCRIPT ONE */ - || ch == 0x2082 /* SUBSCRIPT TWO */ - || ch == 0x2083 /* SUBSCRIPT THREE */ - || ch == 0x2084 /* SUBSCRIPT FOUR */ - || ch == 0x2085 /* SUBSCRIPT FIVE */ - || ch == 0x2086 /* SUBSCRIPT SIX */ - || ch == 0x2087 /* SUBSCRIPT SEVEN */ - || ch == 0x2088 /* SUBSCRIPT EIGHT */ - || ch == 0x2089 /* SUBSCRIPT NINE */ - || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */ - || ch == 0x215F /* FRACTION NUMERATOR ONE */ - || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */ - || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */ - || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */ - || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */ - || ch == 0x24EA /* CIRCLED DIGIT ZERO */ - || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */ - || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */ - || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */ - || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */ - || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */ - || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ - if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB)))) - { - /* ambiguous (alphabetic) ? */ - if (unicode_width[ch] != NULL - && unicode_width[ch][0] == 'A') - attr |= 1 << LBP_AI; - else - attr |= 1 << LBP_AL; - } - } - - if (attr == 0) - /* unknown */ - attr |= 1 << LBP_XX; - - return attr; -} - -/* Output the line breaking properties in a human readable format. */ -static void -debug_output_lbp (FILE *stream) -{ - unsigned int i; - - for (i = 0; i < 0x110000; i++) - { - int attr = get_lbp (i); - if (attr != 1 << LBP_XX) - { - fprintf (stream, "0x%04X", i); -#define PRINT_BIT(attr,bit) \ - if (attr & (1 << bit)) fprintf (stream, " " #bit); - PRINT_BIT(attr,LBP_BK); - PRINT_BIT(attr,LBP_CM); - PRINT_BIT(attr,LBP_ZW); - PRINT_BIT(attr,LBP_IN); - PRINT_BIT(attr,LBP_GL); - PRINT_BIT(attr,LBP_CB); - PRINT_BIT(attr,LBP_SP); - PRINT_BIT(attr,LBP_BA); - PRINT_BIT(attr,LBP_BB); - PRINT_BIT(attr,LBP_B2); - PRINT_BIT(attr,LBP_HY); - PRINT_BIT(attr,LBP_NS); - PRINT_BIT(attr,LBP_OP); - PRINT_BIT(attr,LBP_CL); - PRINT_BIT(attr,LBP_QU); - PRINT_BIT(attr,LBP_EX); - PRINT_BIT(attr,LBP_ID); - PRINT_BIT(attr,LBP_NU); - PRINT_BIT(attr,LBP_IS); - PRINT_BIT(attr,LBP_SY); - PRINT_BIT(attr,LBP_AL); - PRINT_BIT(attr,LBP_PR); - PRINT_BIT(attr,LBP_PO); - PRINT_BIT(attr,LBP_SA); - PRINT_BIT(attr,LBP_XX); - PRINT_BIT(attr,LBP_AI); -#undef PRINT_BIT - fprintf (stream, "\n"); - } - } -} - -static void -debug_output_tables (const char *filename) -{ - FILE *stream; - - stream = fopen (filename, "w"); - if (stream == NULL) - { - fprintf (stderr, "cannot open '%s' for writing\n", filename); - exit (1); - } - - debug_output_lbp (stream); - - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error writing to '%s'\n", filename); - exit (1); - } -} - -/* The line breaking property from the LineBreak.txt file. */ -int unicode_org_lbp[0x110000]; - -/* Stores in unicode_org_lbp[] the line breaking property from the - LineBreak.txt file. */ -static void -fill_org_lbp (const char *linebreak_filename) -{ - unsigned int i, j; - FILE *stream; - char field0[FIELDLEN]; - char field1[FIELDLEN]; - char field2[FIELDLEN]; - int lineno = 0; - - for (i = 0; i < 0x110000; i++) - unicode_org_lbp[i] = LBP_XX; - - stream = fopen (linebreak_filename, "r"); - if (stream == NULL) - { - fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename); - exit (1); - } - - for (;;) - { - int n; - int c; - int value; - - lineno++; - c = getc (stream); - if (c == EOF) - break; - if (c == '#') - { - do c = getc (stream); while (c != EOF && c != '\n'); - continue; - } - ungetc (c, stream); - n = getfield (stream, field0, ';'); - n += getfield (stream, field1, ' '); - n += getfield (stream, field2, '\n'); - if (n == 0) - break; - if (n != 3) - { - fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, - lineno); - exit (1); - } -#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; - if (false) {} - TRY(LBP_BK) - TRY(LBP_CM) - TRY(LBP_ZW) - TRY(LBP_IN) - TRY(LBP_GL) - TRY(LBP_CB) - TRY(LBP_SP) - TRY(LBP_BA) - TRY(LBP_BB) - TRY(LBP_B2) - TRY(LBP_HY) - TRY(LBP_NS) - TRY(LBP_OP) - TRY(LBP_CL) - TRY(LBP_QU) - TRY(LBP_EX) - TRY(LBP_ID) - TRY(LBP_NU) - TRY(LBP_IS) - TRY(LBP_SY) - TRY(LBP_AL) - TRY(LBP_PR) - TRY(LBP_PO) - TRY(LBP_SA) - TRY(LBP_XX) - TRY(LBP_AI) -#undef TRY - else if (strcmp (field1, "LF") == 0) value = LBP_BK; - else if (strcmp (field1, "CR") == 0) value = LBP_BK; - else if (strcmp (field1, "SG") == 0) value = LBP_XX; - else - { - fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", - field1, linebreak_filename, lineno); - exit (1); - } - i = strtoul (field0, NULL, 16); - if (strstr (field0, "..") != NULL) - { - /* Deal with a range. */ - j = strtoul (strstr (field0, "..") + 2, NULL, 16); - for (; i <= j; i++) - unicode_org_lbp[i] = value; - } - else - { - /* Single character line. */ - unicode_org_lbp[i] = value; - } - } - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error reading from '%s'\n", linebreak_filename); - exit (1); - } -} - -/* Output the line breaking properties in a human readable format. */ -static void -debug_output_org_lbp (FILE *stream) -{ - unsigned int i; - - for (i = 0; i < 0x110000; i++) - { - int attr = unicode_org_lbp[i]; - if (attr != LBP_XX) - { - fprintf (stream, "0x%04X", i); -#define PRINT_BIT(attr,bit) \ - if (attr == bit) fprintf (stream, " " #bit); - PRINT_BIT(attr,LBP_BK); - PRINT_BIT(attr,LBP_CM); - PRINT_BIT(attr,LBP_ZW); - PRINT_BIT(attr,LBP_IN); - PRINT_BIT(attr,LBP_GL); - PRINT_BIT(attr,LBP_CB); - PRINT_BIT(attr,LBP_SP); - PRINT_BIT(attr,LBP_BA); - PRINT_BIT(attr,LBP_BB); - PRINT_BIT(attr,LBP_B2); - PRINT_BIT(attr,LBP_HY); - PRINT_BIT(attr,LBP_NS); - PRINT_BIT(attr,LBP_OP); - PRINT_BIT(attr,LBP_CL); - PRINT_BIT(attr,LBP_QU); - PRINT_BIT(attr,LBP_EX); - PRINT_BIT(attr,LBP_ID); - PRINT_BIT(attr,LBP_NU); - PRINT_BIT(attr,LBP_IS); - PRINT_BIT(attr,LBP_SY); - PRINT_BIT(attr,LBP_AL); - PRINT_BIT(attr,LBP_PR); - PRINT_BIT(attr,LBP_PO); - PRINT_BIT(attr,LBP_SA); - PRINT_BIT(attr,LBP_XX); - PRINT_BIT(attr,LBP_AI); -#undef PRINT_BIT - fprintf (stream, "\n"); - } - } -} - -static void -debug_output_org_tables (const char *filename) -{ - FILE *stream; - - stream = fopen (filename, "w"); - if (stream == NULL) - { - fprintf (stderr, "cannot open '%s' for writing\n", filename); - exit (1); - } - - debug_output_org_lbp (stream); - - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error writing to '%s'\n", filename); - exit (1); - } -} - -/* Construction of sparse 3-level tables. */ -#define TABLE lbp_table -#define ELEMENT unsigned char -#define DEFAULT LBP_XX -#define xmalloc malloc -#define xrealloc realloc -#include "3level.h" - -static void -output_lbp (FILE *stream1, FILE *stream2) -{ - unsigned int i; - struct lbp_table t; - unsigned int level1_offset, level2_offset, level3_offset; - - t.p = 7; - t.q = 9; - lbp_table_init (&t); - - for (i = 0; i < 0x110000; i++) - { - int attr = get_lbp (i); - - /* Now attr should contain exactly one bit. */ - if (attr == 0 || ((attr & (attr - 1)) != 0)) - abort (); - - if (attr != 1 << LBP_XX) - { - unsigned int log2_attr; - for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); - - lbp_table_add (&t, i, log2_attr); - } - } - - lbp_table_finalize (&t); - - level1_offset = - 5 * sizeof (uint32_t); - level2_offset = - 5 * sizeof (uint32_t) - + t.level1_size * sizeof (uint32_t); - level3_offset = - 5 * sizeof (uint32_t) - + t.level1_size * sizeof (uint32_t) - + (t.level2_size << t.q) * sizeof (uint32_t); - - for (i = 0; i < 5; i++) - fprintf (stream1, "#define lbrkprop_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); - fprintf (stream1, "\n"); - fprintf (stream1, "typedef struct\n"); - fprintf (stream1, " {\n"); - fprintf (stream1, " int level1[%d];\n", t.level1_size); - fprintf (stream1, " int level2[%d << %d];\n", t.level2_size, t.q); - fprintf (stream1, " unsigned char level3[%d << %d];\n", t.level3_size, t.p); - fprintf (stream1, " }\n"); - fprintf (stream1, "lbrkprop_t;\n"); - fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n"); - - fprintf (stream2, "const lbrkprop_t unilbrkprop =\n"); - fprintf (stream2, "{\n"); - fprintf (stream2, " {"); - for (i = 0; i < t.level1_size; i++) - { - uint32_t offset; - if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); - offset = ((uint32_t *) (t.result + level1_offset))[i]; - fprintf (stream2, " %5d%s", - offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t), - (i+1 < t.level1_size ? "," : "")); - } - fprintf (stream2, " },\n"); - fprintf (stream2, " {"); - if (t.level2_size << t.q > 8) - fprintf (stream2, "\n "); - for (i = 0; i < t.level2_size << t.q; i++) - { - uint32_t offset; - if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); - offset = ((uint32_t *) (t.result + level2_offset))[i]; - fprintf (stream2, " %5d%s", - offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t), - (i+1 < t.level2_size << t.q ? "," : "")); - } - if (t.level2_size << t.q > 8) - fprintf (stream2, "\n "); - fprintf (stream2, " },\n"); - fprintf (stream2, " {"); - if (t.level3_size << t.p > 8) - fprintf (stream2, "\n "); - for (i = 0; i < t.level3_size << t.p; i++) - { - unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; - const char *value_string; - switch (value) - { -#define CASE(x) case x: value_string = #x; break; - CASE(LBP_BK); - CASE(LBP_CM); - CASE(LBP_ZW); - CASE(LBP_IN); - CASE(LBP_GL); - CASE(LBP_CB); - CASE(LBP_SP); - CASE(LBP_BA); - CASE(LBP_BB); - CASE(LBP_B2); - CASE(LBP_HY); - CASE(LBP_NS); - CASE(LBP_OP); - CASE(LBP_CL); - CASE(LBP_QU); - CASE(LBP_EX); - CASE(LBP_ID); - CASE(LBP_NU); - CASE(LBP_IS); - CASE(LBP_SY); - CASE(LBP_AL); - CASE(LBP_PR); - CASE(LBP_PO); - CASE(LBP_SA); - CASE(LBP_XX); - CASE(LBP_AI); -#undef CASE - default: - abort (); - } - if (i > 0 && (i % 8) == 0) - fprintf (stream2, "\n "); - fprintf (stream2, " %s%s", value_string, - (i+1 < t.level3_size << t.p ? "," : "")); - } - if (t.level3_size << t.p > 8) - fprintf (stream2, "\n "); - fprintf (stream2, " }\n"); - fprintf (stream2, "};\n"); -} - -static void -output_tables (const char *filename1, const char *filename2, const char *version) -{ - const char *filenames[2]; - FILE *streams[2]; - size_t i; - - filenames[0] = filename1; - filenames[1] = filename2; - - for (i = 0; i < 2; i++) - { - streams[i] = fopen (filenames[i], "w"); - if (streams[i] == NULL) - { - fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); - exit (1); - } - } - - for (i = 0; i < 2; i++) - { - FILE *stream = streams[i]; - - fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n", - version); - fprintf (stream, "\n"); - - /* Put a GPL header on it. The gnulib module is under LGPL (although it - still carries the GPL header), and it's gnulib-tool which replaces the - GPL header with an LGPL header. */ - fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n"); - fprintf (stream, "\n"); - fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); - fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); - fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); - fprintf (stream, " (at your option) any later version.\n"); - fprintf (stream, "\n"); - fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); - fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); - fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); - fprintf (stream, " GNU General Public License for more details.\n"); - fprintf (stream, "\n"); - fprintf (stream, " You should have received a copy of the GNU General Public License\n"); - fprintf (stream, " along with this program. If not, see . */\n"); - fprintf (stream, "\n"); - } - - output_lbp (streams[0], streams[1]); - - for (i = 0; i < 2; i++) - { - if (ferror (streams[i]) || fclose (streams[i])) - { - fprintf (stderr, "error writing to '%s'\n", filenames[i]); - exit (1); - } - } -} - -int -main (int argc, char * argv[]) -{ - if (argc != 5) - { - fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt LineBreak.txt version\n", - argv[0]); - exit (1); - } - - fill_attributes (argv[1]); - fill_width (argv[2]); - fill_org_lbp (argv[3]); - - debug_output_tables ("lbrkprop.txt"); - debug_output_org_tables ("lbrkprop_org.txt"); - - output_tables ("lbrkprop1.h", "lbrkprop2.h", argv[4]); - - return 0; -} diff --git a/gnulib-local/modules/gen-lbrkprop b/gnulib-local/modules/gen-lbrkprop deleted file mode 100644 index de117307e..000000000 --- a/gnulib-local/modules/gen-lbrkprop +++ /dev/null @@ -1,21 +0,0 @@ -Description: -Generates lbrkprop.h. - -Files: -lib/gen-lbrkprop.c -lib/3level.h - -Depends-on: - -configure.ac: - -Makefile.am: - -Include: - -License: -GPLed build tool - -Maintainer: -Bruno Haible -