]> git.ipfire.org Git - thirdparty/glibc.git/blob - localedata/unicode-gen/gen_unicode_ctype.py
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / localedata / unicode-gen / gen_unicode_ctype.py
1 #!/usr/bin/python3
2 #
3 # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4 # Copyright (C) 2014-2017 Free Software Foundation, Inc.
5 # This file is part of the GNU C Library.
6 # Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
7 #
8 # The GNU C Library is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU Lesser General Public
10 # License as published by the Free Software Foundation; either
11 # version 2.1 of the License, or (at your option) any later version.
12 #
13 # The GNU C Library is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 # Lesser General Public License for more details.
17 #
18 # You should have received a copy of the GNU Lesser General Public
19 # License along with the GNU C Library; if not, see
20 # <http://www.gnu.org/licenses/>.
21
22 '''
23 Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
24 DerivedCoreProperties.txt files.
25
26 To see how this script is used, call it with the “-h” option:
27
28 $ ./gen_unicode_ctype.py -h
29 … prints usage message …
30 '''
31
32 import argparse
33 import time
34 import re
35 import unicode_utils
36
def code_point_ranges(is_class_function, code_points=None):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Each range is a list of one element (an isolated code point) or two
    elements (the inclusive endpoints of a contiguous run).

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]

    is_class_function -- predicate called with each code point.
    code_points       -- optional iterable of code points to scan, in
                         ascending order; defaults to all code points in
                         unicode_utils.UNICODE_ATTRIBUTES.  Making this a
                         parameter keeps the range-compression logic pure
                         and testable.
    '''
    if code_points is None:
        code_points = sorted(unicode_utils.UNICODE_ATTRIBUTES)
    cp_ranges = []
    for code_point in code_points:
        if is_class_function(code_point):
            # Extend the previous range when this code point is adjacent
            # to it, otherwise start a new single-element range.
            if (cp_ranges
                    and cp_ranges[-1][-1] == code_point - 1):
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                cp_ranges.append([code_point])
    return cp_ranges
57
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Writes the class name followed by all matching code points as
    <Uxxxx> symbols / symbol ranges, semicolon-separated, wrapped at
    max_column with a '/' continuation character.

    i18n_file         -- open file object to write to.
    class_name        -- class keyword written before the data
                         (e.g. 'upper' or 'class "combining";').
    is_class_function -- predicate selecting the code points.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
    <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    # If the class is empty, write nothing at all (not even the keyword).
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        max_column = 75
        prefix = ' '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            # A non-empty line already holds a previous entry, so a
            # separator is needed before appending the next one.
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            # Flush the current line (with continuation marker) before
            # the new entry would push it past max_column.
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        # Flush the final partially-filled line, if any.
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
91
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Writes the map name followed by (<from>,<to>) pairs for every code
    point that map_function changes, semicolon-separated, wrapped at
    max_column with a '/' continuation character.

    i18n_file    -- open file object to write to.
    map_name     -- map keyword written before the data
                    (e.g. 'toupper' or 'map "totitle";').
    map_function -- mapping applied to each code point; identity
                    mappings are omitted from the output.

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
    (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = ' '
    line = prefix
    map_string = ''
    # Unlike output_charclass, the keyword line is written even if the
    # map turns out to be empty.
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only emit pairs where the mapping actually changes the code point.
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            # Flush the current line (with continuation marker) before
            # the new pair would push it past max_column.
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    # Flush the final partially-filled line, if any.
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
125
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp.  All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns a (head, tail) pair of strings: everything up to and
    including the "LC_CTYPE" line (with the date stamp refreshed), and
    everything from the "translit_start" line to the end of the file.
    '''
    head_lines = []
    tail_lines = []
    today = time.strftime('%Y-%m-%d')
    date_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    with open(filename, mode='r') as i18n_file:
        lines = iter(i18n_file)
        # Phase 1: collect the head, refreshing the date stamp,
        # up to and including the LC_CTYPE line.
        for line in lines:
            stamped = date_pattern.match(line)
            if stamped:
                line = stamped.group('key') + '"{:s}"\n'.format(today)
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Phase 2: skip the old character class data until the
        # translit_start line, which begins the tail.
        for line in lines:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                break
        # Phase 3: the rest of the file belongs to the tail verbatim.
        tail_lines.extend(lines)
    return (''.join(head_lines), ''.join(tail_lines))
155
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    i18n_file       -- open file object to write to.
    unicode_version -- Unicode version string used in the generated
                       comments and LC_IDENTIFICATION fields.
    head            -- head captured from an original i18n file by
                       read_input_file(); when non-empty it is written
                       unchanged, otherwise a minimal default header is
                       generated.
    '''
    # NOTE: the original tested "ARGS.input_file and head", coupling this
    # function to the module-level ARGS global.  head is only ever
    # non-empty when an input file was read, so testing head alone is
    # equivalent and keeps the function usable without CLI parsing.
    if head:
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "unicode:2014";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
189
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    i18n_file -- open file object to write to.
    tail      -- tail captured from an original i18n file by
                 read_input_file(); when non-empty it is written
                 unchanged, otherwise the closing END line is generated.
    '''
    # NOTE: the original tested "ARGS.input_file and tail", coupling this
    # function to the module-level ARGS global.  tail is only ever
    # non-empty when an input file was read, so testing tail alone is
    # equivalent and keeps the function usable without CLI parsing.
    if tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file

    i18n_file       -- open file object to write to.
    unicode_version -- Unicode version string for the generated comments.
    turkish         -- if True, use the Turkish case conversions
                       (dotted/dotless i handling) for toupper/tolower.
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # "outdigit" is deliberately left to localedef's default ("0".."9");
    # emitting it here would prevent locales that copy this file from
    # defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
262
if __name__ == "__main__":
    # Command-line driver: parse options, load the Unicode data files
    # into unicode_utils, and write the generated i18n file.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Populate the unicode_utils module-level tables from the data files
    # and run its internal consistency checks before generating output.
    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    HEAD = TAIL = ''
    # When an original i18n file is given, preserve everything outside
    # the LC_CTYPE character classes (see read_input_file).
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)