# [thirdparty/glibc.git] / localedata / unicode-gen / unicode_utils.py

# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2017 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''

import sys
import re


# Common locale header.
#
# Multi-line text constant.  The literal starts with a newline, so the
# text begins with an empty line; every following line starts with '%'.
# NOTE(review): presumably this is written at the top of the generated
# locale files, where '%' would be the comment character -- the code
# using it is not part of this module, confirm against the callers.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# It is filled by fill_attributes(), which calls fill_attribute() once
# for every code point.  The keys are the code points as integers.
# The values of 'upper', 'lower' and 'title' are integers or None, all
# the other values are the unmodified strings from the file.
# Surrogates (category 'Cs') are skipped by fill_attributes() and
# therefore never show up here.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# It is filled by fill_derived_core_properties().  The keys are the
# code points as integers, the values are lists of property names in
# the order in which they were found in the file.  Code points which
# are not mentioned in the file have no entry at all.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# NOTE(review): the Unicode Character Database seems to publish this
# file under the name EastAsianWidth.txt (singular) -- confirm.
#
# It is filled by fill_east_asian_widths().  The keys are the code
# points as integers, the values are the width property strings.  If a
# code point is listed more than once, the last entry wins.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}

def fill_attribute(code_point, fields):
    '''Records one UnicodeData.txt entry in UNICODE_ATTRIBUTES.

    code_point is the integer used as the key, fields is the list of
    the semicolon separated columns of one line of UnicodeData.txt
    (indexes 1 to 14 are read, index 0 is not used here).

    The columns 1 to 11 are stored as the strings they are.  The three
    case mapping columns 12 to 14 are parsed as hexadecimal numbers,
    an empty column is stored as None.  An existing entry for the same
    code point is replaced.

    '''
    # (key, column index) of the columns which are stored unchanged.
    string_columns = (
        ('name', 1),            # Character name
        ('category', 2),        # General category
        ('combining', 3),       # Canonical combining classes
        ('bidi', 4),            # Bidirectional category
        ('decomposition', 5),   # Character decomposition mapping
        ('decdigit', 6),        # Decimal digit value
        ('digit', 7),           # Digit value
        ('numeric', 8),         # Numeric value
        ('mirrored', 9),        # mirrored
        ('oldname', 10),        # Old Unicode 1.0 name
        ('comment', 11),        # comment
    )
    # (key, column index) of the hexadecimal case mapping columns.
    case_columns = (
        ('upper', 12),          # Uppercase mapping
        ('lower', 13),          # Lowercase mapping
        ('title', 14),          # Titlecase mapping
    )
    entry = {}
    for key, index in string_columns:
        entry[key] = fields[index]
    for key, index in case_columns:
        hex_digits = fields[index]
        entry[key] = int(hex_digits, 16) if hex_digits else None
    # Store only a complete entry, a failure above leaves the
    # dictionary untouched.
    UNICODE_ATTRIBUTES[code_point] = entry

def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    filename is the path of the file to read.  Nothing is returned,
    the entries are stored by calling fill_attribute().  For a range,
    every code point from the first to the last one (inclusive) gets
    an entry whose name is the range name without the angle bracket
    and without the “, First”/“, Last” suffix.  Lines with the
    category “Cs” (surrogates) are skipped.

    If a line does not have exactly 15 fields, or if the “Last” line
    of a range does not agree with its “First” line in all fields but
    the code point, a message is written to sys.stderr and the program
    exits with status 1 (SystemExit is raised).
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of the pending “First” line of a range, empty when no
        # range has been started.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # Use sys.exit(), the plain exit() is only a helper
                # installed by the site module and may be missing.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Everything except the code point itself must be the
                # same on both lines.  This also catches a “Last” line
                # which has no “First” line in front of it.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []

def fill_derived_core_properties(filename):
    '''Reads the DerivedCoreProperties.txt file and adds its contents
    to the DERIVED_CORE_PROPERTIES dictionary.

    A line of that file describes either a range of code points:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or just one code point:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    The property name is appended to the list of every code point the
    line covers (both ends of a range included).  Lines which do not
    start with a code point, like comments or empty lines, are
    skipped.  filename is the path of the file, nothing is returned.

    '''
    line_pattern = (
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            parsed = re.match(line_pattern, line)
            if parsed is None:
                continue
            first, last, prop = parsed.group(
                'codepoint1', 'codepoint2', 'property')
            low = int(first, 16)
            # A single code point is a range which ends where it starts.
            high = int(last or first, 16)
            for code_point in range(low, high + 1):
                if code_point not in DERIVED_CORE_PROPERTIES:
                    DERIVED_CORE_PROPERTIES[code_point] = []
                DERIVED_CORE_PROPERTIES[code_point].append(prop)

def fill_east_asian_widths(filename):
    '''Reads the East Asian width data file and adds its contents
    to the EAST_ASIAN_WIDTHS dictionary.

    A line of that file describes either a range of code points:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or just one code point:

    A015;W           # Lm         YI SYLLABLE WU

    The width property is stored for every code point the line covers
    (both ends of a range included), replacing a value stored before.
    Lines which do not start with a code point, like comments or empty
    lines, are skipped.  filename is the path of the file, nothing is
    returned.
    '''
    line_pattern = (
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            parsed = re.match(line_pattern, line)
            if parsed is None:
                continue
            first, last = parsed.group('codepoint1', 'codepoint2')
            low = int(first, 16)
            # A single code point is a range which ends where it starts.
            high = int(last or first, 16)
            for code_point in range(low, high + 1):
                EAST_ASIAN_WIDTHS[code_point] = parsed.group('property')

def to_upper(code_point):
    '''Maps a code point to its uppercase code point.

    The code point itself is returned if its UNICODE_ATTRIBUTES entry
    has an empty name or no uppercase mapping.  A code point without
    an entry raises KeyError.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return code_point
    return attributes['upper'] or code_point

def to_lower(code_point):
    '''Maps a code point to its lowercase code point.

    The code point itself is returned if its UNICODE_ATTRIBUTES entry
    has an empty name or no lowercase mapping.  A code point without
    an entry raises KeyError.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return code_point
    return attributes['lower'] or code_point

def to_upper_turkish(code_point):
    '''Maps a code point to its uppercase code point using the
    Turkish rules.

    The only difference to to_upper() is that U+0069 (i) is mapped to
    U+0130 (capital I with dot above).'''
    if code_point != 0x0069:
        return to_upper(code_point)
    return 0x0130

def to_lower_turkish(code_point):
    '''Maps a code point to its lowercase code point using the
    Turkish rules.

    The only difference to to_lower() is that U+0049 (I) is mapped to
    U+0131 (small dotless i).'''
    if code_point != 0x0049:
        return to_lower(code_point)
    return 0x0131

def to_title(code_point):
    '''Maps a code point to its titlecase code point.

    The code point itself is returned if its UNICODE_ATTRIBUTES entry
    has an empty name or no titlecase mapping.  A code point without
    an entry raises KeyError.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return code_point
    return attributes['title'] or code_point

def is_upper(code_point):
    '''Tells whether the code point is an uppercase character.

    That is the case if it has a lowercase mapping to a different
    code point or if DERIVED_CORE_PROPERTIES lists it as “Uppercase”.'''
    if to_lower(code_point) != code_point:
        return True
    if code_point not in DERIVED_CORE_PROPERTIES:
        return False
    return 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]

def is_lower(code_point):
    '''Tells whether the code point is a lowercase character.

    That is the case if it has an uppercase mapping to a different
    code point, if it is U+00DF, or if DERIVED_CORE_PROPERTIES lists
    it as “Lowercase”.'''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # There are characters which DerivedCoreProperties.txt defines as
    # “Lowercase” although they have no mapping to upper case, for
    # example ꜰ “LATIN LETTER SMALL CAPITAL F”.
    if code_point not in DERIVED_CORE_PROPERTIES:
        return False
    return 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]

def is_alpha(code_point):
    '''Tells whether the code point is an alphabetic character.

    That is the case if DERIVED_CORE_PROPERTIES lists it as
    “Alphabetic” or if it is a decimal digit (category “Nd”) other
    than the ASCII digits 0 to 9.'''
    if (code_point in DERIVED_CORE_PROPERTIES
            and 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    if UNICODE_ATTRIBUTES[code_point]['category'] != 'Nd':
        return False
    return code_point < 0x0030 or code_point > 0x0039

def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Only the ASCII digits U+0030..U+0039 are digits, the result is
    True or False.'''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative which is not used because of that would be to
    # accept every named character of the category “Nd”.
    # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero.  With that alternative one would have to add <0> in front
    # of them by hand.
    return 0x0030 <= code_point <= 0x0039

def is_outdigit(code_point):
    '''Tells whether the code point is an outdigit, which is the case
    for the ASCII digits U+0030..U+0039 only'''
    first_digit, last_digit = 0x0030, 0x0039
    return first_digit <= code_point <= last_digit

def is_blank(code_point):
    '''Tells whether the code point is a blank character.

    Blank are the tab U+0009 and the named characters of the category
    “Zs” whose decomposition does not mention “<noBreak>”.  For a
    character with an empty name the (false) name is returned.'''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return attributes['name']
    # Category Zs without mention of '<noBreak>'
    if attributes['category'] != 'Zs':
        return False
    return '<noBreak>' not in attributes['decomposition']

def is_space(code_point):
    '''Tells whether the code point is a space character.

    Space characters are the six ASCII white space characters and the
    named characters of the categories “Zl”, “Zp” and, unless the
    decomposition mentions “<noBreak>”, “Zs”.  For a character with
    an empty name the (false) name is returned.'''
    ascii_spaces = (
        0x0020, # ' '
        0x000C, # '\f'
        0x000A, # '\n'
        0x000D, # '\r'
        0x0009, # '\t'
        0x000B, # '\v'
    )
    if code_point in ascii_spaces:
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    category = attributes['category']
    if category in ['Zl', 'Zp']:
        return True
    if category not in ['Zs']:
        return False
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return '<noBreak>' not in attributes['decomposition']

def is_cntrl(code_point):
    '''Tells whether the code point is a control character.

    Control characters are the characters named “<control>” and the
    named characters of the categories “Zl” and “Zp”.  For a
    character with an empty name the (false) name is returned.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    if name == '<control>':
        return True
    return attributes['category'] in ['Zl', 'Zp']

def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal digits,
    the result is True or False.'''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)

def is_graph(code_point):
    '''Tells whether the code point is a graphical character.

    Graphical are the named characters which are neither named
    “<control>” nor space characters according to is_space().  For a
    character with an empty name the (false) name is returned.'''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return not is_space(code_point)

def is_print(code_point):
    '''Tells whether the code point is a printable character.

    Printable are the named characters which are neither named
    “<control>” nor in one of the categories “Zl” and “Zp”.  For a
    character with an empty name the (false) name is returned.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return attributes['category'] not in ['Zl', 'Zp']

def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    The result is true for every graphic character, see is_graph(),
    which is neither alphabetic, see is_alpha(), nor a digit, see
    is_digit().  If is_graph() returns a false value, that value is
    returned unchanged.'''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.
    #
    # The alternative which is not used would be to accept the named
    # characters whose category starts with “P”.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))

def is_combining(code_point):
    '''Tells whether the code point is a combining character.

    Combining are the named characters of the categories “Mn”, “Mc”
    and “Me”.  For a character with an empty name the (false) name is
    returned.'''
    # Up to Unicode 3.0.1 the Combining property was taken from the
    # PropList.txt file, where in 3.0.1 it was identical to the union
    # of the general categories "Mn", "Mc", "Me".  Unicode 3.1 dropped
    # the property from PropList.txt, so the latter definition is used.
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return attributes['name']
    return attributes['category'] in ['Mn', 'Mc', 'Me']

def is_combining_level3(code_point):
    '''Tells whether the code point is a combining level3 character.

    That is a combining character, see is_combining(), whose canonical
    combining class is at least 0 and less than 200.  If
    is_combining() returns a false value, that value is returned
    unchanged.'''
    combining = is_combining(code_point)
    if not combining:
        return combining
    combining_class = int(UNICODE_ATTRIBUTES[code_point]['combining'])
    return 0 <= combining_class < 200

def ucs_symbol(code_point):
    '''Formats a code point as a UCS symbol string like <U0041>.

    Code points below 0x10000 are written with four uppercase
    hexadecimal digits, all others with eight.'''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)

def ucs_symbol_range(code_point_low, code_point_high):
    '''Formats a code point range as two UCS symbols joined by “..”.

    Both code points are formatted by ucs_symbol().  Example:

    <U0041>..<U005A>
    '''
    low_symbol = ucs_symbol(code_point_low)
    high_symbol = ucs_symbol(code_point_high)
    return '..'.join((low_symbol, high_symbol))

def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Every code point stored in UNICODE_ATTRIBUTES is checked in
    ascending order.  For each restriction which is violated, one line
    describing the problem is written to sys.stderr.  Nothing is
    returned.
    '''
    # Pairs of character classes which must not have a character in
    # common.  The order is the order in which the violations are
    # reported for a code point.
    exclusive_classes = (
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        ('alpha', 'cntrl'),
        ('alpha', 'digit'),
        ('alpha', 'punct'),
        ('alpha', 'space'),
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha are covered by the checks of alpha.
        ('space', 'digit'),
        ('space', 'graph'),
        ('space', 'xdigit'),
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha are covered by the checks
        # of alpha.
        ('cntrl', 'digit'),
        ('cntrl', 'punct'),
        ('cntrl', 'graph'),
        ('cntrl', 'print'),
        ('cntrl', 'xdigit'),
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl are covered by the
        # pairs listed before, the <space> character is checked
        # separately in the loop.
        ('punct', 'digit'),
        ('punct', 'xdigit'),
        # graph restriction and print restriction: “No character
        # specified for the keyword cntrl shall be specified.”
        # Covered by the checks of cntrl.
    )
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        # The predicates only read the dictionaries, so it is enough
        # to evaluate each of them once for a code point.
        in_class = {
            'alpha': is_alpha(code_point),
            'cntrl': is_cntrl(code_point),
            'digit': is_digit(code_point),
            'graph': is_graph(code_point),
            'print': is_print(code_point),
            'punct': is_punct(code_point),
            'space': is_space(code_point),
            'xdigit': is_xdigit(code_point),
        }
        # toupper and tolower restriction: "Only characters specified
        # for the keywords lower and upper shall be specified.
        for mapping_name, mapped in (
                ('toupper', to_upper(code_point)),
                ('tolower', to_lower(code_point))):
            if mapped != code_point and not cased:
                sys.stderr.write(
                    ('%(sym)s is not upper|lower '
                     + 'but %(fn)s(0x%(c)04X) = 0x%(uc)04X\n') %{
                         'sym': sym,
                         'fn': mapping_name,
                         'c': code_point,
                         'uc': mapped})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if cased and not in_class['alpha']:
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': sym})
        for first, second in exclusive_classes:
            if in_class[first] and in_class[second]:
                sys.stderr.write('%(sym)s is %(a)s and %(b)s\n' %{
                    'sym': sym, 'a': first, 'b': second})
        # Last part of the punct restriction: not the <space> character.
        if in_class['punct'] and code_point == 0x0020:
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': sym})
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (in_class['print']
                and not (in_class['graph'] or in_class['space'])):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': sym})
        if (not in_class['print']
                and (in_class['graph'] or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': sym})
Commit	Line	Data
dd8e8e54 CD	1	# Utilities to generate Unicode data for glibc from upstream Unicode data.
dd8e8e54 CD	2	#
bfff8b1b	3	# Copyright (C) 2014-2017 Free Software Foundation, Inc.
dd8e8e54 CD	4	# This file is part of the GNU C Library.
	5	#
	6	# The GNU C Library is free software; you can redistribute it and/or
	7	# modify it under the terms of the GNU Lesser General Public
	8	# License as published by the Free Software Foundation; either
	9	# version 2.1 of the License, or (at your option) any later version.
	10	#
	11	# The GNU C Library is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	# Lesser General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU Lesser General Public
	17	# License along with the GNU C Library; if not, see
	18	# <http://www.gnu.org/licenses/>.
	19
	20	'''
	21	This module contains utilities used by the scripts to generate
	22	Unicode data for glibc from upstream Unicode data files.
	23	'''
	24
	25	import sys
	26	import re
	27
277da2ab MF	28
	29	# Common locale header.
	30	COMMENT_HEADER = """
	31	% This file is part of the GNU C Library and contains locale data.
	32	% The Free Software Foundation does not claim any copyright interest
	33	% in the locale data contained in this file. The foregoing does not
	34	% affect the license of the GNU C Library as a whole. It does not
	35	% exempt you from the conditions of the license if your use would
	36	% otherwise be governed by that license.
	37	"""
	38
dd8e8e54 CD	39	# Dictionary holding the entire contents of the UnicodeData.txt file
	40	#
	41	# Contents of this dictionary look like this:
	42	#
	43	# {0: {'category': 'Cc',
	44	# 'title': None,
	45	# 'digit': '',
	46	# 'name': '<control>',
	47	# 'bidi': 'BN',
	48	# 'combining': '0',
	49	# 'comment': '',
	50	# 'oldname': 'NULL',
	51	# 'decomposition': '',
	52	# 'upper': None,
	53	# 'mirrored': 'N',
	54	# 'lower': None,
	55	# 'decdigit': '',
	56	# 'numeric': ''},
	57	# …
	58	# }
	59	UNICODE_ATTRIBUTES = {}
	60
	61	# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
	62	#
	63	# Contents of this dictionary look like this:
	64	#
	65	# {917504: ['Default_Ignorable_Code_Point'],
	66	# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
	67	# …
	68	# }
	69	DERIVED_CORE_PROPERTIES = {}
	70
	71	# Dictionary holding the entire contents of the EastAsianWidths.txt file
	72	#
	73	# Contents of this dictionary look like this:
	74	#
	75	# {0: 'N', … , 45430: 'W', …}
	76	EAST_ASIAN_WIDTHS = {}
	77
	78	def fill_attribute(code_point, fields):
	79	'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
	80
	81	One entry in the UNICODE_ATTRIBUTES dictionary represents one line
	82	in the UnicodeData.txt file.
	83
	84	'''
	85	UNICODE_ATTRIBUTES[code_point] = {
	86	'name': fields[1], # Character name
	87	'category': fields[2], # General category
	88	'combining': fields[3], # Canonical combining classes
	89	'bidi': fields[4], # Bidirectional category
	90	'decomposition': fields[5], # Character decomposition mapping
	91	'decdigit': fields[6], # Decimal digit value
	92	'digit': fields[7], # Digit value
	93	'numeric': fields[8], # Numeric value
	94	'mirrored': fields[9], # mirrored
	95	'oldname': fields[10], # Old Unicode 1.0 name
	96	'comment': fields[11], # comment
	97	# Uppercase mapping
	98	'upper': int(fields[12], 16) if fields[12] else None,
	99	# Lowercase mapping
	100	'lower': int(fields[13], 16) if fields[13] else None,
	101	# Titlecase mapping
	102	'title': int(fields[14], 16) if fields[14] else None,
103	}
104
105	def fill_attributes(filename):
106	'''Stores the entire contents of the UnicodeData.txt file
107	in the UNICODE_ATTRIBUTES dictionary.
108
109	A typical line for a single code point in UnicodeData.txt looks
110	like this:
111
112	0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
113
114	Code point ranges are indicated by pairs of lines like this:
115
116	4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
117	9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
118	'''
119	with open(filename, mode='r') as unicode_data_file:
120	fields_start = []
121	for line in unicode_data_file:
122	fields = line.strip().split(';')
123	if len(fields) != 15:
124	sys.stderr.write(
125	'short line in file "%(f)s": %(l)s\n' %{
126	'f': filename, 'l': line})
127	exit(1)
128	if fields[2] == 'Cs':
129	# Surrogates are UTF-16 artefacts,
130	# not real characters. Ignore them.
131	fields_start = []
132	continue
133	if fields[1].endswith(', First>'):
134	fields_start = fields
135	fields_start[1] = fields_start[1].split(',')[0][1:]
136	continue
137	if fields[1].endswith(', Last>'):
138	fields[1] = fields[1].split(',')[0][1:]
139	if fields[1:] != fields_start[1:]:
140	sys.stderr.write(
141	'broken code point range in file "%(f)s": %(l)s\n' %{
142	'f': filename, 'l': line})
143	exit(1)
144	for code_point in range(
145	int(fields_start[0], 16),
146	int(fields[0], 16)+1):
147	fill_attribute(code_point, fields)
148	fields_start = []
149	continue
150	fill_attribute(int(fields[0], 16), fields)
151	fields_start = []
152
153	def fill_derived_core_properties(filename):
154	'''Stores the entire contents of the DerivedCoreProperties.txt file
155	in the DERIVED_CORE_PROPERTIES dictionary.
156
157	Lines in DerivedCoreProperties.txt are either a code point range like
158	this:
159
160	0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
161
162	or a single code point like this:
163
164	00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
165
166	'''
167	with open(filename, mode='r') as derived_core_properties_file:
168	for line in derived_core_properties_file:
169	match = re.match(
170	r'^(?P<codepoint1>[0-9A-F]{4,6})'
171	+ r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
172	+ r'\s;\s(?P<property>[a-zA-Z_]+)',
173	line)
174	if not match:
175	continue
176	start = match.group('codepoint1')
177	end = match.group('codepoint2')
178	if not end:
179	end = start
180	for code_point in range(int(start, 16), int(end, 16)+1):
181	prop = match.group('property')
182	if code_point in DERIVED_CORE_PROPERTIES:
183	DERIVED_CORE_PROPERTIES[code_point].append(prop)
184	else:
185	DERIVED_CORE_PROPERTIES[code_point] = [prop]
186
187	def fill_east_asian_widths(filename):
188	'''Stores the entire contents of the EastAsianWidths.txt file
189	in the EAST_ASIAN_WIDTHS dictionary.
190
191	Lines in EastAsianWidths.txt are either a code point range like
192	this:
193
194	9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
195
196	or a single code point like this:
197
198	A015;W # Lm YI SYLLABLE WU
199	'''
200	with open(filename, mode='r') as east_asian_widths_file:
201	for line in east_asian_widths_file:
202	match = re.match(
203	r'^(?P<codepoint1>[0-9A-F]{4,6})'
204	+r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
205	+r'\s;\s(?P<property>[a-zA-Z]+)',
206	line)
207	if not match:
208	continue
209	start = match.group('codepoint1')
210	end = match.group('codepoint2')
211	if not end:
212	end = start
213	for code_point in range(int(start, 16), int(end, 16)+1):
214	EAST_ASIAN_WIDTHS[code_point] = match.group('property')
215
216	def to_upper(code_point):
217	'''Returns the code point of the uppercase version
218	of the given code point'''
219	if (UNICODE_ATTRIBUTES[code_point]['name']
220	and UNICODE_ATTRIBUTES[code_point]['upper']):
221	return UNICODE_ATTRIBUTES[code_point]['upper']
222	else:
223	return code_point
224
225	def to_lower(code_point):
226	'''Returns the code point of the lowercase version
227	of the given code point'''
228	if (UNICODE_ATTRIBUTES[code_point]['name']
229	and UNICODE_ATTRIBUTES[code_point]['lower']):
230	return UNICODE_ATTRIBUTES[code_point]['lower']
231	else:
232	return code_point
233
85bafe6f JM	234	def to_upper_turkish(code_point):
	235	'''Returns the code point of the Turkish uppercase version
	236	of the given code point'''
	237	if code_point == 0x0069:
	238	return 0x0130
	239	return to_upper(code_point)
	240
	241	def to_lower_turkish(code_point):
	242	'''Returns the code point of the Turkish lowercase version
	243	of the given code point'''
	244	if code_point == 0x0049:
	245	return 0x0131
	246	return to_lower(code_point)
	247
dd8e8e54 CD	248	def to_title(code_point):
	249	'''Returns the code point of the titlecase version
	250	of the given code point'''
	251	if (UNICODE_ATTRIBUTES[code_point]['name']
	252	and UNICODE_ATTRIBUTES[code_point]['title']):
	253	return UNICODE_ATTRIBUTES[code_point]['title']
	254	else:
	255	return code_point
	256
	257	def is_upper(code_point):
	258	'''Checks whether the character with this code point is uppercase'''
	259	return (to_lower(code_point) != code_point
	260	or (code_point in DERIVED_CORE_PROPERTIES
	261	and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
	262
	263	def is_lower(code_point):
	264	'''Checks whether the character with this code point is lowercase'''
	265	# Some characters are defined as “Lowercase” in
	266	# DerivedCoreProperties.txt but do not have a mapping to upper
	267	# case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
	268	# one of these.
	269	return (to_upper(code_point) != code_point
	270	# <U00DF> is lowercase, but without simple to_upper mapping.
	271	or code_point == 0x00DF
	272	or (code_point in DERIVED_CORE_PROPERTIES
	273	and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
	274
	275	def is_alpha(code_point):
	276	'''Checks whether the character with this code point is alphabetic'''
	277	return ((code_point in DERIVED_CORE_PROPERTIES
	278	and
	279	'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
	280	or
	281	# Consider all the non-ASCII digits as alphabetic.
	282	# ISO C 99 forbids us to have them in category “digit”,
	283	# but we want iswalnum to return true on them.
	284	(UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
	285	and not (code_point >= 0x0030 and code_point <= 0x0039)))
	286
	287	def is_digit(code_point):
	288	'''Checks whether the character with this code point is a digit'''
	289	if False:
	290	return (UNICODE_ATTRIBUTES[code_point]['name']
	291	and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
	292	# Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
	293	# a zero. Must add <0> in front of them by hand.
	294	else:
	295	# SUSV2 gives us some freedom for the "digit" category, but ISO C 99
	296	# takes it away:
	297	# 7.25.2.1.5:
	298	# The iswdigit function tests for any wide character that
	299	# corresponds to a decimal-digit character (as defined in 5.2.1).
	300	# 5.2.1:
	301	# the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
	302	return (code_point >= 0x0030 and code_point <= 0x0039)
	303
	304	def is_outdigit(code_point):
	305	'''Checks whether the character with this code point is outdigit'''
	306	return (code_point >= 0x0030 and code_point <= 0x0039)
	307
	308	def is_blank(code_point):
	309	'''Checks whether the character with this code point is blank'''
	310	return (code_point == 0x0009 # '\t'
	311	# Category Zs without mention of '<noBreak>'
312	or (UNICODE_ATTRIBUTES[code_point]['name']
313	and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
314	and '<noBreak>' not in
315	UNICODE_ATTRIBUTES[code_point]['decomposition']))
316
def is_space(code_point):
    '''Checks whether the character with this code point is a space

    The result is meant to be used as a truth value; for a character
    without a name the (falsy) name itself is returned.
    '''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    ascii_whitespace = (
        0x0020, # ' '
        0x000C, # '\f'
        0x000A, # '\n'
        0x000D, # '\r'
        0x0009, # '\t'
        0x000B, # '\v'
    )
    if code_point in ascii_whitespace:
        return True
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and (attributes['category'] in ('Zl', 'Zp')
                 or (attributes['category'] == 'Zs'
                     and '<noBreak>' not in attributes['decomposition'])))
336
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character

    A character qualifies when it is named '<control>' or belongs to
    the general category Zl or Zp.  For a character without a name the
    (falsy) name itself is returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    return name == '<control>' or attributes['category'] in ('Zl', 'Zp')
344
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0-9, A-F and a-f qualify.  Returns a bool.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
365
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character

    A character qualifies when it has a name other than '<control>'
    and is not a space according to is_space().  For a character
    without a name the (falsy) name itself is returned.
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return not is_space(code_point)
372
def is_print(code_point):
    '''Checks whether the character with this code point is printable

    A character qualifies when it has a name other than '<control>'
    and is not in the general category Zl or Zp.  For a character
    without a name the (falsy) name itself is returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return attributes['category'] not in ('Zl', 'Zp')
378
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    The result is meant to be used as a truth value; when is_graph()
    yields a falsy value, that value is returned unchanged.
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  (The alternative, disabled definition
    # would be every named character whose general category starts
    # with 'P'.)
    return (is_graph(code_point)
            and not (is_alpha(code_point) or is_digit(code_point)))
390
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character

    For a character without a name the (falsy) name itself is returned.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    return attributes['category'] in ('Mn', 'Mc', 'Me')
401
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character

    That is a combining character (see is_combining()) whose
    'combining' attribute, read as an integer, lies in 0..199.
    '''
    combining = is_combining(code_point)
    if not combining:
        return combining
    return 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200
408
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with 4 uppercase hex digits,
    all others with 8, e.g. <U0041> and <U0001F600>.
    '''
    width = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, width)
415
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns the UCS symbol string for a code point range.

    Both ends are formatted with ucs_symbol() and joined by “..”.

    Example:

    <U0041>..<U005A>
    '''
    return '{}..{}'.format(ucs_symbol(code_point_low),
                           ucs_symbol(code_point_high))
424
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Walks over every code point in UNICODE_ATTRIBUTES in ascending
    order and writes one diagnostic line to stderr for each violated
    restriction.  Nothing is returned.
    '''
    # Pairs of character classes which must not both contain the same
    # character, listed in the order in which they are reported.
    exclusive_classes = (
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        ('alpha', 'cntrl'),
        ('alpha', 'digit'),
        ('alpha', 'punct'),
        ('alpha', 'space'),
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        ('space', 'digit'),
        ('space', 'graph'),
        ('space', 'xdigit'),
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        ('cntrl', 'digit'),
        ('cntrl', 'punct'),
        ('cntrl', 'graph'),
        ('cntrl', 'print'),
        ('cntrl', 'xdigit'),
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        ('punct', 'digit'),
        ('punct', 'xdigit'),
    )
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        upper = to_upper(code_point)
        lower = to_lower(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        # Classify the code point once; every check below reuses this.
        member = {
            'alpha': is_alpha(code_point),
            'cntrl': is_cntrl(code_point),
            'digit': is_digit(code_point),
            'graph': is_graph(code_point),
            'print': is_print(code_point),
            'punct': is_punct(code_point),
            'space': is_space(code_point),
            'xdigit': is_xdigit(code_point),
        }
        # toupper restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if upper != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper\\|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') % {
                     'sym': sym,
                     'c': code_point,
                     'uc': upper})
        # tolower restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if lower != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper\\|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') % {
                     'sym': sym,
                     'c': code_point,
                     'uc': lower})
        # alpha restriction: “Characters classified as either upper or lower
        # shall automatically belong to this class.”
        if cased and not member['alpha']:
            sys.stderr.write('%(sym)s is upper\\|lower but not alpha\n' % {
                'sym': sym})
        for first, second in exclusive_classes:
            if member[first] and member[second]:
                sys.stderr.write('%(sym)s is %(a)s and %(b)s\n' % {
                    'sym': sym,
                    'a': first,
                    'b': second})
        # Last part of the punct restriction: the <space> character itself
        # must not be punctuation.
        if member['punct'] and code_point == 0x0020:
            sys.stderr.write('%(sym)s is punct\n' % {
                'sym': sym})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if member['print'] and not (member['graph'] or member['space']):
            sys.stderr.write('%(sym)s is print but not graph\\|<space>\n' % {
                'sym': sym})
        if not member['print'] and (member['graph'] or code_point == 0x0020):
            sys.stderr.write('%(sym)s is graph\\|<space> but not print\n' % {
                'sym': sym})