]>
Commit | Line | Data |
---|---|---|
053c52b1 FW |
1 | #!/usr/bin/python3 |
2 | # Generate the locale/C-translit.h file. | |
6d7e8eda | 3 | # Copyright (C) 2018-2023 Free Software Foundation, Inc. |
053c52b1 FW |
4 | # This file is part of the GNU C Library. |
5 | # | |
6 | # The GNU C Library is free software; you can redistribute it and/or | |
7 | # modify it under the terms of the GNU Lesser General Public | |
8 | # License as published by the Free Software Foundation; either | |
9 | # version 2.1 of the License, or (at your option) any later version. | |
10 | # | |
11 | # The GNU C Library is distributed in the hope that it will be useful, | |
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | # Lesser General Public License for more details. | |
15 | # | |
16 | # You should have received a copy of the GNU Lesser General Public | |
17 | # License along with the GNU C Library; if not, see | |
5a82c748 | 18 | # <https://www.gnu.org/licenses/>. |
053c52b1 FW |
19 | |
20 | import re | |
21 | import sys | |
22 | ||
23 | ||
class StringLiteral:
    r"""Source of a string literal and its decomposition into code points.

    The literal is given without its surrounding double quotes.  Three
    escapes are understood: backslash-backslash, backslash-quote, and
    \x followed by hexadecimal digits.  As in C, a hexadecimal escape
    extends over every hexadecimal digit that follows it and is ended
    by the first character that is not one.

    Attributes:
      source   the text passed to the constructor, escapes unexpanded
      decoded  tuple of integer code points after expanding the escapes
    """

    _HEX_DIGITS = "0123456789abcdefABCDEF"

    def __init__(self, s):
        """Decode S into code points.

        Raises ValueError if a backslash is followed by a character
        other than a backslash, a double quote or 'x', or if S ends
        directly after a backslash.
        """
        # States:
        #  0  regular character sequence
        #  1  backslash seen
        #  2  in hexadecimal escape sequence
        state = 0
        result = []
        for ch in s:
            if state == 2:
                if ch in self._HEX_DIGITS:
                    # Accumulate into the code point opened by "\x".
                    result[-1] = result[-1] * 16 + int(ch, 16)
                    continue
                # The escape is over.  CH itself still has to be
                # decoded, so fall through and treat it as a regular
                # character (it used to be dropped silently).
                state = 0
            if state == 0:
                if ch == '\\':
                    state = 1
                else:
                    result.append(ord(ch))
            elif ch in "\\\"":
                # State 1: escaped backslash or double quote.
                result.append(ord(ch))
                state = 0
            elif ch == 'x':
                # State 1: start of a hexadecimal escape.  The new
                # code point starts at zero and is filled in state 2.
                state = 2
                result.append(0)
            else:
                raise ValueError("invalid character {!r} in {!r}".format(
                    ch, s))
        if state == 1:
            raise ValueError("trailing backslash in {!r}".format(s))

        self.source = s
        self.decoded = tuple(result)
62 | ||
63 | ||
class Translit:
    """Pair of transliteration and source.

    Attributes:
      codepoints   StringLiteral for the first quoted string on the line
      replacement  StringLiteral for the second quoted string on the line
    """

    # Two double-quoted strings separated by white space, optionally
    # followed by a '#' comment.  The first string must be non-empty
    # and may contain hexadecimal escapes; the second may be empty and
    # may only escape the double quote and the backslash.
    __RE_TRANSLIT = re.compile(
        r'^"((?:[^"\\]|\\x[0-9a-fA-F])+)"\s+'
        r'"((?:[^"\\]|\\["\\])*)"\s*(?:#.*)?$')

    def __init__(self, line, lineno=None):
        """Parse LINE into its two string literals.

        LINENO is the zero-based number of LINE in the input and is
        only used for the error message.  Raises IOError if LINE does
        not have the expected form.
        """
        match = self.__RE_TRANSLIT.match(line)
        if not match:
            if lineno is None:
                # Callers that predate the LINENO parameter keep the
                # current line number in a module-level variable of
                # the same name; use it if it exists so that their
                # diagnostics are unchanged.
                lineno = globals().get("lineno")
            if lineno is None:
                where = ""
            else:
                where = " {}".format(lineno + 1)
            raise IOError("invalid line{}: {!r}".format(where, line))
        codepoints, replacement = match.groups()
        self.codepoints = StringLiteral(codepoints)
        self.replacement = StringLiteral(replacement)
79 | ||
80 | ||
# List of Translit objects.
translits = []

# Read transliterations from standard input.
for lineno, line in enumerate(sys.stdin):
    line = line.strip()
    # Skip empty lines and comments.
    if (not line) or line[0] == '#':
        continue
    translit = Translit(line)
    # Check ordering of codepoints.  Entries must be strictly
    # increasing, which also rejects duplicates.
    if translits \
       and translit.codepoints.decoded <= translits[-1].codepoints.decoded:
        raise IOError("unexpected codepoint {!r} on line {}: {!r}".format(
            translit.codepoints.decoded, lineno + 1, line))
    translits.append(translit)

# Generate the C sources.
write = sys.stdout.write


def _write_index_table(name, lengths):
    """Write the uint32_t array NAME of running offsets.

    LENGTHS gives, per entry, the number of array elements the entry
    occupies in the matching string table.  Element I of the output is
    the sum of the lengths before it.
    """
    # The continuation indent is two columns, matching COL below.
    write("static const uint32_t {}[] =\n{{\n  ".format(name))
    col = 2
    offset = 0
    for position, length in enumerate(lengths):
        if position > 0:
            # Break the line if ", " plus a four-column number would
            # reach column 79.
            if col + 7 >= 79:
                write(",\n  ")
                col = 2
            else:
                write(", ")
                col += 2
        write("{:4}".format(offset))
        offset += length
        col += 4
    write("\n};\n")


def _write_string_table(name, sources, suffix=""):
    """Write the wchar_t string table NAME.

    Each element of SOURCES becomes one wide string literal with
    SUFFIX appended inside the quotes.  Consecutive entries are
    separated by an L"\\0" literal; adjacent literals are concatenated
    by the C compiler.
    """
    write("static const wchar_t {}[] =\n ".format(name))
    col = 1
    for position, source in enumerate(sources):
        if position > 0:
            # Separator between entries; it is written at column 1 so
            # that it stands out from the entries at column 2.
            if col + 6 >= 79:
                write("\n ")
                col = 1
            write(" L\"\\0\"")
            col += 6
        literal = "L\"{}{}\"".format(source, suffix)
        # Wrap if a space plus the literal would reach column 79,
        # unless we are already at the start of a line.
        if col > 2 and col + len(literal) + 1 >= 79:
            write("\n  ")
            col = 2
        else:
            write(" ")
            col += 1
        write(literal)
        col += len(literal)
    write(";\n")


write("#include <stdint.h>\n")
write("#define NTRANSLIT {}\n".format(len(translits)))

# Each source entry occupies its code points plus the separating NUL.
_write_index_table(
    "translit_from_idx",
    [len(t.codepoints.decoded) + 1 for t in translits])
_write_string_table(
    "translit_from_tbl",
    [t.codepoints.source for t in translits])

# Each replacement carries an explicit trailing NUL in addition to the
# separating one, hence two extra elements per entry.
_write_index_table(
    "translit_to_idx",
    [len(t.replacement.decoded) + 2 for t in translits])
_write_string_table(
    "translit_to_tbl",
    [t.replacement.source for t in translits],
    suffix="\\0")