]>
Commit | Line | Data |
---|---|---|
adb0401d ILT |
1 | // Copyright 2011 The Go Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package norm | |
6 | ||
9c63abc9 | 7 | import "unicode/utf8" |
adb0401d ILT |
8 | |
9 | const ( | |
d8f41257 ILT |
10 | maxCombiningChars = 30 |
11 | maxBufferSize = maxCombiningChars + 2 // +1 to hold starter +1 to hold CGJ | |
adb0401d ILT |
12 | maxBackRunes = maxCombiningChars - 1 |
13 | maxNFCExpansion = 3 // NFC(0x1D160) | |
14 | maxNFKCExpansion = 18 // NFKC(0xFDFA) | |
15 | ||
d8f41257 | 16 | maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128 |
adb0401d ILT |
17 | ) |
18 | ||
19 | // reorderBuffer is used to normalize a single segment. Characters inserted with | |
d8f41257 | 20 | // insert are decomposed and reordered based on CCC. The compose method can |
adb0401d ILT |
21 | // be used to recombine characters. Note that the byte buffer does not hold |
22 | // the UTF-8 characters in order. Only the rune array is maintained in sorted | |
d8f41257 | 23 | // order. flush writes the resulting segment to a byte array. |
adb0401d | 24 | type reorderBuffer struct { |
4ccad563 ILT |
25 | rune [maxBufferSize]Properties // Per character info. |
26 | byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos. | |
27 | nrune int // Number of runeInfos. | |
28 | nbyte uint8 // Number or bytes. | |
adb0401d | 29 | f formInfo |
d8f41257 ILT |
30 | |
31 | src input | |
32 | nsrc int | |
33 | srcBytes inputBytes | |
34 | srcString inputString | |
35 | tmpBytes inputBytes | |
36 | } | |
37 | ||
38 | func (rb *reorderBuffer) init(f Form, src []byte) { | |
39 | rb.f = *formTable[f] | |
40 | rb.srcBytes = inputBytes(src) | |
41 | rb.src = &rb.srcBytes | |
42 | rb.nsrc = len(src) | |
43 | } | |
44 | ||
45 | func (rb *reorderBuffer) initString(f Form, src string) { | |
46 | rb.f = *formTable[f] | |
47 | rb.srcString = inputString(src) | |
48 | rb.src = &rb.srcString | |
49 | rb.nsrc = len(src) | |
adb0401d ILT |
50 | } |
51 | ||
52 | // reset discards all characters from the buffer. | |
53 | func (rb *reorderBuffer) reset() { | |
54 | rb.nrune = 0 | |
55 | rb.nbyte = 0 | |
56 | } | |
57 | ||
58 | // flush appends the normalized segment to out and resets rb. | |
59 | func (rb *reorderBuffer) flush(out []byte) []byte { | |
60 | for i := 0; i < rb.nrune; i++ { | |
61 | start := rb.rune[i].pos | |
62 | end := start + rb.rune[i].size | |
63 | out = append(out, rb.byte[start:end]...) | |
64 | } | |
65 | rb.reset() | |
66 | return out | |
67 | } | |
68 | ||
501699af ILT |
69 | // flushCopy copies the normalized segment to buf and resets rb. |
70 | // It returns the number of bytes written to buf. | |
71 | func (rb *reorderBuffer) flushCopy(buf []byte) int { | |
72 | p := 0 | |
73 | for i := 0; i < rb.nrune; i++ { | |
74 | runep := rb.rune[i] | |
75 | p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size]) | |
76 | } | |
77 | rb.reset() | |
78 | return p | |
79 | } | |
80 | ||
adb0401d ILT |
81 | // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class. |
82 | // It returns false if the buffer is not large enough to hold the rune. | |
d8f41257 | 83 | // It is used internally by insert and insertString only. |
4ccad563 | 84 | func (rb *reorderBuffer) insertOrdered(info Properties) bool { |
adb0401d | 85 | n := rb.nrune |
d8f41257 | 86 | if n >= maxCombiningChars+1 { |
adb0401d ILT |
87 | return false |
88 | } | |
89 | b := rb.rune[:] | |
90 | cc := info.ccc | |
91 | if cc > 0 { | |
92 | // Find insertion position + move elements to make room. | |
93 | for ; n > 0; n-- { | |
94 | if b[n-1].ccc <= cc { | |
95 | break | |
96 | } | |
97 | b[n] = b[n-1] | |
98 | } | |
99 | } | |
100 | rb.nrune += 1 | |
101 | pos := uint8(rb.nbyte) | |
d8f41257 | 102 | rb.nbyte += utf8.UTFMax |
adb0401d ILT |
103 | info.pos = pos |
104 | b[n] = info | |
105 | return true | |
106 | } | |
107 | ||
108 | // insert inserts the given rune in the buffer ordered by CCC. | |
109 | // It returns true if the buffer was large enough to hold the decomposed rune. | |
4ccad563 | 110 | func (rb *reorderBuffer) insert(src input, i int, info Properties) bool { |
501699af ILT |
111 | if rune := src.hangul(i); rune != 0 { |
112 | return rb.decomposeHangul(rune) | |
adb0401d | 113 | } |
94252f4b | 114 | if info.hasDecomposition() { |
4ccad563 | 115 | return rb.insertDecomposed(info.Decomposition()) |
501699af ILT |
116 | } |
117 | return rb.insertSingle(src, i, info) | |
118 | } | |
119 | ||
120 | // insertDecomposed inserts an entry in to the reorderBuffer for each rune | |
121 | // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes. | |
122 | func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool { | |
123 | saveNrune, saveNbyte := rb.nrune, rb.nbyte | |
124 | rb.tmpBytes = inputBytes(dcomp) | |
125 | for i := 0; i < len(dcomp); { | |
126 | info := rb.f.info(&rb.tmpBytes, i) | |
d8f41257 | 127 | pos := rb.nbyte |
adb0401d | 128 | if !rb.insertOrdered(info) { |
501699af | 129 | rb.nrune, rb.nbyte = saveNrune, saveNbyte |
adb0401d ILT |
130 | return false |
131 | } | |
501699af ILT |
132 | i += copy(rb.byte[pos:], dcomp[i:i+int(info.size)]) |
133 | } | |
134 | return true | |
135 | } | |
136 | ||
137 | // insertSingle inserts an entry in the reorderBuffer for the rune at | |
138 | // position i. info is the runeInfo for the rune at position i. | |
4ccad563 | 139 | func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) bool { |
501699af ILT |
140 | // insertOrder changes nbyte |
141 | pos := rb.nbyte | |
142 | if !rb.insertOrdered(info) { | |
143 | return false | |
adb0401d | 144 | } |
501699af | 145 | src.copySlice(rb.byte[pos:], i, i+int(info.size)) |
adb0401d ILT |
146 | return true |
147 | } | |
148 | ||
149 | // appendRune inserts a rune at the end of the buffer. It is used for Hangul. | |
94252f4b | 150 | func (rb *reorderBuffer) appendRune(r rune) { |
adb0401d | 151 | bn := rb.nbyte |
506cf9aa | 152 | sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
d8f41257 | 153 | rb.nbyte += utf8.UTFMax |
4ccad563 | 154 | rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)} |
adb0401d ILT |
155 | rb.nrune++ |
156 | } | |
157 | ||
158 | // assignRune sets a rune at position pos. It is used for Hangul and recomposition. | |
94252f4b | 159 | func (rb *reorderBuffer) assignRune(pos int, r rune) { |
d8f41257 | 160 | bn := rb.rune[pos].pos |
506cf9aa | 161 | sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
4ccad563 | 162 | rb.rune[pos] = Properties{pos: bn, size: uint8(sz)} |
adb0401d ILT |
163 | } |
164 | ||
165 | // runeAt returns the rune at position n. It is used for Hangul and recomposition. | |
94252f4b | 166 | func (rb *reorderBuffer) runeAt(n int) rune { |
adb0401d | 167 | inf := rb.rune[n] |
506cf9aa | 168 | r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size]) |
94252f4b | 169 | return r |
adb0401d ILT |
170 | } |
171 | ||
172 | // bytesAt returns the UTF-8 encoding of the rune at position n. | |
173 | // It is used for Hangul and recomposition. | |
174 | func (rb *reorderBuffer) bytesAt(n int) []byte { | |
175 | inf := rb.rune[n] | |
176 | return rb.byte[inf.pos : int(inf.pos)+int(inf.size)] | |
177 | } | |
178 | ||
179 | // For Hangul we combine algorithmically, instead of using tables. | |
180 | const ( | |
181 | hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80 | |
182 | hangulBase0 = 0xEA | |
183 | hangulBase1 = 0xB0 | |
184 | hangulBase2 = 0x80 | |
185 | ||
186 | hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4 | |
187 | hangulEnd0 = 0xED | |
188 | hangulEnd1 = 0x9E | |
189 | hangulEnd2 = 0xA4 | |
190 | ||
191 | jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00 | |
192 | jamoLBase0 = 0xE1 | |
193 | jamoLBase1 = 0x84 | |
194 | jamoLEnd = 0x1113 | |
195 | jamoVBase = 0x1161 | |
196 | jamoVEnd = 0x1176 | |
197 | jamoTBase = 0x11A7 | |
198 | jamoTEnd = 0x11C3 | |
199 | ||
200 | jamoTCount = 28 | |
201 | jamoVCount = 21 | |
202 | jamoVTCount = 21 * 28 | |
203 | jamoLVTCount = 19 * 21 * 28 | |
204 | ) | |
205 | ||
501699af ILT |
206 | const hangulUTF8Size = 3 |
207 | ||
adb0401d | 208 | func isHangul(b []byte) bool { |
501699af ILT |
209 | if len(b) < hangulUTF8Size { |
210 | return false | |
211 | } | |
adb0401d ILT |
212 | b0 := b[0] |
213 | if b0 < hangulBase0 { | |
214 | return false | |
215 | } | |
216 | b1 := b[1] | |
217 | switch { | |
218 | case b0 == hangulBase0: | |
219 | return b1 >= hangulBase1 | |
220 | case b0 < hangulEnd0: | |
221 | return true | |
222 | case b0 > hangulEnd0: | |
223 | return false | |
224 | case b1 < hangulEnd1: | |
225 | return true | |
226 | } | |
227 | return b1 == hangulEnd1 && b[2] < hangulEnd2 | |
228 | } | |
229 | ||
adb0401d | 230 | func isHangulString(b string) bool { |
501699af ILT |
231 | if len(b) < hangulUTF8Size { |
232 | return false | |
233 | } | |
adb0401d ILT |
234 | b0 := b[0] |
235 | if b0 < hangulBase0 { | |
236 | return false | |
237 | } | |
238 | b1 := b[1] | |
239 | switch { | |
240 | case b0 == hangulBase0: | |
241 | return b1 >= hangulBase1 | |
242 | case b0 < hangulEnd0: | |
243 | return true | |
244 | case b0 > hangulEnd0: | |
245 | return false | |
246 | case b1 < hangulEnd1: | |
247 | return true | |
248 | } | |
249 | return b1 == hangulEnd1 && b[2] < hangulEnd2 | |
250 | } | |
251 | ||
252 | // Caller must ensure len(b) >= 2. | |
253 | func isJamoVT(b []byte) bool { | |
254 | // True if (rune & 0xff00) == jamoLBase | |
255 | return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1 | |
256 | } | |
257 | ||
258 | func isHangulWithoutJamoT(b []byte) bool { | |
259 | c, _ := utf8.DecodeRune(b) | |
260 | c -= hangulBase | |
261 | return c < jamoLVTCount && c%jamoTCount == 0 | |
262 | } | |
263 | ||
501699af ILT |
264 | // decomposeHangul writes the decomposed Hangul to buf and returns the number |
265 | // of bytes written. len(buf) should be at least 9. | |
266 | func decomposeHangul(buf []byte, r rune) int { | |
267 | const JamoUTF8Len = 3 | |
268 | r -= hangulBase | |
269 | x := r % jamoTCount | |
270 | r /= jamoTCount | |
271 | utf8.EncodeRune(buf, jamoLBase+r/jamoVCount) | |
272 | utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount) | |
273 | if x != 0 { | |
274 | utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x) | |
275 | return 3 * JamoUTF8Len | |
276 | } | |
277 | return 2 * JamoUTF8Len | |
278 | } | |
279 | ||
adb0401d ILT |
280 | // decomposeHangul algorithmically decomposes a Hangul rune into |
281 | // its Jamo components. | |
282 | // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul. | |
94252f4b | 283 | func (rb *reorderBuffer) decomposeHangul(r rune) bool { |
adb0401d ILT |
284 | b := rb.rune[:] |
285 | n := rb.nrune | |
286 | if n+3 > len(b) { | |
287 | return false | |
288 | } | |
506cf9aa ILT |
289 | r -= hangulBase |
290 | x := r % jamoTCount | |
291 | r /= jamoTCount | |
292 | rb.appendRune(jamoLBase + r/jamoVCount) | |
293 | rb.appendRune(jamoVBase + r%jamoVCount) | |
adb0401d ILT |
294 | if x != 0 { |
295 | rb.appendRune(jamoTBase + x) | |
296 | } | |
297 | return true | |
298 | } | |
299 | ||
300 | // combineHangul algorithmically combines Jamo character components into Hangul. | |
301 | // See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul. | |
d8f41257 | 302 | func (rb *reorderBuffer) combineHangul(s, i, k int) { |
adb0401d ILT |
303 | b := rb.rune[:] |
304 | bn := rb.nrune | |
d8f41257 | 305 | for ; i < bn; i++ { |
adb0401d ILT |
306 | cccB := b[k-1].ccc |
307 | cccC := b[i].ccc | |
308 | if cccB == 0 { | |
309 | s = k - 1 | |
310 | } | |
311 | if s != k-1 && cccB >= cccC { | |
312 | // b[i] is blocked by greater-equal cccX below it | |
313 | b[k] = b[i] | |
314 | k++ | |
315 | } else { | |
316 | l := rb.runeAt(s) // also used to compare to hangulBase | |
317 | v := rb.runeAt(i) // also used to compare to jamoT | |
318 | switch { | |
319 | case jamoLBase <= l && l < jamoLEnd && | |
320 | jamoVBase <= v && v < jamoVEnd: | |
321 | // 11xx plus 116x to LV | |
322 | rb.assignRune(s, hangulBase+ | |
323 | (l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount) | |
324 | case hangulBase <= l && l < hangulEnd && | |
325 | jamoTBase < v && v < jamoTEnd && | |
326 | ((l-hangulBase)%jamoTCount) == 0: | |
327 | // ACxx plus 11Ax to LVT | |
328 | rb.assignRune(s, l+v-jamoTBase) | |
329 | default: | |
330 | b[k] = b[i] | |
331 | k++ | |
332 | } | |
333 | } | |
334 | } | |
335 | rb.nrune = k | |
336 | } | |
337 | ||
338 | // compose recombines the runes in the buffer. | |
339 | // It should only be used to recompose a single segment, as it will not | |
340 | // handle alternations between Hangul and non-Hangul characters correctly. | |
341 | func (rb *reorderBuffer) compose() { | |
342 | // UAX #15, section X5 , including Corrigendum #5 | |
343 | // "In any character sequence beginning with starter S, a character C is | |
344 | // blocked from S if and only if there is some character B between S | |
345 | // and C, and either B is a starter or it has the same or higher | |
346 | // combining class as C." | |
d8f41257 ILT |
347 | bn := rb.nrune |
348 | if bn == 0 { | |
349 | return | |
350 | } | |
adb0401d ILT |
351 | k := 1 |
352 | b := rb.rune[:] | |
adb0401d ILT |
353 | for s, i := 0, 1; i < bn; i++ { |
354 | if isJamoVT(rb.bytesAt(i)) { | |
355 | // Redo from start in Hangul mode. Necessary to support | |
356 | // U+320E..U+321E in NFKC mode. | |
d8f41257 | 357 | rb.combineHangul(s, i, k) |
adb0401d ILT |
358 | return |
359 | } | |
360 | ii := b[i] | |
361 | // We can only use combineForward as a filter if we later | |
362 | // get the info for the combined character. This is more | |
363 | // expensive than using the filter. Using combinesBackward() | |
364 | // is safe. | |
94252f4b | 365 | if ii.combinesBackward() { |
adb0401d ILT |
366 | cccB := b[k-1].ccc |
367 | cccC := ii.ccc | |
368 | blocked := false // b[i] blocked by starter or greater or equal CCC? | |
369 | if cccB == 0 { | |
370 | s = k - 1 | |
371 | } else { | |
372 | blocked = s != k-1 && cccB >= cccC | |
373 | } | |
374 | if !blocked { | |
375 | combined := combine(rb.runeAt(s), rb.runeAt(i)) | |
376 | if combined != 0 { | |
377 | rb.assignRune(s, combined) | |
378 | continue | |
379 | } | |
380 | } | |
381 | } | |
382 | b[k] = b[i] | |
383 | k++ | |
384 | } | |
385 | rb.nrune = k | |
386 | } |