]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgo/go/exp/norm/composition.go
libgo: Update to current sources.
[thirdparty/gcc.git] / libgo / go / exp / norm / composition.go
CommitLineData
adb0401d
ILT
1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package norm
6
9c63abc9 7import "unicode/utf8"
adb0401d
ILT
8
9const (
d8f41257
ILT
10 maxCombiningChars = 30
11 maxBufferSize = maxCombiningChars + 2 // +1 to hold starter +1 to hold CGJ
adb0401d
ILT
12 maxBackRunes = maxCombiningChars - 1
13 maxNFCExpansion = 3 // NFC(0x1D160)
14 maxNFKCExpansion = 18 // NFKC(0xFDFA)
15
d8f41257 16 maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
adb0401d
ILT
17)
18
19// reorderBuffer is used to normalize a single segment. Characters inserted with
d8f41257 20// insert are decomposed and reordered based on CCC. The compose method can
adb0401d
ILT
21// be used to recombine characters. Note that the byte buffer does not hold
22// the UTF-8 characters in order. Only the rune array is maintained in sorted
d8f41257 23// order. flush writes the resulting segment to a byte array.
adb0401d 24type reorderBuffer struct {
4ccad563
ILT
25 rune [maxBufferSize]Properties // Per character info.
26 byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
27 nrune int // Number of runeInfos.
28 nbyte uint8 // Number or bytes.
adb0401d 29 f formInfo
d8f41257
ILT
30
31 src input
32 nsrc int
33 srcBytes inputBytes
34 srcString inputString
35 tmpBytes inputBytes
36}
37
38func (rb *reorderBuffer) init(f Form, src []byte) {
39 rb.f = *formTable[f]
40 rb.srcBytes = inputBytes(src)
41 rb.src = &rb.srcBytes
42 rb.nsrc = len(src)
43}
44
45func (rb *reorderBuffer) initString(f Form, src string) {
46 rb.f = *formTable[f]
47 rb.srcString = inputString(src)
48 rb.src = &rb.srcString
49 rb.nsrc = len(src)
adb0401d
ILT
50}
51
52// reset discards all characters from the buffer.
53func (rb *reorderBuffer) reset() {
54 rb.nrune = 0
55 rb.nbyte = 0
56}
57
58// flush appends the normalized segment to out and resets rb.
59func (rb *reorderBuffer) flush(out []byte) []byte {
60 for i := 0; i < rb.nrune; i++ {
61 start := rb.rune[i].pos
62 end := start + rb.rune[i].size
63 out = append(out, rb.byte[start:end]...)
64 }
65 rb.reset()
66 return out
67}
68
501699af
ILT
69// flushCopy copies the normalized segment to buf and resets rb.
70// It returns the number of bytes written to buf.
71func (rb *reorderBuffer) flushCopy(buf []byte) int {
72 p := 0
73 for i := 0; i < rb.nrune; i++ {
74 runep := rb.rune[i]
75 p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
76 }
77 rb.reset()
78 return p
79}
80
adb0401d
ILT
81// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
82// It returns false if the buffer is not large enough to hold the rune.
d8f41257 83// It is used internally by insert and insertString only.
4ccad563 84func (rb *reorderBuffer) insertOrdered(info Properties) bool {
adb0401d 85 n := rb.nrune
d8f41257 86 if n >= maxCombiningChars+1 {
adb0401d
ILT
87 return false
88 }
89 b := rb.rune[:]
90 cc := info.ccc
91 if cc > 0 {
92 // Find insertion position + move elements to make room.
93 for ; n > 0; n-- {
94 if b[n-1].ccc <= cc {
95 break
96 }
97 b[n] = b[n-1]
98 }
99 }
100 rb.nrune += 1
101 pos := uint8(rb.nbyte)
d8f41257 102 rb.nbyte += utf8.UTFMax
adb0401d
ILT
103 info.pos = pos
104 b[n] = info
105 return true
106}
107
108// insert inserts the given rune in the buffer ordered by CCC.
109// It returns true if the buffer was large enough to hold the decomposed rune.
4ccad563 110func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
501699af
ILT
111 if rune := src.hangul(i); rune != 0 {
112 return rb.decomposeHangul(rune)
adb0401d 113 }
94252f4b 114 if info.hasDecomposition() {
4ccad563 115 return rb.insertDecomposed(info.Decomposition())
501699af
ILT
116 }
117 return rb.insertSingle(src, i, info)
118}
119
120// insertDecomposed inserts an entry in to the reorderBuffer for each rune
121// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
122func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
123 saveNrune, saveNbyte := rb.nrune, rb.nbyte
124 rb.tmpBytes = inputBytes(dcomp)
125 for i := 0; i < len(dcomp); {
126 info := rb.f.info(&rb.tmpBytes, i)
d8f41257 127 pos := rb.nbyte
adb0401d 128 if !rb.insertOrdered(info) {
501699af 129 rb.nrune, rb.nbyte = saveNrune, saveNbyte
adb0401d
ILT
130 return false
131 }
501699af
ILT
132 i += copy(rb.byte[pos:], dcomp[i:i+int(info.size)])
133 }
134 return true
135}
136
137// insertSingle inserts an entry in the reorderBuffer for the rune at
138// position i. info is the runeInfo for the rune at position i.
4ccad563 139func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) bool {
501699af
ILT
140 // insertOrder changes nbyte
141 pos := rb.nbyte
142 if !rb.insertOrdered(info) {
143 return false
adb0401d 144 }
501699af 145 src.copySlice(rb.byte[pos:], i, i+int(info.size))
adb0401d
ILT
146 return true
147}
148
149// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
94252f4b 150func (rb *reorderBuffer) appendRune(r rune) {
adb0401d 151 bn := rb.nbyte
506cf9aa 152 sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
d8f41257 153 rb.nbyte += utf8.UTFMax
4ccad563 154 rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
adb0401d
ILT
155 rb.nrune++
156}
157
158// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
94252f4b 159func (rb *reorderBuffer) assignRune(pos int, r rune) {
d8f41257 160 bn := rb.rune[pos].pos
506cf9aa 161 sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
4ccad563 162 rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
adb0401d
ILT
163}
164
165// runeAt returns the rune at position n. It is used for Hangul and recomposition.
94252f4b 166func (rb *reorderBuffer) runeAt(n int) rune {
adb0401d 167 inf := rb.rune[n]
506cf9aa 168 r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
94252f4b 169 return r
adb0401d
ILT
170}
171
172// bytesAt returns the UTF-8 encoding of the rune at position n.
173// It is used for Hangul and recomposition.
174func (rb *reorderBuffer) bytesAt(n int) []byte {
175 inf := rb.rune[n]
176 return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
177}
178
179// For Hangul we combine algorithmically, instead of using tables.
180const (
181 hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
182 hangulBase0 = 0xEA
183 hangulBase1 = 0xB0
184 hangulBase2 = 0x80
185
186 hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
187 hangulEnd0 = 0xED
188 hangulEnd1 = 0x9E
189 hangulEnd2 = 0xA4
190
191 jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
192 jamoLBase0 = 0xE1
193 jamoLBase1 = 0x84
194 jamoLEnd = 0x1113
195 jamoVBase = 0x1161
196 jamoVEnd = 0x1176
197 jamoTBase = 0x11A7
198 jamoTEnd = 0x11C3
199
200 jamoTCount = 28
201 jamoVCount = 21
202 jamoVTCount = 21 * 28
203 jamoLVTCount = 19 * 21 * 28
204)
205
501699af
ILT
206const hangulUTF8Size = 3
207
adb0401d 208func isHangul(b []byte) bool {
501699af
ILT
209 if len(b) < hangulUTF8Size {
210 return false
211 }
adb0401d
ILT
212 b0 := b[0]
213 if b0 < hangulBase0 {
214 return false
215 }
216 b1 := b[1]
217 switch {
218 case b0 == hangulBase0:
219 return b1 >= hangulBase1
220 case b0 < hangulEnd0:
221 return true
222 case b0 > hangulEnd0:
223 return false
224 case b1 < hangulEnd1:
225 return true
226 }
227 return b1 == hangulEnd1 && b[2] < hangulEnd2
228}
229
adb0401d 230func isHangulString(b string) bool {
501699af
ILT
231 if len(b) < hangulUTF8Size {
232 return false
233 }
adb0401d
ILT
234 b0 := b[0]
235 if b0 < hangulBase0 {
236 return false
237 }
238 b1 := b[1]
239 switch {
240 case b0 == hangulBase0:
241 return b1 >= hangulBase1
242 case b0 < hangulEnd0:
243 return true
244 case b0 > hangulEnd0:
245 return false
246 case b1 < hangulEnd1:
247 return true
248 }
249 return b1 == hangulEnd1 && b[2] < hangulEnd2
250}
251
252// Caller must ensure len(b) >= 2.
253func isJamoVT(b []byte) bool {
254 // True if (rune & 0xff00) == jamoLBase
255 return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
256}
257
258func isHangulWithoutJamoT(b []byte) bool {
259 c, _ := utf8.DecodeRune(b)
260 c -= hangulBase
261 return c < jamoLVTCount && c%jamoTCount == 0
262}
263
501699af
ILT
264// decomposeHangul writes the decomposed Hangul to buf and returns the number
265// of bytes written. len(buf) should be at least 9.
266func decomposeHangul(buf []byte, r rune) int {
267 const JamoUTF8Len = 3
268 r -= hangulBase
269 x := r % jamoTCount
270 r /= jamoTCount
271 utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
272 utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
273 if x != 0 {
274 utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
275 return 3 * JamoUTF8Len
276 }
277 return 2 * JamoUTF8Len
278}
279
adb0401d
ILT
280// decomposeHangul algorithmically decomposes a Hangul rune into
281// its Jamo components.
282// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
94252f4b 283func (rb *reorderBuffer) decomposeHangul(r rune) bool {
adb0401d
ILT
284 b := rb.rune[:]
285 n := rb.nrune
286 if n+3 > len(b) {
287 return false
288 }
506cf9aa
ILT
289 r -= hangulBase
290 x := r % jamoTCount
291 r /= jamoTCount
292 rb.appendRune(jamoLBase + r/jamoVCount)
293 rb.appendRune(jamoVBase + r%jamoVCount)
adb0401d
ILT
294 if x != 0 {
295 rb.appendRune(jamoTBase + x)
296 }
297 return true
298}
299
300// combineHangul algorithmically combines Jamo character components into Hangul.
301// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
d8f41257 302func (rb *reorderBuffer) combineHangul(s, i, k int) {
adb0401d
ILT
303 b := rb.rune[:]
304 bn := rb.nrune
d8f41257 305 for ; i < bn; i++ {
adb0401d
ILT
306 cccB := b[k-1].ccc
307 cccC := b[i].ccc
308 if cccB == 0 {
309 s = k - 1
310 }
311 if s != k-1 && cccB >= cccC {
312 // b[i] is blocked by greater-equal cccX below it
313 b[k] = b[i]
314 k++
315 } else {
316 l := rb.runeAt(s) // also used to compare to hangulBase
317 v := rb.runeAt(i) // also used to compare to jamoT
318 switch {
319 case jamoLBase <= l && l < jamoLEnd &&
320 jamoVBase <= v && v < jamoVEnd:
321 // 11xx plus 116x to LV
322 rb.assignRune(s, hangulBase+
323 (l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
324 case hangulBase <= l && l < hangulEnd &&
325 jamoTBase < v && v < jamoTEnd &&
326 ((l-hangulBase)%jamoTCount) == 0:
327 // ACxx plus 11Ax to LVT
328 rb.assignRune(s, l+v-jamoTBase)
329 default:
330 b[k] = b[i]
331 k++
332 }
333 }
334 }
335 rb.nrune = k
336}
337
338// compose recombines the runes in the buffer.
339// It should only be used to recompose a single segment, as it will not
340// handle alternations between Hangul and non-Hangul characters correctly.
341func (rb *reorderBuffer) compose() {
342 // UAX #15, section X5 , including Corrigendum #5
343 // "In any character sequence beginning with starter S, a character C is
344 // blocked from S if and only if there is some character B between S
345 // and C, and either B is a starter or it has the same or higher
346 // combining class as C."
d8f41257
ILT
347 bn := rb.nrune
348 if bn == 0 {
349 return
350 }
adb0401d
ILT
351 k := 1
352 b := rb.rune[:]
adb0401d
ILT
353 for s, i := 0, 1; i < bn; i++ {
354 if isJamoVT(rb.bytesAt(i)) {
355 // Redo from start in Hangul mode. Necessary to support
356 // U+320E..U+321E in NFKC mode.
d8f41257 357 rb.combineHangul(s, i, k)
adb0401d
ILT
358 return
359 }
360 ii := b[i]
361 // We can only use combineForward as a filter if we later
362 // get the info for the combined character. This is more
363 // expensive than using the filter. Using combinesBackward()
364 // is safe.
94252f4b 365 if ii.combinesBackward() {
adb0401d
ILT
366 cccB := b[k-1].ccc
367 cccC := ii.ccc
368 blocked := false // b[i] blocked by starter or greater or equal CCC?
369 if cccB == 0 {
370 s = k - 1
371 } else {
372 blocked = s != k-1 && cccB >= cccC
373 }
374 if !blocked {
375 combined := combine(rb.runeAt(s), rb.runeAt(i))
376 if combined != 0 {
377 rb.assignRune(s, combined)
378 continue
379 }
380 }
381 }
382 b[k] = b[i]
383 k++
384 }
385 rb.nrune = k
386}