/* Optimized strcasecmp implementation for PowerPC64.
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>
#include <locale-defines.h>

/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */

#ifndef USE_AS_STRNCASECMP
# define __STRCASECMP __strcasecmp
# define STRCASECMP strcasecmp
#else
# define __STRCASECMP __strncasecmp
# define STRCASECMP strncasecmp
#endif
/* Convert 16 bytes to lowercase and compare.
   Expects v1 = 0xbf.., v2 = 0x19.., v3 = 0x20.. splat constants (set up by
   the caller); lowercases v4 and v5 in place and leaves the byte-equality
   result in v7 / CR6 (vcmpequb. with the dot form sets CR6).  */
#define TOLOWER() \
	vaddubm	v8, v4, v1; \
	vaddubm	v7, v4, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v4, v7, v4, v8; \
	vaddubm	v8, v5, v1; \
	vaddubm	v7, v5, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v5, v7, v5, v8; \
	vcmpequb.	v7, v5, v4;

/*
 * Get 16 bytes for unaligned case.
 * reg1: Vector to hold next 16 bytes.
 * reg2: Address to read from.
 * reg3: Permute control vector.
 * v8: Tmp vector used to mask unwanted bytes.
 * v9: Tmp vector, 0 when null is found on first 16 bytes.
 * The second 16-byte load is skipped when the first chunk already
 * contains a NUL, to avoid reading past the end of the string.
 */
#ifdef __LITTLE_ENDIAN__
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	v8, -1; \
	vperm	v8, v8, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;
#else
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	v8, -1; \
	vperm	v8, reg1, v8, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, reg1, v9, reg3;
#endif

/* Check null in v4, v5 and convert to lower.  Falls through to
   L(null_found) when either vector contains a NUL byte.  */
#define CHECKNULLANDCONVERT() \
	vcmpequb.	v7, v0, v5; \
	beq	cr6, 3f; \
	vcmpequb.	v7, v0, v4; \
	beq	cr6, 3f; \
	b	L(null_found); \
	.align 4; \
3: \
	TOLOWER()
9250e661 | 94 | .machine power8 |
c8376f3e | 95 | |
96 | ENTRY (__STRCASECMP) | |
97 | #ifdef USE_AS_STRNCASECMP | |
98 | CALL_MCOUNT 3 | |
99 | #else | |
100 | CALL_MCOUNT 2 | |
101 | #endif | |
102 | #define rRTN r3 /* Return value */ | |
103 | #define rSTR1 r10 /* 1st string */ | |
104 | #define rSTR2 r4 /* 2nd string */ | |
105 | #define rCHAR1 r6 /* Byte read from 1st string */ | |
106 | #define rCHAR2 r7 /* Byte read from 2nd string */ | |
107 | #define rADDR1 r8 /* Address of tolower(rCHAR1) */ | |
108 | #define rADDR2 r12 /* Address of tolower(rCHAR2) */ | |
109 | #define rLWR1 r8 /* Word tolower(rCHAR1) */ | |
110 | #define rLWR2 r12 /* Word tolower(rCHAR2) */ | |
111 | #define rTMP r9 | |
112 | #define rLOC r11 /* Default locale address */ | |
113 | ||
114 | cmpd cr7, rRTN, rSTR2 | |
115 | ||
116 | /* Get locale address. */ | |
117 | ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) | |
118 | add rLOC, rTMP, __libc_tsd_LOCALE@tls | |
119 | ld rLOC, 0(rLOC) | |
120 | ||
121 | mr rSTR1, rRTN | |
122 | li rRTN, 0 | |
123 | beqlr cr7 | |
124 | #ifdef USE_AS_STRNCASECMP | |
125 | cmpdi cr7, r5, 0 | |
126 | beq cr7, L(retnull) | |
127 | cmpdi cr7, r5, 16 | |
128 | blt cr7, L(bytebybyte) | |
129 | #endif | |
130 | vspltisb v0, 0 | |
131 | vspltisb v8, -1 | |
132 | /* Check for null in initial characters. | |
133 | Check max of 16 char depending on the alignment. | |
134 | If null is present, proceed byte by byte. */ | |
135 | lvx v4, 0, rSTR1 | |
136 | #ifdef __LITTLE_ENDIAN__ | |
137 | lvsr v10, 0, rSTR1 /* Compute mask. */ | |
138 | vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */ | |
139 | #else | |
140 | lvsl v10, 0, rSTR1 | |
141 | vperm v9, v4, v8, v10 | |
142 | #endif | |
143 | vcmpequb. v9, v0, v9 /* Check for null bytes. */ | |
144 | bne cr6, L(bytebybyte) | |
145 | lvx v5, 0, rSTR2 | |
146 | /* Calculate alignment. */ | |
147 | #ifdef __LITTLE_ENDIAN__ | |
148 | lvsr v6, 0, rSTR2 | |
149 | vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */ | |
150 | #else | |
151 | lvsl v6, 0, rSTR2 | |
152 | vperm v9, v5, v8, v6 | |
153 | #endif | |
154 | vcmpequb. v9, v0, v9 /* Check for null bytes. */ | |
155 | bne cr6, L(bytebybyte) | |
156 | /* Check if locale has non ascii characters. */ | |
157 | ld rTMP, 0(rLOC) | |
158 | addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES | |
159 | lwz rTMP, 0(r6) | |
160 | cmpdi cr7, rTMP, 1 | |
161 | beq cr7, L(bytebybyte) | |
162 | ||
163 | /* Load vector registers with values used for TOLOWER. */ | |
164 | /* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */ | |
165 | vspltisb v3, 2 | |
166 | vspltisb v9, 4 | |
167 | vsl v3, v3, v9 | |
168 | vaddubm v1, v3, v3 | |
169 | vnor v1, v1, v1 | |
170 | vspltisb v2, 7 | |
171 | vsububm v2, v3, v2 | |
172 | ||
173 | andi. rADDR1, rSTR1, 0xF | |
174 | beq cr0, L(align) | |
175 | addi r6, rSTR1, 16 | |
176 | lvx v9, 0, r6 | |
177 | /* Compute 16 bytes from previous two loads. */ | |
178 | #ifdef __LITTLE_ENDIAN__ | |
179 | vperm v4, v9, v4, v10 | |
180 | #else | |
181 | vperm v4, v4, v9, v10 | |
182 | #endif | |
183 | L(align): | |
184 | andi. rADDR2, rSTR2, 0xF | |
185 | beq cr0, L(align1) | |
186 | addi r6, rSTR2, 16 | |
187 | lvx v9, 0, r6 | |
188 | /* Compute 16 bytes from previous two loads. */ | |
189 | #ifdef __LITTLE_ENDIAN__ | |
190 | vperm v5, v9, v5, v6 | |
191 | #else | |
192 | vperm v5, v5, v9, v6 | |
193 | #endif | |
194 | L(align1): | |
195 | CHECKNULLANDCONVERT() | |
196 | blt cr6, L(match) | |
197 | b L(different) | |
198 | .align 4 | |
199 | L(match): | |
200 | clrldi r6, rSTR1, 60 | |
201 | subfic r7, r6, 16 | |
202 | #ifdef USE_AS_STRNCASECMP | |
203 | sub r5, r5, r7 | |
204 | #endif | |
205 | add rSTR1, rSTR1, r7 | |
206 | add rSTR2, rSTR2, r7 | |
207 | andi. rADDR2, rSTR2, 0xF | |
208 | addi rSTR1, rSTR1, -16 | |
209 | addi rSTR2, rSTR2, -16 | |
210 | beq cr0, L(aligned) | |
211 | #ifdef __LITTLE_ENDIAN__ | |
212 | lvsr v6, 0, rSTR2 | |
213 | #else | |
214 | lvsl v6, 0, rSTR2 | |
215 | #endif | |
216 | /* There are 2 loops depending on the input alignment. | |
217 | Each loop gets 16 bytes from s1 and s2, check for null, | |
218 | convert to lowercase and compare. Loop till difference | |
219 | or null occurs. */ | |
220 | L(s1_align): | |
221 | addi rSTR1, rSTR1, 16 | |
222 | addi rSTR2, rSTR2, 16 | |
223 | #ifdef USE_AS_STRNCASECMP | |
224 | cmpdi cr7, r5, 16 | |
225 | blt cr7, L(bytebybyte) | |
226 | addi r5, r5, -16 | |
227 | #endif | |
228 | lvx v4, 0, rSTR1 | |
229 | GET16BYTES(v5, rSTR2, v6) | |
230 | CHECKNULLANDCONVERT() | |
231 | blt cr6, L(s1_align) | |
232 | b L(different) | |
233 | .align 4 | |
234 | L(aligned): | |
235 | addi rSTR1, rSTR1, 16 | |
236 | addi rSTR2, rSTR2, 16 | |
237 | #ifdef USE_AS_STRNCASECMP | |
238 | cmpdi cr7, r5, 16 | |
239 | blt cr7, L(bytebybyte) | |
240 | addi r5, r5, -16 | |
241 | #endif | |
242 | lvx v4, 0, rSTR1 | |
243 | lvx v5, 0, rSTR2 | |
244 | CHECKNULLANDCONVERT() | |
245 | blt cr6, L(aligned) | |
246 | ||
247 | /* Calculate and return the difference. */ | |
248 | L(different): | |
249 | vaddubm v1, v3, v3 | |
250 | vcmpequb v7, v0, v7 | |
251 | #ifdef __LITTLE_ENDIAN__ | |
252 | /* Count trailing zero. */ | |
253 | vspltisb v8, -1 | |
9250e661 | 254 | vadduqm v9, v7, v8 |
c8376f3e | 255 | vandc v8, v9, v7 |
9250e661 | 256 | vpopcntd v8, v8 |
c8376f3e | 257 | vspltb v6, v8, 15 |
258 | vcmpequb. v6, v6, v1 | |
259 | blt cr6, L(shift8) | |
260 | #else | |
261 | /* Count leading zero. */ | |
9250e661 | 262 | vclzd v8, v7 |
c8376f3e | 263 | vspltb v6, v8, 7 |
264 | vcmpequb. v6, v6, v1 | |
265 | blt cr6, L(shift8) | |
266 | vsro v8, v8, v1 | |
267 | #endif | |
268 | b L(skipsum) | |
269 | .align 4 | |
270 | L(shift8): | |
271 | vsumsws v8, v8, v0 | |
272 | L(skipsum): | |
273 | #ifdef __LITTLE_ENDIAN__ | |
274 | /* Shift registers based on leading zero count. */ | |
275 | vsro v6, v5, v8 | |
276 | vsro v7, v4, v8 | |
277 | /* Merge and move to GPR. */ | |
278 | vmrglb v6, v6, v7 | |
279 | vslo v1, v6, v1 | |
9250e661 | 280 | mfvrd r3, v1 |
c8376f3e | 281 | /* Place the characters that are different in first position. */ |
282 | sldi rSTR2, rRTN, 56 | |
283 | srdi rSTR2, rSTR2, 56 | |
284 | sldi rSTR1, rRTN, 48 | |
285 | srdi rSTR1, rSTR1, 56 | |
286 | #else | |
287 | vslo v6, v5, v8 | |
288 | vslo v7, v4, v8 | |
289 | vmrghb v1, v6, v7 | |
9250e661 | 290 | mfvrd r3, v1 |
c8376f3e | 291 | srdi rSTR2, rRTN, 48 |
292 | sldi rSTR2, rSTR2, 56 | |
293 | srdi rSTR2, rSTR2, 56 | |
294 | srdi rSTR1, rRTN, 56 | |
295 | #endif | |
296 | subf rRTN, rSTR1, rSTR2 | |
297 | extsw rRTN, rRTN | |
298 | blr | |
299 | ||
300 | .align 4 | |
301 | /* OK. We've hit the end of the string. We need to be careful that | |
302 | we don't compare two strings as different because of junk beyond | |
303 | the end of the strings... */ | |
304 | L(null_found): | |
305 | vaddubm v10, v3, v3 | |
306 | #ifdef __LITTLE_ENDIAN__ | |
307 | /* Count trailing zero. */ | |
308 | vspltisb v8, -1 | |
9250e661 | 309 | vadduqm v9, v7, v8 |
c8376f3e | 310 | vandc v8, v9, v7 |
9250e661 | 311 | vpopcntd v8, v8 |
c8376f3e | 312 | vspltb v6, v8, 15 |
313 | vcmpequb. v6, v6, v10 | |
314 | blt cr6, L(shift_8) | |
315 | #else | |
316 | /* Count leading zero. */ | |
9250e661 | 317 | vclzd v8, v7 |
c8376f3e | 318 | vspltb v6, v8, 7 |
319 | vcmpequb. v6, v6, v10 | |
320 | blt cr6, L(shift_8) | |
321 | vsro v8, v8, v10 | |
322 | #endif | |
323 | b L(skipsum1) | |
324 | .align 4 | |
325 | L(shift_8): | |
326 | vsumsws v8, v8, v0 | |
327 | L(skipsum1): | |
328 | /* Calculate shift count based on count of zero. */ | |
329 | vspltisb v10, 7 | |
330 | vslb v10, v10, v10 | |
331 | vsldoi v9, v0, v10, 1 | |
9250e661 | 332 | vsubudm v9, v9, v8 |
c8376f3e | 333 | vspltisb v8, 8 |
334 | vsldoi v8, v0, v8, 1 | |
9250e661 | 335 | vsubudm v9, v9, v8 |
c8376f3e | 336 | /* Shift and remove junk after null character. */ |
337 | #ifdef __LITTLE_ENDIAN__ | |
338 | vslo v5, v5, v9 | |
339 | vslo v4, v4, v9 | |
340 | #else | |
341 | vsro v5, v5, v9 | |
342 | vsro v4, v4, v9 | |
343 | #endif | |
344 | /* Convert and compare 16 bytes. */ | |
345 | TOLOWER() | |
346 | blt cr6, L(retnull) | |
347 | b L(different) | |
348 | .align 4 | |
349 | L(retnull): | |
350 | li rRTN, 0 | |
351 | blr | |
352 | .align 4 | |
353 | L(bytebybyte): | |
354 | /* Unrolling loop for POWER: loads are done with 'lbz' plus | |
355 | offset and string descriptors are only updated in the end | |
356 | of loop unrolling. */ | |
357 | ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) | |
358 | lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ | |
359 | lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ | |
360 | #ifdef USE_AS_STRNCASECMP | |
361 | rldicl rTMP, r5, 62, 2 | |
362 | cmpdi cr7, rTMP, 0 | |
363 | beq cr7, L(lessthan4) | |
364 | mtctr rTMP | |
365 | #endif | |
366 | L(loop): | |
367 | cmpdi rCHAR1, 0 /* *s1 == '\0' ? */ | |
368 | sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */ | |
369 | sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */ | |
370 | lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ | |
371 | lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ | |
372 | cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */ | |
373 | crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */ | |
374 | beq cr1, L(done) | |
375 | lbz rCHAR1, 1(rSTR1) | |
376 | lbz rCHAR2, 1(rSTR2) | |
377 | cmpdi rCHAR1, 0 | |
378 | sldi rADDR1, rCHAR1, 2 | |
379 | sldi rADDR2, rCHAR2, 2 | |
380 | lwzx rLWR1, rLOC, rADDR1 | |
381 | lwzx rLWR2, rLOC, rADDR2 | |
382 | cmpw cr1, rLWR1, rLWR2 | |
383 | crorc 4*cr1+eq,eq,4*cr1+eq | |
384 | beq cr1, L(done) | |
385 | lbz rCHAR1, 2(rSTR1) | |
386 | lbz rCHAR2, 2(rSTR2) | |
387 | cmpdi rCHAR1, 0 | |
388 | sldi rADDR1, rCHAR1, 2 | |
389 | sldi rADDR2, rCHAR2, 2 | |
390 | lwzx rLWR1, rLOC, rADDR1 | |
391 | lwzx rLWR2, rLOC, rADDR2 | |
392 | cmpw cr1, rLWR1, rLWR2 | |
393 | crorc 4*cr1+eq,eq,4*cr1+eq | |
394 | beq cr1, L(done) | |
395 | lbz rCHAR1, 3(rSTR1) | |
396 | lbz rCHAR2, 3(rSTR2) | |
397 | cmpdi rCHAR1, 0 | |
398 | /* Increment both string descriptors */ | |
399 | addi rSTR1, rSTR1, 4 | |
400 | addi rSTR2, rSTR2, 4 | |
401 | sldi rADDR1, rCHAR1, 2 | |
402 | sldi rADDR2, rCHAR2, 2 | |
403 | lwzx rLWR1, rLOC, rADDR1 | |
404 | lwzx rLWR2, rLOC, rADDR2 | |
405 | cmpw cr1, rLWR1, rLWR2 | |
406 | crorc 4*cr1+eq,eq,4*cr1+eq | |
407 | beq cr1, L(done) | |
408 | lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ | |
409 | lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ | |
410 | #ifdef USE_AS_STRNCASECMP | |
411 | bdnz L(loop) | |
412 | #else | |
413 | b L(loop) | |
414 | #endif | |
415 | #ifdef USE_AS_STRNCASECMP | |
416 | L(lessthan4): | |
417 | clrldi r5, r5, 62 | |
418 | cmpdi cr7, r5, 0 | |
419 | beq cr7, L(retnull) | |
420 | mtctr r5 | |
421 | L(loop1): | |
422 | cmpdi rCHAR1, 0 | |
423 | sldi rADDR1, rCHAR1, 2 | |
424 | sldi rADDR2, rCHAR2, 2 | |
425 | lwzx rLWR1, rLOC, rADDR1 | |
426 | lwzx rLWR2, rLOC, rADDR2 | |
427 | cmpw cr1, rLWR1, rLWR2 | |
428 | crorc 4*cr1+eq,eq,4*cr1+eq | |
429 | beq cr1, L(done) | |
430 | addi rSTR1, rSTR1, 1 | |
431 | addi rSTR2, rSTR2, 1 | |
432 | lbz rCHAR1, 0(rSTR1) | |
433 | lbz rCHAR2, 0(rSTR2) | |
434 | bdnz L(loop1) | |
435 | #endif | |
436 | L(done): | |
437 | subf r0, rLWR2, rLWR1 | |
438 | extsw rRTN, r0 | |
439 | blr | |
440 | END (__STRCASECMP) | |
441 | ||
442 | weak_alias (__STRCASECMP, STRCASECMP) | |
443 | libc_hidden_builtin_def (__STRCASECMP) |