]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power8/strcasecmp.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power8 / strcasecmp.S
1 /* Optimized strcasecmp implementation for PowerPC64.
2 Copyright (C) 2016-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20 #include <locale-defines.h>
21
22 /* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
23
/* Select the symbol names this file builds: __strcasecmp by default,
   or __strncasecmp when USE_AS_STRNCASECMP is defined (in which case
   r5 carries the length limit n on entry).  */
#ifndef USE_AS_STRNCASECMP
#  define __STRCASECMP __strcasecmp
#  define STRCASECMP strcasecmp
#else
#  define __STRCASECMP __strncasecmp
#  define STRCASECMP strncasecmp
#endif
/* Convert 16 bytes to lowercase and compare.
   Requires constants set up by the caller, replicated in every byte:
     v1 = 0xbf (== -0x41 mod 256), v2 = 0x19 ('Z' - 'A'),
     v3 = 0x20 ('a' - 'A').
   For each byte c of v4 and v5:  c + 0xbf == c - 'A' (mod 256) is
   <= 0x19 exactly when c is an uppercase ASCII letter, so vcmpgtub
   produces an all-ones mask byte for every NON-uppercase position.
   vsel then keeps those bytes unchanged and replaces uppercase bytes
   with c + 0x20 (their lowercase form).  The final vcmpequb. compares
   the two lowered vectors and records the result in cr6 (the cr6 "lt"
   bit is set when all 16 bytes are equal).  Clobbers v7, v8.  */
#define TOLOWER() \
	vaddubm	v8, v4, v1; \
	vaddubm	v7, v4, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v4, v7, v4, v8; \
	vaddubm	v8, v5, v1; \
	vaddubm	v7, v5, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v5, v7, v5, v8; \
	vcmpequb.	v7, v5, v4;
42
/*
 * Get 16 bytes for the unaligned case.
 * reg1: Vector to hold next 16 bytes.
 * reg2: Address to read from.
 * reg3: Permute control vector (from lvsr on LE / lvsl on BE).
 * v8: Tmp vector used to mask unwanted bytes.
 * v9: Tmp vector, 0 when null is found in first 16 bytes.
 *
 * lvx ignores the low four address bits, so an unaligned 16-byte
 * chunk straddles two aligned quadwords.  The first quadword is
 * loaded and the bytes that lie before the string are forced to 0xff
 * via vperm with an all-ones vector, then checked for NUL.  The
 * second quadword is loaded ONLY when that check finds no NUL;
 * otherwise v9 is set to zero instead, so memory past the
 * terminating NUL (possibly an unmapped page) is never touched.
 * The final vperm assembles the 16 logical string bytes from the
 * pair.  Clobbers r6, v8, v9 and cr6.
 */
#ifdef __LITTLE_ENDIAN__
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	v8, -1; \
	vperm	v8, v8, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;
#else
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	v8, -1; \
	vperm	v8, reg1, v8, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, reg1, v9, reg3;
#endif
82
/* Check for NUL in v4 and v5, then convert to lowercase and compare.
   vcmpequb. against v0 (all zeros) sets the cr6 "eq" bit when NO byte
   matched, so control reaches L(null_found) only when BOTH vectors
   contain a NUL.  If only one of them does, TOLOWER's compare is
   guaranteed to see a mismatch at (or before) the NUL position, so
   the normal L(different) path still produces the right result.
   Falls through to TOLOWER, leaving the compare result in cr6.  */
#define CHECKNULLANDCONVERT() \
	vcmpequb.	v7, v0, v5; \
	beq	cr6, 3f; \
	vcmpequb.	v7, v0, v4; \
	beq	cr6, 3f; \
	b	L(null_found); \
	.align 4; \
3: \
	TOLOWER()
93
/* POWER8 vector instructions used to locate the first differing or
   NUL byte.  When the assembler is not targeting POWER8 (this file is
   otherwise assembled under ".machine power7" below), the raw
   encodings are emitted with .long so the file still builds.
   NOTE(review): the macro names describe the operands loosely —
   VADDUQM_V7_V8 actually writes v9 (vadduqm v9, v7, v8), and
   VSUBUDM_V9_V8 is vsubudm v9, v9, v8 (v9 is both source and
   destination); the .long encodings match the mnemonic forms here.  */
#ifdef _ARCH_PWR8
# define VCLZD_V8_v7	vclzd	v8, v7;
# define MFVRD_R3_V1	mfvrd	r3, v1;
# define VSUBUDM_V9_V8	vsubudm	v9, v9, v8;
# define VPOPCNTD_V8_V8	vpopcntd	v8, v8;
# define VADDUQM_V7_V8	vadduqm	v9, v7, v8;
#else
# define VCLZD_V8_v7	.long	0x11003fc2
# define MFVRD_R3_V1	.long	0x7c230067
# define VSUBUDM_V9_V8	.long	0x112944c0
# define VPOPCNTD_V8_V8	.long	0x110047c3
# define VADDUQM_V7_V8	.long	0x11274100
#endif
107
	.machine  power7

/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4])
   or, with USE_AS_STRNCASECMP,
   int [r3] strncasecmp (const char *s1 [r3], const char *s2 [r4],
			 size_t n [r5])

   Strategy: compare 16 bytes at a time with VMX, lowercasing both
   chunks first, as long as neither chunk contains a NUL and the
   locale's case table is plain ASCII.  Fall back to a 4x-unrolled
   byte-by-byte loop (using the locale tolower table) otherwise.  */
ENTRY (__STRCASECMP)
#ifdef USE_AS_STRNCASECMP
	CALL_MCOUNT 3
#else
	CALL_MCOUNT 2
#endif

#define rRTN	r3	/* Return value */
#define rSTR1	r10	/* 1st string */
#define rSTR2	r4	/* 2nd string */
#define rCHAR1	r6	/* Byte read from 1st string */
#define rCHAR2	r7	/* Byte read from 2nd string */
#define rADDR1	r8	/* Address of tolower(rCHAR1) */
#define rADDR2	r12	/* Address of tolower(rCHAR2) */
#define rLWR1	r8	/* Word tolower(rCHAR1) */
#define rLWR2	r12	/* Word tolower(rCHAR2) */
#define rTMP	r9
#define rLOC	r11	/* Default locale address */

	cmpd	cr7, rRTN, rSTR2	/* s1 == s2?  (answered after rRTN is zeroed) */

	/* Get locale address (TLS variable __libc_tsd_LOCALE).  */
	ld	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
	add	rLOC, rTMP, __libc_tsd_LOCALE@tls
	ld	rLOC, 0(rLOC)

	mr	rSTR1, rRTN	/* Free r3 for the return value.  */
	li	rRTN, 0
	beqlr	cr7		/* Identical pointers compare equal.  */
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 0	/* n == 0: equal by definition.  */
	beq	cr7, L(retnull)
	cmpdi	cr7, r5, 16	/* Less than one vector: go scalar.  */
	blt	cr7, L(bytebybyte)
#endif
	vspltisb	v0, 0	/* v0 = all zeros (NUL pattern).  */
	vspltisb	v8, -1	/* v8 = all ones (pad mask).  */
	/* Check for null in initial characters.
	   Check max of 16 char depending on the alignment.
	   If null is present, proceed byte by byte.  */
	lvx	v4, 0, rSTR1	/* Aligned quadword containing s1[0].  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v10, 0, rSTR1	/* Compute mask.  */
	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
#else
	lvsl	v10, 0, rSTR1
	vperm	v9, v4, v8, v10
#endif
	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
	bne	cr6, L(bytebybyte)
	lvx	v5, 0, rSTR2	/* Same treatment for s2.  */
	/* Calculate alignment.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
#else
	lvsl	v6, 0, rSTR2
	vperm	v9, v5, v8, v6
#endif
	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
	bne	cr6, L(bytebybyte)
	/* Check if locale has non ascii characters; if so the vector
	   0x20-trick is invalid and we must use the tolower table.  */
	ld	rTMP, 0(rLOC)
	addi	r6, rTMP, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
	lwz	rTMP, 0(r6)
	cmpdi	cr7, rTMP, 1
	beq	cr7, L(bytebybyte)

	/* Load vector registers with values used for TOLOWER.  */
	/* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte:
	   v3 = 2 << 4 = 0x20; v1 = ~(0x20 + 0x20) = 0xbf;
	   v2 = 0x20 - 7 = 0x19.  */
	vspltisb	v3, 2
	vspltisb	v9, 4
	vsl	v3, v3, v9
	vaddubm	v1, v3, v3
	vnor	v1, v1, v1
	vspltisb	v2, 7
	vsububm	v2, v3, v2

	/* If s1 is unaligned, build its first logical 16 bytes from the
	   two straddling aligned quadwords (safe: no NUL seen above).  */
	andi.	rADDR1, rSTR1, 0xF
	beq	cr0, L(align)
	addi	r6, rSTR1, 16
	lvx	v9, 0, r6
	/* Compute 16 bytes from previous two loads.  */
#ifdef __LITTLE_ENDIAN__
	vperm	v4, v9, v4, v10
#else
	vperm	v4, v4, v9, v10
#endif
L(align):
	/* Likewise for s2.  */
	andi.	rADDR2, rSTR2, 0xF
	beq	cr0, L(align1)
	addi	r6, rSTR2, 16
	lvx	v9, 0, r6
	/* Compute 16 bytes from previous two loads.  */
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v9, v5, v6
#else
	vperm	v5, v5, v9, v6
#endif
L(align1):
	CHECKNULLANDCONVERT()
	blt	cr6, L(match)	/* All 16 lowered bytes equal.  */
	b	L(different)
	.align 4
L(match):
	/* First chunk matched.  Advance both pointers by 16 minus the
	   misalignment of s1, so that the loop's "addi 16" lands s1 on
	   a 16-byte boundary (a few bytes may be compared twice).  */
	clrldi	r6, rSTR1, 60	/* r6 = s1 & 0xF.  */
	subfic	r7, r6, 16	/* r7 = bytes consumed this round.  */
#ifdef USE_AS_STRNCASECMP
	sub	r5, r5, r7	/* Adjust remaining length.  */
#endif
	add	rSTR1, rSTR1, r7
	add	rSTR2, rSTR2, r7
	andi.	rADDR2, rSTR2, 0xF	/* Is s2 now aligned too?  */
	addi	rSTR1, rSTR1, -16	/* Bias for pre-increment loops.  */
	addi	rSTR2, rSTR2, -16
	beq	cr0, L(aligned)
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2	/* Permute control for GET16BYTES.  */
#else
	lvsl	v6, 0, rSTR2
#endif
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, checks for null,
	   converts to lowercase and compares.  Loop till difference
	   or null occurs.  */
L(s1_align):
	/* s1 aligned, s2 not: s2 is fetched via GET16BYTES.  */
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)	/* Tail handled scalar.  */
	addi	r5, r5, -16
#endif
	lvx	v4, 0, rSTR1
	GET16BYTES(v5, rSTR2, v6)
	CHECKNULLANDCONVERT()
	blt	cr6, L(s1_align)
	b	L(different)
	.align 4
L(aligned):
	/* Both strings 16-byte aligned: plain lvx on each.  */
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)
	addi	r5, r5, -16
#endif
	lvx	v4, 0, rSTR1
	lvx	v5, 0, rSTR2
	CHECKNULLANDCONVERT()
	blt	cr6, L(aligned)

	/* Calculate and return the difference.  On entry v7 holds the
	   TOLOWER equality mask (all-ones where the lowered bytes were
	   equal); v4/v5 hold the lowered chunks.  */
L(different):
	vaddubm	v1, v3, v3	/* v1 = 0x40 in every byte (bit count 64).  */
	vcmpequb	v7, v0, v7	/* Invert: all-ones where bytes DIFFERED.  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zeros of v7 per doubleword:
	   popcnt((v7 - 1) & ~v7) — locates the first differing byte.  */
	vspltisb	v8, -1
	VADDUQM_V7_V8	/* v9 = v7 - 1 (128-bit).  */
	vandc	v8, v9, v7
	VPOPCNTD_V8_V8
	vspltb	v6, v8, 15	/* Low-doubleword count in every byte.  */
	vcmpequb.	v6, v6, v1	/* == 64: no difference in low dword.  */
	blt	cr6, L(shift8)
#else
	/* Count leading zeros of v7 per doubleword.  */
	VCLZD_V8_v7
	vspltb	v6, v8, 7	/* High-doubleword count.  */
	vcmpequb.	v6, v6, v1	/* == 64: no difference in high dword.  */
	blt	cr6, L(shift8)
	vsro	v8, v8, v1	/* Use the high-doubleword count.  */
#endif
	b	L(skipsum)
	.align 4
L(shift8):
	vsumsws	v8, v8, v0	/* Sum counts across the vector.  */
L(skipsum):
#ifdef __LITTLE_ENDIAN__
	/* Shift registers based on leading zero count, so the first
	   differing bytes of s2 and s1 land in fixed lanes.  */
	vsro	v6, v5, v8
	vsro	v7, v4, v8
	/* Merge and move to GPR.  */
	vmrglb	v6, v6, v7
	vslo	v1, v6, v1
	MFVRD_R3_V1
	/* Place the characters that are different in first position:
	   extract the two lowered bytes from r3.  */
	sldi	rSTR2, rRTN, 56
	srdi	rSTR2, rSTR2, 56
	sldi	rSTR1, rRTN, 48
	srdi	rSTR1, rSTR1, 56
#else
	vslo	v6, v5, v8
	vslo	v7, v4, v8
	vmrghb	v1, v6, v7
	MFVRD_R3_V1
	srdi	rSTR2, rRTN, 48
	sldi	rSTR2, rSTR2, 56
	srdi	rSTR2, rSTR2, 56
	srdi	rSTR1, rRTN, 56
#endif
	subf	rRTN, rSTR1, rSTR2	/* tolower(*s1) - tolower(*s2).  */
	extsw	rRTN, rRTN
	blr

	.align 4
	/* OK.  We've hit the end of the string.  We need to be careful
	   that we don't compare two strings as different because of
	   junk beyond the end of the strings.  Reached only when BOTH
	   chunks contain a NUL (see CHECKNULLANDCONVERT); v7 holds the
	   NUL-position mask of v4.  */
L(null_found):
	vaddubm	v10, v3, v3	/* v10 = 0x40 in every byte.  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zeros to locate the first NUL (same
	   popcnt((v7-1) & ~v7) trick as in L(different)).  */
	vspltisb	v8, -1
	VADDUQM_V7_V8	/* v9 = v7 - 1.  */
	vandc	v8, v9, v7
	VPOPCNTD_V8_V8
	vspltb	v6, v8, 15
	vcmpequb.	v6, v6, v10
	blt	cr6, L(shift_8)
#else
	/* Count leading zeros.  */
	VCLZD_V8_v7
	vspltb	v6, v8, 7
	vcmpequb.	v6, v6, v10
	blt	cr6, L(shift_8)
	vsro	v8, v8, v10
#endif
	b	L(skipsum1)
	.align 4
L(shift_8):
	vsumsws	v8, v8, v0
L(skipsum1):
	/* Calculate shift count based on count of zero: build the
	   constant 0x80 in the low byte, subtract the bit count and
	   another 8, giving the octet shift that discards every byte
	   after (and including past) the terminating NUL.  */
	vspltisb	v10, 7
	vslb	v10, v10, v10	/* 7 << 7 = 0x80 per byte.  */
	vsldoi	v9, v0, v10, 1	/* 0x80 in the low byte only.  */
	VSUBUDM_V9_V8	/* v9 = 0x80 - count.  */
	vspltisb	v8, 8
	vsldoi	v8, v0, v8, 1	/* 8 in the low byte only.  */
	VSUBUDM_V9_V8	/* v9 -= 8.  */
	/* Shift and remove junk after null character.  */
#ifdef __LITTLE_ENDIAN__
	vslo	v5, v5, v9
	vslo	v4, v4, v9
#else
	vsro	v5, v5, v9
	vsro	v4, v4, v9
#endif
	/* Convert and compare the remaining valid bytes.  */
	TOLOWER()
	blt	cr6, L(retnull)	/* Equal up to the NUL: strings equal.  */
	b	L(different)
	.align 4
L(retnull):
	li	rRTN, 0
	blr
	.align 4
L(bytebybyte):
	/* Unrolled x4 scalar loop: loads are done with 'lbz' plus
	   offset and string descriptors are only updated at the end
	   of loop unrolling.  Each byte is translated through the
	   locale's tolower table (an array of ints, hence the
	   sldi-by-2 to form the word offset).  */
	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	rldicl	rTMP, r5, 62, 2	/* rTMP = n / 4 full unrolled rounds.  */
	cmpdi	cr7, rTMP, 0
	beq	cr7, L(lessthan4)
	mtctr	rTMP
#endif
L(loop):
	cmpdi	rCHAR1, 0	/* *s1 == '\0' ? */
	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
	crorc	4*cr1+eq,eq,4*cr1+eq	/* cr1.eq = (*s1 == '\0') || chars differ */
	beq	cr1, L(done)	/* Exit on NUL or first difference.  */
	lbz	rCHAR1, 1(rSTR1)
	lbz	rCHAR2, 1(rSTR2)
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 2(rSTR1)
	lbz	rCHAR2, 2(rSTR2)
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 3(rSTR1)
	lbz	rCHAR2, 3(rSTR2)
	cmpdi	rCHAR1, 0
	/* Increment both string descriptors */
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	bdnz	L(loop)
#else
	b	L(loop)
#endif
#ifdef USE_AS_STRNCASECMP
L(lessthan4):
	/* Handle the remaining n % 4 characters one at a time.  */
	clrldi	r5, r5, 62	/* r5 = n & 3.  */
	cmpdi	cr7, r5, 0
	beq	cr7, L(retnull)
	mtctr	r5
L(loop1):
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	addi	rSTR1, rSTR1, 1
	addi	rSTR2, rSTR2, 1
	lbz	rCHAR1, 0(rSTR1)
	lbz	rCHAR2, 0(rSTR2)
	bdnz	L(loop1)
#endif
L(done):
	/* Return tolower(*s1) - tolower(*s2) of the last pair read.  */
	subf	r0, rLWR2, rLWR1
	extsw	rRTN, r0
	blr
END (__STRCASECMP)

weak_alias (__STRCASECMP, STRCASECMP)
libc_hidden_builtin_def (__STRCASECMP)