/* Optimized strcasecmp implementation for PowerPC64.
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <locale-defines.h>

/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4])  */

#ifndef USE_AS_STRNCASECMP
# define __STRCASECMP __strcasecmp
# define STRCASECMP strcasecmp
#else
# define __STRCASECMP __strncasecmp
# define STRCASECMP strncasecmp
#endif
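
/* A companion strncasecmp.S can build the bounded variant from this
   same file by defining USE_AS_STRNCASECMP first; the wrapper is
   expected to be just (illustrative sketch):

       #define USE_AS_STRNCASECMP
       #include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>

   With the macro defined, r5 carries the length limit n.  */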
/* Convert 16 bytes to lowercase and compare.  */
#define TOLOWER() \
        vaddubm v8, v4, v1; \
        vaddubm v7, v4, v3; \
        vcmpgtub v8, v8, v2; \
        vsel v4, v7, v4, v8; \
        vaddubm v8, v5, v1; \
        vaddubm v7, v5, v3; \
        vcmpgtub v8, v8, v2; \
        vsel v5, v7, v5, v8; \
        vcmpequb. v7, v5, v4;

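/* Per byte, TOLOWER is a branchless range test (illustrative C;
   v1 = 0xbf, v2 = 0x19 and v3 = 0x20 are splatted across the vector):

       unsigned char t = c + 0xbf;          // == c - 'A' (mod 256)
       lower = (t > 0x19) ? c : c + 0x20;   // 'A'..'Z' -> add 0x20

   vsel keeps the original byte where the vcmpgtub mask is all-ones
   (not uppercase) and takes c + 0x20 where it is zero.  The final
   vcmpequb. leaves "all equal" in cr6 for the caller, meaning the
   two lowered vectors match.  */
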
/*
 * Get 16 bytes for unaligned case.
 * reg1: Vector to hold next 16 bytes.
 * reg2: Address to read from.
 * reg3: Permute control vector.
 * v8: Tmp vector used to mask unwanted bytes.
 * v9: Tmp vector, 0 when a null is found in the first 16 bytes.
 */
#ifdef __LITTLE_ENDIAN__
#define GET16BYTES(reg1, reg2, reg3) \
        lvx reg1, 0, reg2; \
        vspltisb v8, -1; \
        vperm v8, v8, reg1, reg3; \
        vcmpequb. v8, v0, v8; \
        beq cr6, 1f; \
        vspltisb v9, 0; \
        b 2f; \
        .align 4; \
1: \
        addi r6, reg2, 16; \
        lvx v9, 0, r6; \
2: \
        vperm reg1, v9, reg1, reg3;
#else
#define GET16BYTES(reg1, reg2, reg3) \
        lvx reg1, 0, reg2; \
        vspltisb v8, -1; \
        vperm v8, reg1, v8, reg3; \
        vcmpequb. v8, v0, v8; \
        beq cr6, 1f; \
        vspltisb v9, 0; \
        b 2f; \
        .align 4; \
1: \
        addi r6, reg2, 16; \
        lvx v9, 0, r6; \
2: \
        vperm reg1, reg1, v9, reg3;
#endif

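/* GET16BYTES deals with a load that may straddle two quadwords: lvx
   always fetches an aligned 16 bytes, so the macro first checks
   whether the bytes that belong to the string already contain a null
   (the vperm with v8 masks the leading bytes that precede the
   string).  Only when no null is present does it read the next
   quadword, so the access never crosses past the terminator into an
   unmapped page.  Roughly (illustrative C):

       v = load_aligned (p & ~15);
       if (!has_null (valid_bytes (v)))
         w = load_aligned ((p & ~15) + 16);   // safe: string goes on
       else
         w = zeros;                           // don't touch next page
       return align_concat (w, v, perm);      // the vperm at 2:
   */
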
/* Check for null in v4, v5 and convert to lowercase.  */
#define CHECKNULLANDCONVERT() \
        vcmpequb. v7, v0, v5; \
        beq cr6, 3f; \
        vcmpequb. v7, v0, v4; \
        beq cr6, 3f; \
        b L(null_found); \
        .align 4; \
3: \
        TOLOWER()

        .machine  power8

ENTRY (__STRCASECMP)
#ifdef USE_AS_STRNCASECMP
        CALL_MCOUNT 3
#else
        CALL_MCOUNT 2
#endif
#define rRTN	r3	/* Return value */
#define rSTR1	r10	/* 1st string */
#define rSTR2	r4	/* 2nd string */
#define rCHAR1	r6	/* Byte read from 1st string */
#define rCHAR2	r7	/* Byte read from 2nd string */
#define rADDR1	r8	/* Address of tolower(rCHAR1) */
#define rADDR2	r12	/* Address of tolower(rCHAR2) */
#define rLWR1	r8	/* Word tolower(rCHAR1) */
#define rLWR2	r12	/* Word tolower(rCHAR2) */
#define rTMP	r9
#define rLOC	r11	/* Default locale address */

        cmpd	cr7, rRTN, rSTR2

        /* Get locale address.  */
        ld	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
        add	rLOC, rTMP, __libc_tsd_LOCALE@tls
        ld	rLOC, 0(rLOC)
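        /* The TLS sequence above fetches the thread's current locale,
           roughly the C statement
               locale_t loc = __libc_tsd_LOCALE;   // TLS variable
           which is what _NL_CURRENT_LOCALE amounts to.  */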

        mr	rSTR1, rRTN
        li	rRTN, 0
        beqlr	cr7
#ifdef USE_AS_STRNCASECMP
        cmpdi	cr7, r5, 0
        beq	cr7, L(retnull)
        cmpdi	cr7, r5, 16
        blt	cr7, L(bytebybyte)
#endif
        vspltisb	v0, 0
        vspltisb	v8, -1
        /* Check for null in the initial characters.
           Check at most 16 characters, depending on the alignment.
           If a null is present, proceed byte by byte.  */
        lvx	v4, 0, rSTR1
#ifdef __LITTLE_ENDIAN__
        lvsr	v10, 0, rSTR1	/* Compute mask.  */
        vperm	v9, v8, v4, v10	/* Mask bits that are not part of the string.  */
#else
        lvsl	v10, 0, rSTR1
        vperm	v9, v4, v8, v10
#endif
        vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
        bne	cr6, L(bytebybyte)
        lvx	v5, 0, rSTR2
        /* Calculate alignment.  */
#ifdef __LITTLE_ENDIAN__
        lvsr	v6, 0, rSTR2
        vperm	v9, v8, v5, v6	/* Mask bits that are not part of the string.  */
#else
        lvsl	v6, 0, rSTR2
        vperm	v9, v5, v8, v6
#endif
        vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
        bne	cr6, L(bytebybyte)
        /* Check if the locale has non-ASCII characters.  */
        ld	rTMP, 0(rLOC)
        addi	r6, rTMP, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
        lwz	rTMP, 0(r6)
        cmpdi	cr7, rTMP, 1
        beq	cr7, L(bytebybyte)

        /* Load vector registers with values used for TOLOWER.  */
        /* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte.  */
        vspltisb	v3, 2
        vspltisb	v9, 4
        vsl	v3, v3, v9
        vaddubm	v1, v3, v3
        vnor	v1, v1, v1
        vspltisb	v2, 7
        vsububm	v2, v3, v2

        andi.	rADDR1, rSTR1, 0xF
        beq	cr0, L(align)
        addi	r6, rSTR1, 16
        lvx	v9, 0, r6
        /* Compute 16 bytes from the previous two loads.  */
#ifdef __LITTLE_ENDIAN__
        vperm	v4, v9, v4, v10
#else
        vperm	v4, v4, v9, v10
#endif
L(align):
        andi.	rADDR2, rSTR2, 0xF
        beq	cr0, L(align1)
        addi	r6, rSTR2, 16
        lvx	v9, 0, r6
        /* Compute 16 bytes from the previous two loads.  */
#ifdef __LITTLE_ENDIAN__
        vperm	v5, v9, v5, v6
#else
        vperm	v5, v5, v9, v6
#endif
L(align1):
        CHECKNULLANDCONVERT()
        blt	cr6, L(match)
        b	L(different)
        .align 4
L(match):
        clrldi	r6, rSTR1, 60
        subfic	r7, r6, 16
#ifdef USE_AS_STRNCASECMP
        sub	r5, r5, r7
#endif
        add	rSTR1, rSTR1, r7
        add	rSTR2, rSTR2, r7
        andi.	rADDR2, rSTR2, 0xF
        addi	rSTR1, rSTR1, -16
        addi	rSTR2, rSTR2, -16
        beq	cr0, L(aligned)
#ifdef __LITTLE_ENDIAN__
        lvsr	v6, 0, rSTR2
#else
        lvsl	v6, 0, rSTR2
#endif
        /* There are two loops, depending on the input alignment.
           Each loop gets 16 bytes from s1 and s2, checks for null,
           converts to lowercase and compares.  Loop until a
           difference or a null occurs.  */
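        /* One iteration of either loop, in rough C:

               a = load16 (s1);                // s1 is 16B-aligned here
               b = get16bytes (s2);            // may need realignment
               if (has_null (a) || has_null (b))   // CHECKNULLANDCONVERT
                 goto null_found;
               la = tolower16 (a);  lb = tolower16 (b);
               // continue while la == lb (cr6 "all equal"); n -= 16 for
               // strncasecmp; otherwise fall through to L(different).  */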
L(s1_align):
        addi	rSTR1, rSTR1, 16
        addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
        cmpdi	cr7, r5, 16
        blt	cr7, L(bytebybyte)
        addi	r5, r5, -16
#endif
        lvx	v4, 0, rSTR1
        GET16BYTES(v5, rSTR2, v6)
        CHECKNULLANDCONVERT()
        blt	cr6, L(s1_align)
        b	L(different)
        .align 4
L(aligned):
        addi	rSTR1, rSTR1, 16
        addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
        cmpdi	cr7, r5, 16
        blt	cr7, L(bytebybyte)
        addi	r5, r5, -16
#endif
        lvx	v4, 0, rSTR1
        lvx	v5, 0, rSTR2
        CHECKNULLANDCONVERT()
        blt	cr6, L(aligned)

        /* Calculate and return the difference.  */
L(different):
        vaddubm	v1, v3, v3
        vcmpequb	v7, v0, v7
#ifdef __LITTLE_ENDIAN__
        /* Count trailing zeros.  */
        vspltisb	v8, -1
        vadduqm	v9, v7, v8
        vandc	v8, v9, v7
        vpopcntd	v8, v8
        vspltb	v6, v8, 15
        vcmpequb.	v6, v6, v1
        blt	cr6, L(shift8)
#else
        /* Count leading zeros.  */
        vclzd	v8, v7
        vspltb	v6, v8, 7
        vcmpequb.	v6, v6, v1
        blt	cr6, L(shift8)
        vsro	v8, v8, v1
#endif
        b	L(skipsum)
        .align 4
L(shift8):
        vsumsws	v8, v8, v0
L(skipsum):
#ifdef __LITTLE_ENDIAN__
        /* Shift registers based on the leading zero count.  */
        vsro	v6, v5, v8
        vsro	v7, v4, v8
        /* Merge and move to a GPR.  */
        vmrglb	v6, v6, v7
        vslo	v1, v6, v1
        mfvrd	r3, v1
        /* Place the characters that differ in the first position.  */
        sldi	rSTR2, rRTN, 56
        srdi	rSTR2, rSTR2, 56
        sldi	rSTR1, rRTN, 48
        srdi	rSTR1, rSTR1, 56
#else
        vslo	v6, v5, v8
        vslo	v7, v4, v8
        vmrghb	v1, v6, v7
        mfvrd	r3, v1
        srdi	rSTR2, rRTN, 48
        sldi	rSTR2, rSTR2, 56
        srdi	rSTR2, rSTR2, 56
        srdi	rSTR1, rRTN, 56
#endif
        subf	rRTN, rSTR1, rSTR2
        extsw	rRTN, rRTN
        blr
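
        /* Summary of the difference computation above: after the
           vcmpequb against zero, v7 holds 0xff exactly at the bytes
           where the lowered inputs differ.  The trailing-zero trick
           (v7 - 1) & ~v7 plus vpopcntd (LE), or vclzd (BE), yields
           the bit position of the first difference; a count of 64
           (the 0x40 in v1) means it lies in the other doubleword,
           hence the vsumsws fixup at L(shift8).  The count then
           feeds vslo/vsro so the differing pair can be moved to a
           GPR with mfvrd, where a plain subtract forms the signed
           result.  */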

        .align 4
        /* OK.  We've hit the end of the string.  We need to be careful
           that we don't compare two strings as different because of
           junk beyond the end of the strings...  */
L(null_found):
        vaddubm	v10, v3, v3
#ifdef __LITTLE_ENDIAN__
        /* Count trailing zeros.  */
        vspltisb	v8, -1
        vadduqm	v9, v7, v8
        vandc	v8, v9, v7
        vpopcntd	v8, v8
        vspltb	v6, v8, 15
        vcmpequb.	v6, v6, v10
        blt	cr6, L(shift_8)
#else
        /* Count leading zeros.  */
        vclzd	v8, v7
        vspltb	v6, v8, 7
        vcmpequb.	v6, v6, v10
        blt	cr6, L(shift_8)
        vsro	v8, v8, v10
#endif
        b	L(skipsum1)
        .align 4
L(shift_8):
        vsumsws	v8, v8, v0
L(skipsum1):
        /* Calculate the shift count based on the zero count.  */
        vspltisb	v10, 7
        vslb	v10, v10, v10
        vsldoi	v9, v0, v10, 1
        vsubudm	v9, v9, v8
        vspltisb	v8, 8
        vsldoi	v8, v0, v8, 1
        vsubudm	v9, v9, v8
        /* Shift and remove junk after the null character.  */
#ifdef __LITTLE_ENDIAN__
        vslo	v5, v5, v9
        vslo	v4, v4, v9
#else
        vsro	v5, v5, v9
        vsro	v4, v4, v9
#endif
        /* Convert and compare 16 bytes.  */
        TOLOWER()
        blt	cr6, L(retnull)
        b	L(different)
        .align 4
L(retnull):
        li	rRTN, 0
        blr
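
        /* Note on L(null_found): the same zero-count trick locates
           the first null byte, and the shift amount built from the
           0x80 and 0x08 splats (128 - 8 - the zero-bit count) slides
           both vectors so everything past the terminator is shifted
           out.  The surviving prefix is lowered and compared once
           more: all-equal means the strings match up to the null, so
           the result is 0.  */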
        .align 4
L(bytebybyte):
        /* Unrolled loop for POWER: loads are done with 'lbz' plus an
           offset, and the string pointers are updated only at the end
           of each unrolled iteration.  */
        ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
        lbz	rCHAR1, 0(rSTR1)	/* Load char from s1.  */
        lbz	rCHAR2, 0(rSTR2)	/* Load char from s2.  */
#ifdef USE_AS_STRNCASECMP
        rldicl	rTMP, r5, 62, 2	/* rTMP = r5 / 4.  */
        cmpdi	cr7, rTMP, 0
        beq	cr7, L(lessthan4)
        mtctr	rTMP
#endif
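        /* The scalar loop indexes the locale's 32-bit tolower table
           (LOCALE_CTYPE_TOLOWER), hence the shift-left-by-2 to form a
           word offset.  Four-way unrolled, it is in rough C:

               const int32_t *tbl = ...;   // the __ctype_tolower table
               int32_t l1, l2;
               do
                 {
                   l1 = tbl[(unsigned char) *s1++];
                   l2 = tbl[(unsigned char) *s2++];
                 }
               while (l1 == l2 && l1 != 0);   // tolower ('\0') == 0
               return l1 - l2;

           For strncasecmp, rldicl above computes n / 4 to seed CTR;
           the leftover 0-3 bytes are handled at L(lessthan4).  */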
L(loop):
        cmpdi	rCHAR1, 0	/* *s1 == '\0' ?  */
        sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1).  */
        sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2).  */
        lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1).  */
        lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2).  */
        cmpw	cr1, rLWR1, rLWR2	/* tolower(*s1) == tolower(*s2) ?  */
        crorc	4*cr1+eq,eq,4*cr1+eq	/* cr1.eq = (*s1 == '\0') || (tolower(*s1) != tolower(*s2)) */
        beq	cr1, L(done)
        lbz	rCHAR1, 1(rSTR1)
        lbz	rCHAR2, 1(rSTR2)
        cmpdi	rCHAR1, 0
        sldi	rADDR1, rCHAR1, 2
        sldi	rADDR2, rCHAR2, 2
        lwzx	rLWR1, rLOC, rADDR1
        lwzx	rLWR2, rLOC, rADDR2
        cmpw	cr1, rLWR1, rLWR2
        crorc	4*cr1+eq,eq,4*cr1+eq
        beq	cr1, L(done)
        lbz	rCHAR1, 2(rSTR1)
        lbz	rCHAR2, 2(rSTR2)
        cmpdi	rCHAR1, 0
        sldi	rADDR1, rCHAR1, 2
        sldi	rADDR2, rCHAR2, 2
        lwzx	rLWR1, rLOC, rADDR1
        lwzx	rLWR2, rLOC, rADDR2
        cmpw	cr1, rLWR1, rLWR2
        crorc	4*cr1+eq,eq,4*cr1+eq
        beq	cr1, L(done)
        lbz	rCHAR1, 3(rSTR1)
        lbz	rCHAR2, 3(rSTR2)
        cmpdi	rCHAR1, 0
        /* Increment both string pointers.  */
        addi	rSTR1, rSTR1, 4
        addi	rSTR2, rSTR2, 4
        sldi	rADDR1, rCHAR1, 2
        sldi	rADDR2, rCHAR2, 2
        lwzx	rLWR1, rLOC, rADDR1
        lwzx	rLWR2, rLOC, rADDR2
        cmpw	cr1, rLWR1, rLWR2
        crorc	4*cr1+eq,eq,4*cr1+eq
        beq	cr1, L(done)
        lbz	rCHAR1, 0(rSTR1)	/* Load char from s1.  */
        lbz	rCHAR2, 0(rSTR2)	/* Load char from s2.  */
#ifdef USE_AS_STRNCASECMP
        bdnz	L(loop)
#else
        b	L(loop)
#endif
#ifdef USE_AS_STRNCASECMP
L(lessthan4):
        clrldi	r5, r5, 62
        cmpdi	cr7, r5, 0
        beq	cr7, L(retnull)
        mtctr	r5
L(loop1):
        cmpdi	rCHAR1, 0
        sldi	rADDR1, rCHAR1, 2
        sldi	rADDR2, rCHAR2, 2
        lwzx	rLWR1, rLOC, rADDR1
        lwzx	rLWR2, rLOC, rADDR2
        cmpw	cr1, rLWR1, rLWR2
        crorc	4*cr1+eq,eq,4*cr1+eq
        beq	cr1, L(done)
        addi	rSTR1, rSTR1, 1
        addi	rSTR2, rSTR2, 1
        lbz	rCHAR1, 0(rSTR1)
        lbz	rCHAR2, 0(rSTR2)
        bdnz	L(loop1)
#endif
L(done):
        subf	r0, rLWR2, rLWR1
        extsw	rRTN, r0
        blr
END (__STRCASECMP)

weak_alias (__STRCASECMP, STRCASECMP)
libc_hidden_builtin_def (__STRCASECMP)