]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/power8/strcasestr.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power8 / strcasestr.S
CommitLineData
e413b14e 1/* Optimized strcasestr implementation for PowerPC64/POWER8.
bfff8b1b 2 Copyright (C) 2016-2017 Free Software Foundation, Inc.
e413b14e
RS
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include <locale-defines.h>
21
22/* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */
23
24/* The performance gain is obtained by comparing 16 bytes. */
25
26/* When the first char of r4 is hit ITERATIONS times in r3
27 fallback to default. */
28#define ITERATIONS 64
29
30#ifndef STRLEN
31/* For builds without IFUNC support, local calls should be made to internal
32 GLIBC symbol (created by libc_hidden_builtin_def). */
33# ifdef SHARED
34# define STRLEN __GI_strlen
35# else
36# define STRLEN strlen
37# endif
38#endif
39
40#ifndef STRNLEN
41/* For builds without IFUNC support, local calls should be made to internal
42 GLIBC symbol (created by libc_hidden_builtin_def). */
43# ifdef SHARED
44# define STRNLEN __GI_strnlen
45# else
46# define STRNLEN __strnlen
47# endif
48#endif
49
50#ifndef STRCHR
51# ifdef SHARED
52# define STRCHR __GI_strchr
53# else
54# define STRCHR strchr
55# endif
56#endif
57
58/* Convert 16 bytes of v4 and reg to lowercase and compare. */
59#define TOLOWER(reg) \
60 vcmpgtub v6, v4, v1; \
61 vcmpgtub v7, v2, v4; \
62 vand v8, v7, v6; \
63 vand v8, v8, v3; \
64 vor v4, v8, v4; \
65 vcmpgtub v6, reg, v1; \
66 vcmpgtub v7, v2, reg; \
67 vand v8, v7, v6; \
68 vand v8, v8, v3; \
69 vor reg, v8, reg; \
70 vcmpequb. v6, reg, v4;
71
72/* TODO: change these to the actual instructions when the minimum required
73 binutils allows it. */
74#ifdef _ARCH_PWR8
75#define VCLZD_V8_v7 vclzd v8, v7;
76#else
77#define VCLZD_V8_v7 .long 0x11003fc2
78#endif
79
80#define FRAMESIZE (FRAME_MIN_SIZE+48)
81/* TODO: change this to .machine power8 when the minimum required binutils
82 allows it. */
83 .machine power7
c24480ce 84EALIGN (__strcasestr, 4, 0)
e413b14e
RS
85 CALL_MCOUNT 2
86 mflr r0 /* Load link register LR to r0. */
87 std r31, -8(r1) /* Save callers register r31. */
88 std r30, -16(r1) /* Save callers register r30. */
89 std r29, -24(r1) /* Save callers register r29. */
90 std r28, -32(r1) /* Save callers register r28. */
91 std r27, -40(r1) /* Save callers register r27. */
92 std r0, 16(r1) /* Store the link register. */
93 cfi_offset(r31, -8)
94 cfi_offset(r30, -16)
95 cfi_offset(r29, -24)
96 cfi_offset(r28, -32)
97 cfi_offset(r27, -40)
98 cfi_offset(lr, 16)
99 stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
100 cfi_adjust_cfa_offset(FRAMESIZE)
101
102 dcbt 0, r3
103 dcbt 0, r4
104 cmpdi cr7, r3, 0 /* Input validation. */
105 beq cr7, L(retnull)
106 cmpdi cr7, r4, 0
107 beq cr7, L(retnull)
108
109 mr r29, r3
110 mr r30, r4
111 /* Load first byte from r4 and check if its null. */
112 lbz r6, 0(r4)
113 cmpdi cr7, r6, 0
114 beq cr7, L(ret_r3)
115
116 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
117 add r9, r10, __libc_tsd_LOCALE@tls
118 ld r9, 0(r9)
119 ld r9, LOCALE_CTYPE_TOUPPER(r9)
120 sldi r10, r6, 2 /* Convert to upper case. */
121 lwzx r28, r9, r10
122
123 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
124 add r11, r10, __libc_tsd_LOCALE@tls
125 ld r11, 0(r11)
126 ld r11, LOCALE_CTYPE_TOLOWER(r11)
127 sldi r10, r6, 2 /* Convert to lower case. */
128 lwzx r27, r11, r10
129
130 /* Check if the first char is present. */
131 mr r4, r27
132 bl STRCHR
133 nop
134 mr r5, r3
135 mr r3, r29
136 mr r29, r5
137 mr r4, r28
138 bl STRCHR
139 nop
140 cmpdi cr7, r29, 0
141 beq cr7, L(firstpos)
142 cmpdi cr7, r3, 0
143 beq cr7, L(skipcheck)
144 cmpw cr7, r3, r29
145 ble cr7, L(firstpos)
146 /* Move r3 to the first occurence. */
147L(skipcheck):
148 mr r3, r29
149L(firstpos):
150 mr r29, r3
151
152 sldi r9, r27, 8
153 or r28, r9, r28
154 /* Reg r27 is used to count the number of iterations. */
155 li r27, 0
156 /* If first char of search str is not present. */
157 cmpdi cr7, r3, 0
158 ble cr7, L(end)
159
160 /* Find the length of pattern. */
161 mr r3, r30
162 bl STRLEN
163 nop
164
165 cmpdi cr7, r3, 0 /* If search str is null. */
166 beq cr7, L(ret_r3)
167
168 mr r31, r3
169 mr r4, r3
170 mr r3, r29
171 bl STRNLEN
172 nop
173
174 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
175 blt cr7, L(retnull)
176
177 mr r3, r29
178
179 /* Locales not matching ASCII for single bytes. */
180 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
181 add r9, r10, __libc_tsd_LOCALE@tls
182 ld r9, 0(r9)
183 ld r7, 0(r9)
184 addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
185 lwz r8, 0(r7)
186 cmpdi cr7, r8, 1
187 beq cr7, L(bytebybyte)
188
189 /* If len(r4) < 16 handle byte by byte. */
190 /* For shorter strings we will not use vector registers. */
191 cmpdi cr7, r31, 16
192 blt cr7, L(bytebybyte)
193
194 /* Comparison values used for TOLOWER. */
195 /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
196 vspltish v0, 0
197 vspltisb v5, 2
198 vspltisb v4, 4
199 vsl v3, v5, v4
200 vaddubm v1, v3, v3
201 vspltisb v5, 15
202 vaddubm v2, v5, v5
203 vaddubm v2, v1, v2
204 vspltisb v4, -3
205 vaddubm v2, v2, v4
206
207 /*
208 1. Load 16 bytes from r3 and r4
209 2. Check if there is null, If yes, proceed byte by byte path.
210 3. Else,Convert both to lowercase and compare.
211 4. If they are same proceed to 1.
212 5. If they dont match, find if first char of r4 is present in the
213 loaded 16 byte of r3.
214 6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
215 */
216
217 mr r8, r3 /* Save r3 for future use. */
218 mr r4, r30 /* Restore r4. */
219 clrldi r10, r4, 60
220 lvx v5, 0, r4 /* Load 16 bytes from r4. */
221 cmpdi cr7, r10, 0
222 beq cr7, L(begin2)
223 /* If r4 is unaligned, load another 16 bytes. */
224#ifdef __LITTLE_ENDIAN__
225 lvsr v7, 0, r4
226#else
227 lvsl v7, 0, r4
228#endif
229 addi r5, r4, 16
230 lvx v9, 0, r5
231#ifdef __LITTLE_ENDIAN__
232 vperm v5, v9, v5, v7
233#else
234 vperm v5, v5, v9, v7
235#endif
236L(begin2):
237 lvx v4, 0, r3
238 vcmpequb. v7, v0, v4 /* Check for null. */
239 beq cr6, L(nullchk6)
240 b L(trailcheck)
241
242 .align 4
243L(nullchk6):
244 clrldi r10, r3, 60
245 cmpdi cr7, r10, 0
246 beq cr7, L(next16)
247#ifdef __LITTLE_ENDIAN__
248 lvsr v7, 0, r3
249#else
250 lvsl v7, 0, r3
251#endif
252 addi r5, r3, 16
253 /* If r3 is unaligned, load another 16 bytes. */
254 lvx v10, 0, r5
255#ifdef __LITTLE_ENDIAN__
256 vperm v4, v10, v4, v7
257#else
258 vperm v4, v4, v10, v7
259#endif
260L(next16):
261 vcmpequb. v6, v0, v5 /* Check for null. */
262 beq cr6, L(nullchk)
263 b L(trailcheck)
264
265 .align 4
266L(nullchk):
267 vcmpequb. v6, v0, v4
268 beq cr6, L(nullchk1)
269 b L(retnull)
270
271 .align 4
272L(nullchk1):
273 /* Convert both v3 and v4 to lower. */
274 TOLOWER(v5)
275 /* If both are same, branch to match. */
276 blt cr6, L(match)
277 /* Find if the first char is present in next 15 bytes. */
278#ifdef __LITTLE_ENDIAN__
279 vspltb v6, v5, 15
280 vsldoi v7, v0, v4, 15
281#else
282 vspltb v6, v5, 0
283 vspltisb v7, 8
284 vslo v7, v4, v7
285#endif
286 vcmpequb v7, v6, v7
287 vcmpequb. v6, v0, v7
288 /* Shift r3 by 16 bytes and proceed. */
289 blt cr6, L(shift16)
290 VCLZD_V8_v7
291#ifdef __LITTLE_ENDIAN__
292 vspltb v6, v8, 15
293#else
294 vspltb v6, v8, 7
295#endif
296 vcmpequb. v6, v6, v1
297 /* Shift r3 by 8 bytes and proceed. */
298 blt cr6, L(shift8)
299 b L(begin)
300
301 .align 4
302L(match):
303 /* There is a match of 16 bytes, check next bytes. */
304 cmpdi cr7, r31, 16
305 mr r29, r3
306 beq cr7, L(ret_r3)
307
308L(secondmatch):
309 addi r3, r3, 16
310 addi r4, r4, 16
311 /* Load next 16 bytes of r3 and r4 and compare. */
312 clrldi r10, r4, 60
313 cmpdi cr7, r10, 0
314 beq cr7, L(nextload)
315 /* Handle unaligned case. */
316 vor v6, v9, v9
317 vcmpequb. v7, v0, v6
318 beq cr6, L(nullchk2)
319 b L(trailcheck)
320
321 .align 4
322L(nullchk2):
323#ifdef __LITTLE_ENDIAN__
324 lvsr v7, 0, r4
325#else
326 lvsl v7, 0, r4
327#endif
328 addi r5, r4, 16
329 /* If r4 is unaligned, load another 16 bytes. */
330 lvx v9, 0, r5
331#ifdef __LITTLE_ENDIAN__
332 vperm v11, v9, v6, v7
333#else
334 vperm v11, v6, v9, v7
335#endif
336 b L(compare)
337
338 .align 4
339L(nextload):
340 lvx v11, 0, r4
341L(compare):
342 vcmpequb. v7, v0, v11
343 beq cr6, L(nullchk3)
344 b L(trailcheck)
345
346 .align 4
347L(nullchk3):
348 clrldi r10, r3, 60
349 cmpdi cr7, r10, 0
350 beq cr7, L(nextload1)
351 /* Handle unaligned case. */
352 vor v4, v10, v10
353 vcmpequb. v7, v0, v4
354 beq cr6, L(nullchk4)
355 b L(retnull)
356
357 .align 4
358L(nullchk4):
359#ifdef __LITTLE_ENDIAN__
360 lvsr v7, 0, r3
361#else
362 lvsl v7, 0, r3
363#endif
364 addi r5, r3, 16
365 /* If r3 is unaligned, load another 16 bytes. */
366 lvx v10, 0, r5
367#ifdef __LITTLE_ENDIAN__
368 vperm v4, v10, v4, v7
369#else
370 vperm v4, v4, v10, v7
371#endif
372 b L(compare1)
373
374 .align 4
375L(nextload1):
376 lvx v4, 0, r3
377L(compare1):
378 vcmpequb. v7, v0, v4
379 beq cr6, L(nullchk5)
380 b L(retnull)
381
382 .align 4
383L(nullchk5):
384 /* Convert both v3 and v4 to lower. */
385 TOLOWER(v11)
386 /* If both are same, branch to secondmatch. */
387 blt cr6, L(secondmatch)
388 /* Continue the search. */
389 b L(begin)
390
391 .align 4
392L(trailcheck):
393 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
394 add r11, r10, __libc_tsd_LOCALE@tls
395 ld r11, 0(r11)
396 ld r11, LOCALE_CTYPE_TOLOWER(r11)
397L(loop2):
398 lbz r5, 0(r3) /* Load byte from r3. */
399 lbz r6, 0(r4) /* Load next byte from r4. */
400 cmpdi cr7, r6, 0 /* Is it null? */
401 beq cr7, L(updater3)
402 cmpdi cr7, r5, 0 /* Is it null? */
403 beq cr7, L(retnull) /* If yes, return. */
404 addi r3, r3, 1
405 addi r4, r4, 1 /* Increment r4. */
406 sldi r10, r5, 2 /* Convert to lower case. */
407 lwzx r10, r11, r10
408 sldi r7, r6, 2 /* Convert to lower case. */
409 lwzx r7, r11, r7
410 cmpw cr7, r7, r10 /* Compare with byte from r4. */
411 bne cr7, L(begin)
412 b L(loop2)
413
414 .align 4
415L(shift8):
416 addi r8, r8, 7
417 b L(begin)
418 .align 4
419L(shift16):
420 addi r8, r8, 15
421 .align 4
422L(begin):
423 addi r8, r8, 1
424 mr r3, r8
425 /* When our iterations exceed ITERATIONS,fall back to default. */
426 addi r27, r27, 1
427 cmpdi cr7, r27, ITERATIONS
428 beq cr7, L(default)
429 mr r4, r30 /* Restore r4. */
430 b L(begin2)
431
432 /* Handling byte by byte. */
433 .align 4
434L(loop1):
435 mr r3, r8
436 addi r27, r27, 1
437 cmpdi cr7, r27, ITERATIONS
438 beq cr7, L(default)
439 mr r29, r8
440 srdi r4, r28, 8
441 /* Check if the first char is present. */
442 bl STRCHR
443 nop
444 mr r5, r3
445 mr r3, r29
446 mr r29, r5
447 sldi r4, r28, 56
448 srdi r4, r4, 56
449 bl STRCHR
450 nop
451 cmpdi cr7, r29, 0
452 beq cr7, L(nextpos)
453 cmpdi cr7, r3, 0
454 beq cr7, L(skipcheck1)
455 cmpw cr7, r3, r29
456 ble cr7, L(nextpos)
457 /* Move r3 to first occurence. */
458L(skipcheck1):
459 mr r3, r29
460L(nextpos):
461 mr r29, r3
462 cmpdi cr7, r3, 0
463 ble cr7, L(retnull)
464L(bytebybyte):
465 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
466 add r11, r10, __libc_tsd_LOCALE@tls
467 ld r11, 0(r11)
468 ld r11, LOCALE_CTYPE_TOLOWER(r11)
469 mr r4, r30 /* Restore r4. */
470 mr r8, r3 /* Save r3. */
471 addi r8, r8, 1
472
473L(loop):
474 addi r3, r3, 1
475 lbz r5, 0(r3) /* Load byte from r3. */
476 addi r4, r4, 1 /* Increment r4. */
477 lbz r6, 0(r4) /* Load next byte from r4. */
478 cmpdi cr7, r6, 0 /* Is it null? */
479 beq cr7, L(updater3)
480 cmpdi cr7, r5, 0 /* Is it null? */
481 beq cr7, L(retnull) /* If yes, return. */
482 sldi r10, r5, 2 /* Convert to lower case. */
483 lwzx r10, r11, r10
484 sldi r7, r6, 2 /* Convert to lower case. */
485 lwzx r7, r11, r7
486 cmpw cr7, r7, r10 /* Compare with byte from r4. */
487 bne cr7, L(loop1)
488 b L(loop)
489
490 /* Handling return values. */
491 .align 4
492L(updater3):
493 subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */
494 b L(end)
495
496 .align 4
497L(ret_r3):
498 mr r3, r29 /* Return point of match. */
499 b L(end)
500
501 .align 4
502L(retnull):
503 li r3, 0 /* Substring was not found. */
504 b L(end)
505
506 .align 4
507L(default):
508 mr r4, r30
509 bl __strcasestr_ppc
510 nop
511
512 .align 4
513L(end):
514 addi r1, r1, FRAMESIZE /* Restore stack pointer. */
515 cfi_adjust_cfa_offset(-FRAMESIZE)
516 ld r0, 16(r1) /* Restore the saved link register. */
517 ld r27, -40(r1)
518 ld r28, -32(r1)
519 ld r29, -24(r1) /* Restore callers save register r29. */
520 ld r30, -16(r1) /* Restore callers save register r30. */
521 ld r31, -8(r1) /* Restore callers save register r31. */
522 cfi_restore(lr)
523 cfi_restore(r27)
524 cfi_restore(r28)
525 cfi_restore(r29)
526 cfi_restore(r30)
527 cfi_restore(r31)
528 mtlr r0 /* Branch to link register. */
529 blr
c24480ce
TMQMF
530END (__strcasestr)
531
532weak_alias (__strcasestr, strcasestr)
533libc_hidden_def (__strcasestr)
e413b14e 534libc_hidden_builtin_def (strcasestr)