]>
Commit | Line | Data |
---|---|---|
e413b14e | 1 | /* Optimized strcasestr implementation for PowerPC64/POWER8. |
2b778ceb | 2 | Copyright (C) 2016-2021 Free Software Foundation, Inc. |
e413b14e RS |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
e413b14e RS |
18 | |
19 | #include <sysdep.h> | |
20 | #include <locale-defines.h> | |
21 | ||
22 | /* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */ | |
23 | ||
24 | /* The performance gain is obtained by comparing 16 bytes. */ | |
25 | ||
26 | /* When the first char of r4 is hit ITERATIONS times in r3 | |
27 | fallback to default. */ | |
28 | #define ITERATIONS 64 | |
29 | ||
f0748b70 WSM |
30 | #ifndef STRCASESTR |
31 | # define STRCASESTR __strcasestr | |
32 | #endif | |
33 | ||
e413b14e RS |
34 | #ifndef STRLEN |
35 | /* For builds without IFUNC support, local calls should be made to internal | |
36 | GLIBC symbol (created by libc_hidden_builtin_def). */ | |
37 | # ifdef SHARED | |
38 | # define STRLEN __GI_strlen | |
39 | # else | |
40 | # define STRLEN strlen | |
41 | # endif | |
42 | #endif | |
43 | ||
44 | #ifndef STRNLEN | |
45 | /* For builds without IFUNC support, local calls should be made to internal | |
46 | GLIBC symbol (created by libc_hidden_builtin_def). */ | |
47 | # ifdef SHARED | |
48 | # define STRNLEN __GI_strnlen | |
49 | # else | |
50 | # define STRNLEN __strnlen | |
51 | # endif | |
52 | #endif | |
53 | ||
54 | #ifndef STRCHR | |
55 | # ifdef SHARED | |
56 | # define STRCHR __GI_strchr | |
57 | # else | |
58 | # define STRCHR strchr | |
59 | # endif | |
60 | #endif | |
61 | ||
62 | /* Convert 16 bytes of v4 and reg to lowercase and compare. */ | |
63 | #define TOLOWER(reg) \ | |
64 | vcmpgtub v6, v4, v1; \ | |
65 | vcmpgtub v7, v2, v4; \ | |
66 | vand v8, v7, v6; \ | |
67 | vand v8, v8, v3; \ | |
68 | vor v4, v8, v4; \ | |
69 | vcmpgtub v6, reg, v1; \ | |
70 | vcmpgtub v7, v2, reg; \ | |
71 | vand v8, v7, v6; \ | |
72 | vand v8, v8, v3; \ | |
73 | vor reg, v8, reg; \ | |
74 | vcmpequb. v6, reg, v4; | |
75 | ||
e413b14e | 76 | #define FRAMESIZE (FRAME_MIN_SIZE+48) |
9250e661 | 77 | .machine power8 |
d5b41185 | 78 | ENTRY (STRCASESTR, 4) |
e413b14e RS |
79 | CALL_MCOUNT 2 |
80 | mflr r0 /* Load link register LR to r0. */ | |
81 | std r31, -8(r1) /* Save callers register r31. */ | |
82 | std r30, -16(r1) /* Save callers register r30. */ | |
83 | std r29, -24(r1) /* Save callers register r29. */ | |
84 | std r28, -32(r1) /* Save callers register r28. */ | |
85 | std r27, -40(r1) /* Save callers register r27. */ | |
86 | std r0, 16(r1) /* Store the link register. */ | |
87 | cfi_offset(r31, -8) | |
88 | cfi_offset(r30, -16) | |
89 | cfi_offset(r29, -24) | |
90 | cfi_offset(r28, -32) | |
91 | cfi_offset(r27, -40) | |
92 | cfi_offset(lr, 16) | |
93 | stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ | |
94 | cfi_adjust_cfa_offset(FRAMESIZE) | |
95 | ||
96 | dcbt 0, r3 | |
97 | dcbt 0, r4 | |
98 | cmpdi cr7, r3, 0 /* Input validation. */ | |
99 | beq cr7, L(retnull) | |
100 | cmpdi cr7, r4, 0 | |
101 | beq cr7, L(retnull) | |
102 | ||
103 | mr r29, r3 | |
104 | mr r30, r4 | |
105 | /* Load first byte from r4 and check if its null. */ | |
106 | lbz r6, 0(r4) | |
107 | cmpdi cr7, r6, 0 | |
108 | beq cr7, L(ret_r3) | |
109 | ||
110 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
111 | add r9, r10, __libc_tsd_LOCALE@tls | |
112 | ld r9, 0(r9) | |
113 | ld r9, LOCALE_CTYPE_TOUPPER(r9) | |
114 | sldi r10, r6, 2 /* Convert to upper case. */ | |
115 | lwzx r28, r9, r10 | |
116 | ||
117 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
118 | add r11, r10, __libc_tsd_LOCALE@tls | |
119 | ld r11, 0(r11) | |
120 | ld r11, LOCALE_CTYPE_TOLOWER(r11) | |
121 | sldi r10, r6, 2 /* Convert to lower case. */ | |
122 | lwzx r27, r11, r10 | |
123 | ||
124 | /* Check if the first char is present. */ | |
125 | mr r4, r27 | |
126 | bl STRCHR | |
127 | nop | |
128 | mr r5, r3 | |
129 | mr r3, r29 | |
130 | mr r29, r5 | |
131 | mr r4, r28 | |
132 | bl STRCHR | |
133 | nop | |
134 | cmpdi cr7, r29, 0 | |
135 | beq cr7, L(firstpos) | |
136 | cmpdi cr7, r3, 0 | |
137 | beq cr7, L(skipcheck) | |
138 | cmpw cr7, r3, r29 | |
139 | ble cr7, L(firstpos) | |
140 | /* Move r3 to the first occurence. */ | |
141 | L(skipcheck): | |
142 | mr r3, r29 | |
143 | L(firstpos): | |
144 | mr r29, r3 | |
145 | ||
146 | sldi r9, r27, 8 | |
147 | or r28, r9, r28 | |
148 | /* Reg r27 is used to count the number of iterations. */ | |
149 | li r27, 0 | |
150 | /* If first char of search str is not present. */ | |
151 | cmpdi cr7, r3, 0 | |
152 | ble cr7, L(end) | |
153 | ||
154 | /* Find the length of pattern. */ | |
155 | mr r3, r30 | |
156 | bl STRLEN | |
157 | nop | |
158 | ||
159 | cmpdi cr7, r3, 0 /* If search str is null. */ | |
160 | beq cr7, L(ret_r3) | |
161 | ||
162 | mr r31, r3 | |
163 | mr r4, r3 | |
164 | mr r3, r29 | |
165 | bl STRNLEN | |
166 | nop | |
167 | ||
168 | cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ | |
169 | blt cr7, L(retnull) | |
170 | ||
171 | mr r3, r29 | |
172 | ||
173 | /* Locales not matching ASCII for single bytes. */ | |
174 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
175 | add r9, r10, __libc_tsd_LOCALE@tls | |
176 | ld r9, 0(r9) | |
177 | ld r7, 0(r9) | |
178 | addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES | |
179 | lwz r8, 0(r7) | |
180 | cmpdi cr7, r8, 1 | |
181 | beq cr7, L(bytebybyte) | |
182 | ||
183 | /* If len(r4) < 16 handle byte by byte. */ | |
184 | /* For shorter strings we will not use vector registers. */ | |
185 | cmpdi cr7, r31, 16 | |
186 | blt cr7, L(bytebybyte) | |
187 | ||
188 | /* Comparison values used for TOLOWER. */ | |
189 | /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */ | |
190 | vspltish v0, 0 | |
191 | vspltisb v5, 2 | |
192 | vspltisb v4, 4 | |
193 | vsl v3, v5, v4 | |
194 | vaddubm v1, v3, v3 | |
195 | vspltisb v5, 15 | |
196 | vaddubm v2, v5, v5 | |
197 | vaddubm v2, v1, v2 | |
198 | vspltisb v4, -3 | |
199 | vaddubm v2, v2, v4 | |
200 | ||
201 | /* | |
202 | 1. Load 16 bytes from r3 and r4 | |
203 | 2. Check if there is null, If yes, proceed byte by byte path. | |
204 | 3. Else,Convert both to lowercase and compare. | |
205 | 4. If they are same proceed to 1. | |
206 | 5. If they dont match, find if first char of r4 is present in the | |
207 | loaded 16 byte of r3. | |
208 | 6. If yes, move position, load next 16 bytes of r3 and proceed to 2. | |
209 | */ | |
210 | ||
211 | mr r8, r3 /* Save r3 for future use. */ | |
212 | mr r4, r30 /* Restore r4. */ | |
213 | clrldi r10, r4, 60 | |
214 | lvx v5, 0, r4 /* Load 16 bytes from r4. */ | |
215 | cmpdi cr7, r10, 0 | |
216 | beq cr7, L(begin2) | |
217 | /* If r4 is unaligned, load another 16 bytes. */ | |
218 | #ifdef __LITTLE_ENDIAN__ | |
219 | lvsr v7, 0, r4 | |
220 | #else | |
221 | lvsl v7, 0, r4 | |
222 | #endif | |
223 | addi r5, r4, 16 | |
224 | lvx v9, 0, r5 | |
225 | #ifdef __LITTLE_ENDIAN__ | |
226 | vperm v5, v9, v5, v7 | |
227 | #else | |
228 | vperm v5, v5, v9, v7 | |
229 | #endif | |
230 | L(begin2): | |
231 | lvx v4, 0, r3 | |
232 | vcmpequb. v7, v0, v4 /* Check for null. */ | |
233 | beq cr6, L(nullchk6) | |
234 | b L(trailcheck) | |
235 | ||
236 | .align 4 | |
237 | L(nullchk6): | |
238 | clrldi r10, r3, 60 | |
239 | cmpdi cr7, r10, 0 | |
240 | beq cr7, L(next16) | |
241 | #ifdef __LITTLE_ENDIAN__ | |
242 | lvsr v7, 0, r3 | |
243 | #else | |
244 | lvsl v7, 0, r3 | |
245 | #endif | |
246 | addi r5, r3, 16 | |
247 | /* If r3 is unaligned, load another 16 bytes. */ | |
248 | lvx v10, 0, r5 | |
249 | #ifdef __LITTLE_ENDIAN__ | |
250 | vperm v4, v10, v4, v7 | |
251 | #else | |
252 | vperm v4, v4, v10, v7 | |
253 | #endif | |
254 | L(next16): | |
255 | vcmpequb. v6, v0, v5 /* Check for null. */ | |
256 | beq cr6, L(nullchk) | |
257 | b L(trailcheck) | |
258 | ||
259 | .align 4 | |
260 | L(nullchk): | |
261 | vcmpequb. v6, v0, v4 | |
262 | beq cr6, L(nullchk1) | |
263 | b L(retnull) | |
264 | ||
265 | .align 4 | |
266 | L(nullchk1): | |
267 | /* Convert both v3 and v4 to lower. */ | |
268 | TOLOWER(v5) | |
269 | /* If both are same, branch to match. */ | |
270 | blt cr6, L(match) | |
271 | /* Find if the first char is present in next 15 bytes. */ | |
272 | #ifdef __LITTLE_ENDIAN__ | |
273 | vspltb v6, v5, 15 | |
274 | vsldoi v7, v0, v4, 15 | |
275 | #else | |
276 | vspltb v6, v5, 0 | |
277 | vspltisb v7, 8 | |
278 | vslo v7, v4, v7 | |
279 | #endif | |
280 | vcmpequb v7, v6, v7 | |
281 | vcmpequb. v6, v0, v7 | |
282 | /* Shift r3 by 16 bytes and proceed. */ | |
283 | blt cr6, L(shift16) | |
9250e661 | 284 | vclzd v8, v7 |
e413b14e RS |
285 | #ifdef __LITTLE_ENDIAN__ |
286 | vspltb v6, v8, 15 | |
287 | #else | |
288 | vspltb v6, v8, 7 | |
289 | #endif | |
290 | vcmpequb. v6, v6, v1 | |
291 | /* Shift r3 by 8 bytes and proceed. */ | |
292 | blt cr6, L(shift8) | |
293 | b L(begin) | |
294 | ||
295 | .align 4 | |
296 | L(match): | |
297 | /* There is a match of 16 bytes, check next bytes. */ | |
298 | cmpdi cr7, r31, 16 | |
299 | mr r29, r3 | |
300 | beq cr7, L(ret_r3) | |
301 | ||
302 | L(secondmatch): | |
303 | addi r3, r3, 16 | |
304 | addi r4, r4, 16 | |
305 | /* Load next 16 bytes of r3 and r4 and compare. */ | |
306 | clrldi r10, r4, 60 | |
307 | cmpdi cr7, r10, 0 | |
308 | beq cr7, L(nextload) | |
309 | /* Handle unaligned case. */ | |
310 | vor v6, v9, v9 | |
311 | vcmpequb. v7, v0, v6 | |
312 | beq cr6, L(nullchk2) | |
313 | b L(trailcheck) | |
314 | ||
315 | .align 4 | |
316 | L(nullchk2): | |
317 | #ifdef __LITTLE_ENDIAN__ | |
318 | lvsr v7, 0, r4 | |
319 | #else | |
320 | lvsl v7, 0, r4 | |
321 | #endif | |
322 | addi r5, r4, 16 | |
323 | /* If r4 is unaligned, load another 16 bytes. */ | |
324 | lvx v9, 0, r5 | |
325 | #ifdef __LITTLE_ENDIAN__ | |
326 | vperm v11, v9, v6, v7 | |
327 | #else | |
328 | vperm v11, v6, v9, v7 | |
329 | #endif | |
330 | b L(compare) | |
331 | ||
332 | .align 4 | |
333 | L(nextload): | |
334 | lvx v11, 0, r4 | |
335 | L(compare): | |
336 | vcmpequb. v7, v0, v11 | |
337 | beq cr6, L(nullchk3) | |
338 | b L(trailcheck) | |
339 | ||
340 | .align 4 | |
341 | L(nullchk3): | |
342 | clrldi r10, r3, 60 | |
343 | cmpdi cr7, r10, 0 | |
344 | beq cr7, L(nextload1) | |
345 | /* Handle unaligned case. */ | |
346 | vor v4, v10, v10 | |
347 | vcmpequb. v7, v0, v4 | |
348 | beq cr6, L(nullchk4) | |
349 | b L(retnull) | |
350 | ||
351 | .align 4 | |
352 | L(nullchk4): | |
353 | #ifdef __LITTLE_ENDIAN__ | |
354 | lvsr v7, 0, r3 | |
355 | #else | |
356 | lvsl v7, 0, r3 | |
357 | #endif | |
358 | addi r5, r3, 16 | |
359 | /* If r3 is unaligned, load another 16 bytes. */ | |
360 | lvx v10, 0, r5 | |
361 | #ifdef __LITTLE_ENDIAN__ | |
362 | vperm v4, v10, v4, v7 | |
363 | #else | |
364 | vperm v4, v4, v10, v7 | |
365 | #endif | |
366 | b L(compare1) | |
367 | ||
368 | .align 4 | |
369 | L(nextload1): | |
370 | lvx v4, 0, r3 | |
371 | L(compare1): | |
372 | vcmpequb. v7, v0, v4 | |
373 | beq cr6, L(nullchk5) | |
374 | b L(retnull) | |
375 | ||
376 | .align 4 | |
377 | L(nullchk5): | |
378 | /* Convert both v3 and v4 to lower. */ | |
379 | TOLOWER(v11) | |
380 | /* If both are same, branch to secondmatch. */ | |
381 | blt cr6, L(secondmatch) | |
382 | /* Continue the search. */ | |
383 | b L(begin) | |
384 | ||
385 | .align 4 | |
386 | L(trailcheck): | |
387 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
388 | add r11, r10, __libc_tsd_LOCALE@tls | |
389 | ld r11, 0(r11) | |
390 | ld r11, LOCALE_CTYPE_TOLOWER(r11) | |
391 | L(loop2): | |
392 | lbz r5, 0(r3) /* Load byte from r3. */ | |
393 | lbz r6, 0(r4) /* Load next byte from r4. */ | |
394 | cmpdi cr7, r6, 0 /* Is it null? */ | |
395 | beq cr7, L(updater3) | |
396 | cmpdi cr7, r5, 0 /* Is it null? */ | |
397 | beq cr7, L(retnull) /* If yes, return. */ | |
398 | addi r3, r3, 1 | |
399 | addi r4, r4, 1 /* Increment r4. */ | |
400 | sldi r10, r5, 2 /* Convert to lower case. */ | |
401 | lwzx r10, r11, r10 | |
402 | sldi r7, r6, 2 /* Convert to lower case. */ | |
403 | lwzx r7, r11, r7 | |
404 | cmpw cr7, r7, r10 /* Compare with byte from r4. */ | |
405 | bne cr7, L(begin) | |
406 | b L(loop2) | |
407 | ||
408 | .align 4 | |
409 | L(shift8): | |
410 | addi r8, r8, 7 | |
411 | b L(begin) | |
412 | .align 4 | |
413 | L(shift16): | |
414 | addi r8, r8, 15 | |
415 | .align 4 | |
416 | L(begin): | |
417 | addi r8, r8, 1 | |
418 | mr r3, r8 | |
419 | /* When our iterations exceed ITERATIONS,fall back to default. */ | |
420 | addi r27, r27, 1 | |
421 | cmpdi cr7, r27, ITERATIONS | |
422 | beq cr7, L(default) | |
423 | mr r4, r30 /* Restore r4. */ | |
424 | b L(begin2) | |
425 | ||
426 | /* Handling byte by byte. */ | |
427 | .align 4 | |
428 | L(loop1): | |
429 | mr r3, r8 | |
430 | addi r27, r27, 1 | |
431 | cmpdi cr7, r27, ITERATIONS | |
432 | beq cr7, L(default) | |
433 | mr r29, r8 | |
434 | srdi r4, r28, 8 | |
435 | /* Check if the first char is present. */ | |
436 | bl STRCHR | |
437 | nop | |
438 | mr r5, r3 | |
439 | mr r3, r29 | |
440 | mr r29, r5 | |
441 | sldi r4, r28, 56 | |
442 | srdi r4, r4, 56 | |
443 | bl STRCHR | |
444 | nop | |
445 | cmpdi cr7, r29, 0 | |
446 | beq cr7, L(nextpos) | |
447 | cmpdi cr7, r3, 0 | |
448 | beq cr7, L(skipcheck1) | |
449 | cmpw cr7, r3, r29 | |
450 | ble cr7, L(nextpos) | |
451 | /* Move r3 to first occurence. */ | |
452 | L(skipcheck1): | |
453 | mr r3, r29 | |
454 | L(nextpos): | |
455 | mr r29, r3 | |
456 | cmpdi cr7, r3, 0 | |
457 | ble cr7, L(retnull) | |
458 | L(bytebybyte): | |
459 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
460 | add r11, r10, __libc_tsd_LOCALE@tls | |
461 | ld r11, 0(r11) | |
462 | ld r11, LOCALE_CTYPE_TOLOWER(r11) | |
463 | mr r4, r30 /* Restore r4. */ | |
464 | mr r8, r3 /* Save r3. */ | |
465 | addi r8, r8, 1 | |
466 | ||
467 | L(loop): | |
468 | addi r3, r3, 1 | |
469 | lbz r5, 0(r3) /* Load byte from r3. */ | |
470 | addi r4, r4, 1 /* Increment r4. */ | |
471 | lbz r6, 0(r4) /* Load next byte from r4. */ | |
472 | cmpdi cr7, r6, 0 /* Is it null? */ | |
473 | beq cr7, L(updater3) | |
474 | cmpdi cr7, r5, 0 /* Is it null? */ | |
475 | beq cr7, L(retnull) /* If yes, return. */ | |
476 | sldi r10, r5, 2 /* Convert to lower case. */ | |
477 | lwzx r10, r11, r10 | |
478 | sldi r7, r6, 2 /* Convert to lower case. */ | |
479 | lwzx r7, r11, r7 | |
480 | cmpw cr7, r7, r10 /* Compare with byte from r4. */ | |
481 | bne cr7, L(loop1) | |
482 | b L(loop) | |
483 | ||
484 | /* Handling return values. */ | |
485 | .align 4 | |
486 | L(updater3): | |
487 | subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */ | |
488 | b L(end) | |
489 | ||
490 | .align 4 | |
491 | L(ret_r3): | |
492 | mr r3, r29 /* Return point of match. */ | |
493 | b L(end) | |
494 | ||
495 | .align 4 | |
496 | L(retnull): | |
497 | li r3, 0 /* Substring was not found. */ | |
498 | b L(end) | |
499 | ||
500 | .align 4 | |
501 | L(default): | |
502 | mr r4, r30 | |
503 | bl __strcasestr_ppc | |
504 | nop | |
505 | ||
506 | .align 4 | |
507 | L(end): | |
508 | addi r1, r1, FRAMESIZE /* Restore stack pointer. */ | |
509 | cfi_adjust_cfa_offset(-FRAMESIZE) | |
510 | ld r0, 16(r1) /* Restore the saved link register. */ | |
511 | ld r27, -40(r1) | |
512 | ld r28, -32(r1) | |
513 | ld r29, -24(r1) /* Restore callers save register r29. */ | |
514 | ld r30, -16(r1) /* Restore callers save register r30. */ | |
515 | ld r31, -8(r1) /* Restore callers save register r31. */ | |
516 | cfi_restore(lr) | |
517 | cfi_restore(r27) | |
518 | cfi_restore(r28) | |
519 | cfi_restore(r29) | |
520 | cfi_restore(r30) | |
521 | cfi_restore(r31) | |
522 | mtlr r0 /* Branch to link register. */ | |
523 | blr | |
f0748b70 | 524 | END (STRCASESTR) |
c24480ce TMQMF |
525 | |
526 | weak_alias (__strcasestr, strcasestr) | |
527 | libc_hidden_def (__strcasestr) | |
e413b14e | 528 | libc_hidden_builtin_def (strcasestr) |