]>
Commit | Line | Data |
---|---|---|
e413b14e | 1 | /* Optimized strcasestr implementation for PowerPC64/POWER8. |
bfff8b1b | 2 | Copyright (C) 2016-2017 Free Software Foundation, Inc. |
e413b14e RS |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | #include <sysdep.h> | |
20 | #include <locale-defines.h> | |
21 | ||
22 | /* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */ | |
23 | ||
24 | /* The performance gain is obtained by comparing 16 bytes. */ | |
25 | ||
26 | /* When the first char of r4 is hit ITERATIONS times in r3 | |
27 | fallback to default. */ | |
28 | #define ITERATIONS 64 | |
29 | ||
30 | #ifndef STRLEN | |
31 | /* For builds without IFUNC support, local calls should be made to internal | |
32 | GLIBC symbol (created by libc_hidden_builtin_def). */ | |
33 | # ifdef SHARED | |
34 | # define STRLEN __GI_strlen | |
35 | # else | |
36 | # define STRLEN strlen | |
37 | # endif | |
38 | #endif | |
39 | ||
40 | #ifndef STRNLEN | |
41 | /* For builds without IFUNC support, local calls should be made to internal | |
42 | GLIBC symbol (created by libc_hidden_builtin_def). */ | |
43 | # ifdef SHARED | |
44 | # define STRNLEN __GI_strnlen | |
45 | # else | |
46 | # define STRNLEN __strnlen | |
47 | # endif | |
48 | #endif | |
49 | ||
50 | #ifndef STRCHR | |
51 | # ifdef SHARED | |
52 | # define STRCHR __GI_strchr | |
53 | # else | |
54 | # define STRCHR strchr | |
55 | # endif | |
56 | #endif | |
57 | ||
58 | /* Convert 16 bytes of v4 and reg to lowercase and compare. */ | |
59 | #define TOLOWER(reg) \ | |
60 | vcmpgtub v6, v4, v1; \ | |
61 | vcmpgtub v7, v2, v4; \ | |
62 | vand v8, v7, v6; \ | |
63 | vand v8, v8, v3; \ | |
64 | vor v4, v8, v4; \ | |
65 | vcmpgtub v6, reg, v1; \ | |
66 | vcmpgtub v7, v2, reg; \ | |
67 | vand v8, v7, v6; \ | |
68 | vand v8, v8, v3; \ | |
69 | vor reg, v8, reg; \ | |
70 | vcmpequb. v6, reg, v4; | |
71 | ||
72 | /* TODO: change these to the actual instructions when the minimum required | |
73 | binutils allows it. */ | |
74 | #ifdef _ARCH_PWR8 | |
75 | #define VCLZD_V8_v7 vclzd v8, v7; | |
76 | #else | |
77 | #define VCLZD_V8_v7 .long 0x11003fc2 | |
78 | #endif | |
79 | ||
80 | #define FRAMESIZE (FRAME_MIN_SIZE+48) | |
81 | /* TODO: change this to .machine power8 when the minimum required binutils | |
82 | allows it. */ | |
83 | .machine power7 | |
c24480ce | 84 | EALIGN (__strcasestr, 4, 0) |
e413b14e RS |
85 | CALL_MCOUNT 2 |
86 | mflr r0 /* Load link register LR to r0. */ | |
87 | std r31, -8(r1) /* Save callers register r31. */ | |
88 | std r30, -16(r1) /* Save callers register r30. */ | |
89 | std r29, -24(r1) /* Save callers register r29. */ | |
90 | std r28, -32(r1) /* Save callers register r28. */ | |
91 | std r27, -40(r1) /* Save callers register r27. */ | |
92 | std r0, 16(r1) /* Store the link register. */ | |
93 | cfi_offset(r31, -8) | |
94 | cfi_offset(r30, -16) | |
95 | cfi_offset(r29, -24) | |
96 | cfi_offset(r28, -32) | |
97 | cfi_offset(r27, -40) | |
98 | cfi_offset(lr, 16) | |
99 | stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ | |
100 | cfi_adjust_cfa_offset(FRAMESIZE) | |
101 | ||
102 | dcbt 0, r3 | |
103 | dcbt 0, r4 | |
104 | cmpdi cr7, r3, 0 /* Input validation. */ | |
105 | beq cr7, L(retnull) | |
106 | cmpdi cr7, r4, 0 | |
107 | beq cr7, L(retnull) | |
108 | ||
109 | mr r29, r3 | |
110 | mr r30, r4 | |
111 | /* Load first byte from r4 and check if its null. */ | |
112 | lbz r6, 0(r4) | |
113 | cmpdi cr7, r6, 0 | |
114 | beq cr7, L(ret_r3) | |
115 | ||
116 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
117 | add r9, r10, __libc_tsd_LOCALE@tls | |
118 | ld r9, 0(r9) | |
119 | ld r9, LOCALE_CTYPE_TOUPPER(r9) | |
120 | sldi r10, r6, 2 /* Convert to upper case. */ | |
121 | lwzx r28, r9, r10 | |
122 | ||
123 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
124 | add r11, r10, __libc_tsd_LOCALE@tls | |
125 | ld r11, 0(r11) | |
126 | ld r11, LOCALE_CTYPE_TOLOWER(r11) | |
127 | sldi r10, r6, 2 /* Convert to lower case. */ | |
128 | lwzx r27, r11, r10 | |
129 | ||
130 | /* Check if the first char is present. */ | |
131 | mr r4, r27 | |
132 | bl STRCHR | |
133 | nop | |
134 | mr r5, r3 | |
135 | mr r3, r29 | |
136 | mr r29, r5 | |
137 | mr r4, r28 | |
138 | bl STRCHR | |
139 | nop | |
140 | cmpdi cr7, r29, 0 | |
141 | beq cr7, L(firstpos) | |
142 | cmpdi cr7, r3, 0 | |
143 | beq cr7, L(skipcheck) | |
144 | cmpw cr7, r3, r29 | |
145 | ble cr7, L(firstpos) | |
146 | /* Move r3 to the first occurence. */ | |
147 | L(skipcheck): | |
148 | mr r3, r29 | |
149 | L(firstpos): | |
150 | mr r29, r3 | |
151 | ||
152 | sldi r9, r27, 8 | |
153 | or r28, r9, r28 | |
154 | /* Reg r27 is used to count the number of iterations. */ | |
155 | li r27, 0 | |
156 | /* If first char of search str is not present. */ | |
157 | cmpdi cr7, r3, 0 | |
158 | ble cr7, L(end) | |
159 | ||
160 | /* Find the length of pattern. */ | |
161 | mr r3, r30 | |
162 | bl STRLEN | |
163 | nop | |
164 | ||
165 | cmpdi cr7, r3, 0 /* If search str is null. */ | |
166 | beq cr7, L(ret_r3) | |
167 | ||
168 | mr r31, r3 | |
169 | mr r4, r3 | |
170 | mr r3, r29 | |
171 | bl STRNLEN | |
172 | nop | |
173 | ||
174 | cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ | |
175 | blt cr7, L(retnull) | |
176 | ||
177 | mr r3, r29 | |
178 | ||
179 | /* Locales not matching ASCII for single bytes. */ | |
180 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
181 | add r9, r10, __libc_tsd_LOCALE@tls | |
182 | ld r9, 0(r9) | |
183 | ld r7, 0(r9) | |
184 | addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES | |
185 | lwz r8, 0(r7) | |
186 | cmpdi cr7, r8, 1 | |
187 | beq cr7, L(bytebybyte) | |
188 | ||
189 | /* If len(r4) < 16 handle byte by byte. */ | |
190 | /* For shorter strings we will not use vector registers. */ | |
191 | cmpdi cr7, r31, 16 | |
192 | blt cr7, L(bytebybyte) | |
193 | ||
194 | /* Comparison values used for TOLOWER. */ | |
195 | /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */ | |
196 | vspltish v0, 0 | |
197 | vspltisb v5, 2 | |
198 | vspltisb v4, 4 | |
199 | vsl v3, v5, v4 | |
200 | vaddubm v1, v3, v3 | |
201 | vspltisb v5, 15 | |
202 | vaddubm v2, v5, v5 | |
203 | vaddubm v2, v1, v2 | |
204 | vspltisb v4, -3 | |
205 | vaddubm v2, v2, v4 | |
206 | ||
207 | /* | |
208 | 1. Load 16 bytes from r3 and r4 | |
209 | 2. Check if there is null, If yes, proceed byte by byte path. | |
210 | 3. Else,Convert both to lowercase and compare. | |
211 | 4. If they are same proceed to 1. | |
212 | 5. If they dont match, find if first char of r4 is present in the | |
213 | loaded 16 byte of r3. | |
214 | 6. If yes, move position, load next 16 bytes of r3 and proceed to 2. | |
215 | */ | |
216 | ||
217 | mr r8, r3 /* Save r3 for future use. */ | |
218 | mr r4, r30 /* Restore r4. */ | |
219 | clrldi r10, r4, 60 | |
220 | lvx v5, 0, r4 /* Load 16 bytes from r4. */ | |
221 | cmpdi cr7, r10, 0 | |
222 | beq cr7, L(begin2) | |
223 | /* If r4 is unaligned, load another 16 bytes. */ | |
224 | #ifdef __LITTLE_ENDIAN__ | |
225 | lvsr v7, 0, r4 | |
226 | #else | |
227 | lvsl v7, 0, r4 | |
228 | #endif | |
229 | addi r5, r4, 16 | |
230 | lvx v9, 0, r5 | |
231 | #ifdef __LITTLE_ENDIAN__ | |
232 | vperm v5, v9, v5, v7 | |
233 | #else | |
234 | vperm v5, v5, v9, v7 | |
235 | #endif | |
236 | L(begin2): | |
237 | lvx v4, 0, r3 | |
238 | vcmpequb. v7, v0, v4 /* Check for null. */ | |
239 | beq cr6, L(nullchk6) | |
240 | b L(trailcheck) | |
241 | ||
242 | .align 4 | |
243 | L(nullchk6): | |
244 | clrldi r10, r3, 60 | |
245 | cmpdi cr7, r10, 0 | |
246 | beq cr7, L(next16) | |
247 | #ifdef __LITTLE_ENDIAN__ | |
248 | lvsr v7, 0, r3 | |
249 | #else | |
250 | lvsl v7, 0, r3 | |
251 | #endif | |
252 | addi r5, r3, 16 | |
253 | /* If r3 is unaligned, load another 16 bytes. */ | |
254 | lvx v10, 0, r5 | |
255 | #ifdef __LITTLE_ENDIAN__ | |
256 | vperm v4, v10, v4, v7 | |
257 | #else | |
258 | vperm v4, v4, v10, v7 | |
259 | #endif | |
260 | L(next16): | |
261 | vcmpequb. v6, v0, v5 /* Check for null. */ | |
262 | beq cr6, L(nullchk) | |
263 | b L(trailcheck) | |
264 | ||
265 | .align 4 | |
266 | L(nullchk): | |
267 | vcmpequb. v6, v0, v4 | |
268 | beq cr6, L(nullchk1) | |
269 | b L(retnull) | |
270 | ||
271 | .align 4 | |
272 | L(nullchk1): | |
273 | /* Convert both v3 and v4 to lower. */ | |
274 | TOLOWER(v5) | |
275 | /* If both are same, branch to match. */ | |
276 | blt cr6, L(match) | |
277 | /* Find if the first char is present in next 15 bytes. */ | |
278 | #ifdef __LITTLE_ENDIAN__ | |
279 | vspltb v6, v5, 15 | |
280 | vsldoi v7, v0, v4, 15 | |
281 | #else | |
282 | vspltb v6, v5, 0 | |
283 | vspltisb v7, 8 | |
284 | vslo v7, v4, v7 | |
285 | #endif | |
286 | vcmpequb v7, v6, v7 | |
287 | vcmpequb. v6, v0, v7 | |
288 | /* Shift r3 by 16 bytes and proceed. */ | |
289 | blt cr6, L(shift16) | |
290 | VCLZD_V8_v7 | |
291 | #ifdef __LITTLE_ENDIAN__ | |
292 | vspltb v6, v8, 15 | |
293 | #else | |
294 | vspltb v6, v8, 7 | |
295 | #endif | |
296 | vcmpequb. v6, v6, v1 | |
297 | /* Shift r3 by 8 bytes and proceed. */ | |
298 | blt cr6, L(shift8) | |
299 | b L(begin) | |
300 | ||
301 | .align 4 | |
302 | L(match): | |
303 | /* There is a match of 16 bytes, check next bytes. */ | |
304 | cmpdi cr7, r31, 16 | |
305 | mr r29, r3 | |
306 | beq cr7, L(ret_r3) | |
307 | ||
308 | L(secondmatch): | |
309 | addi r3, r3, 16 | |
310 | addi r4, r4, 16 | |
311 | /* Load next 16 bytes of r3 and r4 and compare. */ | |
312 | clrldi r10, r4, 60 | |
313 | cmpdi cr7, r10, 0 | |
314 | beq cr7, L(nextload) | |
315 | /* Handle unaligned case. */ | |
316 | vor v6, v9, v9 | |
317 | vcmpequb. v7, v0, v6 | |
318 | beq cr6, L(nullchk2) | |
319 | b L(trailcheck) | |
320 | ||
321 | .align 4 | |
322 | L(nullchk2): | |
323 | #ifdef __LITTLE_ENDIAN__ | |
324 | lvsr v7, 0, r4 | |
325 | #else | |
326 | lvsl v7, 0, r4 | |
327 | #endif | |
328 | addi r5, r4, 16 | |
329 | /* If r4 is unaligned, load another 16 bytes. */ | |
330 | lvx v9, 0, r5 | |
331 | #ifdef __LITTLE_ENDIAN__ | |
332 | vperm v11, v9, v6, v7 | |
333 | #else | |
334 | vperm v11, v6, v9, v7 | |
335 | #endif | |
336 | b L(compare) | |
337 | ||
338 | .align 4 | |
339 | L(nextload): | |
340 | lvx v11, 0, r4 | |
341 | L(compare): | |
342 | vcmpequb. v7, v0, v11 | |
343 | beq cr6, L(nullchk3) | |
344 | b L(trailcheck) | |
345 | ||
346 | .align 4 | |
347 | L(nullchk3): | |
348 | clrldi r10, r3, 60 | |
349 | cmpdi cr7, r10, 0 | |
350 | beq cr7, L(nextload1) | |
351 | /* Handle unaligned case. */ | |
352 | vor v4, v10, v10 | |
353 | vcmpequb. v7, v0, v4 | |
354 | beq cr6, L(nullchk4) | |
355 | b L(retnull) | |
356 | ||
357 | .align 4 | |
358 | L(nullchk4): | |
359 | #ifdef __LITTLE_ENDIAN__ | |
360 | lvsr v7, 0, r3 | |
361 | #else | |
362 | lvsl v7, 0, r3 | |
363 | #endif | |
364 | addi r5, r3, 16 | |
365 | /* If r3 is unaligned, load another 16 bytes. */ | |
366 | lvx v10, 0, r5 | |
367 | #ifdef __LITTLE_ENDIAN__ | |
368 | vperm v4, v10, v4, v7 | |
369 | #else | |
370 | vperm v4, v4, v10, v7 | |
371 | #endif | |
372 | b L(compare1) | |
373 | ||
374 | .align 4 | |
375 | L(nextload1): | |
376 | lvx v4, 0, r3 | |
377 | L(compare1): | |
378 | vcmpequb. v7, v0, v4 | |
379 | beq cr6, L(nullchk5) | |
380 | b L(retnull) | |
381 | ||
382 | .align 4 | |
383 | L(nullchk5): | |
384 | /* Convert both v3 and v4 to lower. */ | |
385 | TOLOWER(v11) | |
386 | /* If both are same, branch to secondmatch. */ | |
387 | blt cr6, L(secondmatch) | |
388 | /* Continue the search. */ | |
389 | b L(begin) | |
390 | ||
391 | .align 4 | |
392 | L(trailcheck): | |
393 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
394 | add r11, r10, __libc_tsd_LOCALE@tls | |
395 | ld r11, 0(r11) | |
396 | ld r11, LOCALE_CTYPE_TOLOWER(r11) | |
397 | L(loop2): | |
398 | lbz r5, 0(r3) /* Load byte from r3. */ | |
399 | lbz r6, 0(r4) /* Load next byte from r4. */ | |
400 | cmpdi cr7, r6, 0 /* Is it null? */ | |
401 | beq cr7, L(updater3) | |
402 | cmpdi cr7, r5, 0 /* Is it null? */ | |
403 | beq cr7, L(retnull) /* If yes, return. */ | |
404 | addi r3, r3, 1 | |
405 | addi r4, r4, 1 /* Increment r4. */ | |
406 | sldi r10, r5, 2 /* Convert to lower case. */ | |
407 | lwzx r10, r11, r10 | |
408 | sldi r7, r6, 2 /* Convert to lower case. */ | |
409 | lwzx r7, r11, r7 | |
410 | cmpw cr7, r7, r10 /* Compare with byte from r4. */ | |
411 | bne cr7, L(begin) | |
412 | b L(loop2) | |
413 | ||
414 | .align 4 | |
415 | L(shift8): | |
416 | addi r8, r8, 7 | |
417 | b L(begin) | |
418 | .align 4 | |
419 | L(shift16): | |
420 | addi r8, r8, 15 | |
421 | .align 4 | |
422 | L(begin): | |
423 | addi r8, r8, 1 | |
424 | mr r3, r8 | |
425 | /* When our iterations exceed ITERATIONS,fall back to default. */ | |
426 | addi r27, r27, 1 | |
427 | cmpdi cr7, r27, ITERATIONS | |
428 | beq cr7, L(default) | |
429 | mr r4, r30 /* Restore r4. */ | |
430 | b L(begin2) | |
431 | ||
432 | /* Handling byte by byte. */ | |
433 | .align 4 | |
434 | L(loop1): | |
435 | mr r3, r8 | |
436 | addi r27, r27, 1 | |
437 | cmpdi cr7, r27, ITERATIONS | |
438 | beq cr7, L(default) | |
439 | mr r29, r8 | |
440 | srdi r4, r28, 8 | |
441 | /* Check if the first char is present. */ | |
442 | bl STRCHR | |
443 | nop | |
444 | mr r5, r3 | |
445 | mr r3, r29 | |
446 | mr r29, r5 | |
447 | sldi r4, r28, 56 | |
448 | srdi r4, r4, 56 | |
449 | bl STRCHR | |
450 | nop | |
451 | cmpdi cr7, r29, 0 | |
452 | beq cr7, L(nextpos) | |
453 | cmpdi cr7, r3, 0 | |
454 | beq cr7, L(skipcheck1) | |
455 | cmpw cr7, r3, r29 | |
456 | ble cr7, L(nextpos) | |
457 | /* Move r3 to first occurence. */ | |
458 | L(skipcheck1): | |
459 | mr r3, r29 | |
460 | L(nextpos): | |
461 | mr r29, r3 | |
462 | cmpdi cr7, r3, 0 | |
463 | ble cr7, L(retnull) | |
464 | L(bytebybyte): | |
465 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) | |
466 | add r11, r10, __libc_tsd_LOCALE@tls | |
467 | ld r11, 0(r11) | |
468 | ld r11, LOCALE_CTYPE_TOLOWER(r11) | |
469 | mr r4, r30 /* Restore r4. */ | |
470 | mr r8, r3 /* Save r3. */ | |
471 | addi r8, r8, 1 | |
472 | ||
473 | L(loop): | |
474 | addi r3, r3, 1 | |
475 | lbz r5, 0(r3) /* Load byte from r3. */ | |
476 | addi r4, r4, 1 /* Increment r4. */ | |
477 | lbz r6, 0(r4) /* Load next byte from r4. */ | |
478 | cmpdi cr7, r6, 0 /* Is it null? */ | |
479 | beq cr7, L(updater3) | |
480 | cmpdi cr7, r5, 0 /* Is it null? */ | |
481 | beq cr7, L(retnull) /* If yes, return. */ | |
482 | sldi r10, r5, 2 /* Convert to lower case. */ | |
483 | lwzx r10, r11, r10 | |
484 | sldi r7, r6, 2 /* Convert to lower case. */ | |
485 | lwzx r7, r11, r7 | |
486 | cmpw cr7, r7, r10 /* Compare with byte from r4. */ | |
487 | bne cr7, L(loop1) | |
488 | b L(loop) | |
489 | ||
490 | /* Handling return values. */ | |
491 | .align 4 | |
492 | L(updater3): | |
493 | subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */ | |
494 | b L(end) | |
495 | ||
496 | .align 4 | |
497 | L(ret_r3): | |
498 | mr r3, r29 /* Return point of match. */ | |
499 | b L(end) | |
500 | ||
501 | .align 4 | |
502 | L(retnull): | |
503 | li r3, 0 /* Substring was not found. */ | |
504 | b L(end) | |
505 | ||
506 | .align 4 | |
507 | L(default): | |
508 | mr r4, r30 | |
509 | bl __strcasestr_ppc | |
510 | nop | |
511 | ||
512 | .align 4 | |
513 | L(end): | |
514 | addi r1, r1, FRAMESIZE /* Restore stack pointer. */ | |
515 | cfi_adjust_cfa_offset(-FRAMESIZE) | |
516 | ld r0, 16(r1) /* Restore the saved link register. */ | |
517 | ld r27, -40(r1) | |
518 | ld r28, -32(r1) | |
519 | ld r29, -24(r1) /* Restore callers save register r29. */ | |
520 | ld r30, -16(r1) /* Restore callers save register r30. */ | |
521 | ld r31, -8(r1) /* Restore callers save register r31. */ | |
522 | cfi_restore(lr) | |
523 | cfi_restore(r27) | |
524 | cfi_restore(r28) | |
525 | cfi_restore(r29) | |
526 | cfi_restore(r30) | |
527 | cfi_restore(r31) | |
528 | mtlr r0 /* Branch to link register. */ | |
529 | blr | |
c24480ce TMQMF |
530 | END (__strcasestr) |
531 | ||
532 | weak_alias (__strcasestr, strcasestr) | |
533 | libc_hidden_def (__strcasestr) | |
e413b14e | 534 | libc_hidden_builtin_def (strcasestr) |