]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/multiarch/strstr.c
SSE4.2 strstr/strcasestr for x86-64.
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / strstr.c
CommitLineData
2b7a8664
L
1/* strstr with SSE4.2 intrinsics
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21#include <nmmintrin.h>
22
23#ifndef STRSTR_SSE42
24# define STRSTR_SSE42 __strstr_sse42
25#endif
26
27#ifdef USE_AS_STRCASESTR
28# include <ctype.h>
29# include <locale.h>
30# include <string.h>
31
32# define LOADBYTE(C) tolower (C)
33# define CMPBYTE(C1, C2) \
34 ((C1) == (C2) || tolower (C1) == tolower (C2))
35#else
36# define LOADBYTE(C) (C)
37# define CMPBYTE(C1, C2) ((C1) == (C2))
38#endif
39
40/* We use 0xe ordered-compare:
41 _SIDD_SBYTE_OPS
42 | _SIDD_CMP_EQUAL_ORDER
43 | _SIDD_LEAST_SIGNIFICANT
44 on pcmpistri to do the scanning and string comparsion requirements of
45 sub-string match. In the scanning phase, we process Cflag and ECX
46 index to locate the first fragment match; once the first fragment
47 match position has been identified, we do comparison of subsequent
48 string fragments until we can conclude false or true match; whe
49 n concluding a false match, we may need to repeat scanning process
50 from next relevant offset in the target string.
51
52 In the scanning phase we have 4 cases:
53 case ECX CFlag ZFlag SFlag
54 1 16 0 0 0
55 2a 16 0 0 1
56 2b 16 0 1 0
57 2c 16 0 1 1
58
59 1. No ordered-comparison match, both 16B fragments are valid, so
60 continue to next fragment.
61 2. No ordered-comparison match, there is EOS in either fragment,
62 2a. Zflg = 0, Sflg = 1, we continue
63 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
64 2c. Zflg = 1, sflg = 1, lenth determine match or no match
65
66 In the string comparison phase, the 1st fragment match is fixed up
67 to produce ECX = 0. Subsequent fragment compare of nonzero index
68 and no match conclude a false match.
69
70 case ECX CFlag ZFlag SFlag
71 3 X 1 0 0/1
72 4a 0 1 0 0
73 4b 0 1 0 1
74 4c 0 < X 1 0 0/1
75 5 16 0 1 0
76
77 3. An initial ordered-comparison fragment match, we fix up to do
78 subsequent string comparison
79 4a. Continuation of fragment comparison of a string compare.
80 4b. EOS reached in the reference string, we conclude true match and
81 return
82 4c. String compare failed if index is nonzero, we need to go back to
83 scanning
84 5. failed string compare, go back to scanning
85 */
86
87/* Fix-up of removal of unneeded data due to 16B aligned load
88 parameters:
89 value: 16B data loaded from 16B aligned address.
90 offset: Offset of target data address relative to 16B aligned load
91 address.
92 */
93
94static __inline__ __m128i
95__m128i_shift_right (__m128i value, int offset)
96{
97 switch (offset)
98 {
99 case 1:
100 value = _mm_srli_si128 (value, 1);
101 break;
102 case 2:
103 value = _mm_srli_si128 (value, 2);
104 break;
105 case 3:
106 value = _mm_srli_si128 (value, 3);
107 break;
108 case 4:
109 value = _mm_srli_si128 (value, 4);
110 break;
111 case 5:
112 value = _mm_srli_si128 (value, 5);
113 break;
114 case 6:
115 value = _mm_srli_si128 (value, 6);
116 break;
117 case 7:
118 value = _mm_srli_si128 (value, 7);
119 break;
120 case 8:
121 value = _mm_srli_si128 (value, 8);
122 break;
123 case 9:
124 value = _mm_srli_si128 (value, 9);
125 break;
126 case 10:
127 value = _mm_srli_si128 (value, 10);
128 break;
129 case 11:
130 value = _mm_srli_si128 (value, 11);
131 break;
132 case 12:
133 value = _mm_srli_si128 (value, 12);
134 break;
135 case 13:
136 value = _mm_srli_si128 (value, 13);
137 break;
138 case 14:
139 value = _mm_srli_si128 (value, 14);
140 break;
141 case 15:
142 value = _mm_srli_si128 (value, 15);
143 break;
144 }
145 return value;
146}
147
148/* Simple replacement of movdqu to address 4KB boundary cross issue.
149 If EOS occurs within less than 16B before 4KB boundary, we don't
150 cross to next page. */
151
152static __m128i
153__attribute__ ((section (".text.sse4.2")))
154__m128i_strloadu (const unsigned char * p)
155{
156 int offset = ((size_t) p & (16 - 1));
157
158 if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
159 {
160 __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
161 __m128i zero = _mm_setzero_si128 ();
162 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
163 if ((bmsk >> offset) != 0)
164 return __m128i_shift_right (a, offset);
165 }
166 return _mm_loadu_si128 ((__m128i *) p);
167}
168
169#ifdef USE_AS_STRCASESTR
170
171/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
172 locale. */
173
174static __m128i
175__attribute__ ((section (".text.sse4.2")))
176__m128i_strloadu_tolower_posix (const unsigned char * p)
177{
178 __m128i frag = __m128i_strloadu (p);
179
180 /* Convert frag to lower case for POSIX/C locale. */
181 __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
182 __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
183 __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44);
184 __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1);
185 mask2 = _mm_sub_epi8 (mask2, u2ldelta);
186 return _mm_blendv_epi8 (frag, mask2, mask1);
187}
188
189/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
190 locale. */
191
192static __m128i
193__attribute__ ((section (".text.sse4.2")))
194__m128i_strloadu_tolower (const unsigned char * p)
195{
196 union
197 {
198 char b[16];
199 __m128i x;
200 } u;
201
202 for (int i = 0; i < 16; i++)
203 if (p[i] == 0)
204 {
205 u.b[i] = 0;
206 break;
207 }
208 else
209 u.b[i] = tolower (p[i]);
210
211 return u.x;
212}
213#endif
214
215/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
216 algorithm) overlap for a fully populated 16B vector.
217 Input parameter: 1st 16Byte loaded from the reference string of a
218 strstr function.
219 We don't use KMP algorithm if reference string is less than 16B.
220 */
221
222static int
223__inline__ __attribute__ ((__always_inline__,))
224KMP16Bovrlap (__m128i s2)
225{
226 __m128i a, b;
227 int bmsk, k1;
228
229 b = _mm_unpacklo_epi8 (s2, s2);
230 a = _mm_unpacklo_epi8 (b, b);
231 a = _mm_shuffle_epi32 (a, 0);
232 b = _mm_srli_si128 (s2, sizeof (char));
233 bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
234
235 /* _BitScanForward(&k1, bmsk); */
236 __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
237 if (!bmsk)
238 return 16;
239 else if (bmsk == 0x7fff)
240 return 1;
241 else if (!k1)
242 {
243 /* There are al least two ditinct char in s2. If byte 0 and 1 are
244 idential and the distinct value lies farther down, we can deduce
245 the next byte offset to restart full compare is least no earlier
246 than byte 3. */
247 return 3;
248 }
249 else
250 {
251 /* Byte 1 is not degenerated to byte 0. */
252 return k1 + 1;
253 }
254}
255
256char *
257__attribute__ ((section (".text.sse4.2")))
258STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
259{
260 int len, len1;
261 const unsigned char *p1 = s1;
262 const unsigned char *p2 = s2;
263 __m128i frag1, frag2, zero;
264 int cmp, cmp_c, cmp_z, cmp_s;
265 int kmp_fwd, bmsk, bmsk1;
266 const unsigned char *pt;
267
268 if (!p2[0])
269 return (char *) p1;
270
271 if (!p1[0])
272 return NULL;
273
274 /* Check if p1 length is 1 byte long. */
275 if (!p1[1])
276 return !p2[1] && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
277
278#ifdef USE_AS_STRCASESTR
279 __m128i (*strloadu) (const unsigned char *);
280 const char *used_locale = setlocale (LC_CTYPE, NULL);
281
282 if (!used_locale
283 || (used_locale[0] == 'C' && used_locale[1] == '\0')
284 || strcmp (used_locale, "POSIX") == 0)
285 strloadu = __m128i_strloadu_tolower_posix;
286 else
287 strloadu = __m128i_strloadu_tolower;
288#else
289# define strloadu __m128i_strloadu
290#endif
291
292 /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
293 frag1 = strloadu (p1);
294
295 if (p2[1])
296 {
297 /* p2 is > 1 byte long. */
298 frag2 = strloadu (p2);
299 }
300 else
301 {
302 zero = _mm_setzero_si128 ();
303 frag2 = _mm_insert_epi8 (zero, LOADBYTE(p2[0]), 0);
304 }
305
306 /* Unsigned bytes, equal order, does frag2 has null? */
307 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
308 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
309 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
310 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
311 if (cmp_s & cmp_c)
312 {
313 zero = _mm_setzero_si128 ();
314 bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
315 __asm ("bsfl %[bmsk], %[len]"
316 : [len] "=r" (len) : [bmsk] "r" (bmsk));
317 p1 += cmp;
318 if ((len + cmp) <= 16)
319 return (char *) p1;
320 else
321 {
322 /* Load up to 16 bytes of fragment. */
323 frag1 = strloadu (p1);
324 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
325 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
326 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
327 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
328 if ((len + cmp) <= 16)
329 return (char *) p1 + cmp;
330 }
331 }
332
333 if (cmp_s)
334 {
335 /* Adjust addr for 16B alginment in ensuing loop. */
336 while (!cmp_z)
337 {
338 p1 += cmp;
339 /* Load up to 16 bytes of fragment. */
340 frag1 = strloadu (p1);
341 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
342 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
343 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
344 /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
345 once already, this time cmp will be zero and we can exit. */
346 if ((!cmp) & cmp_c)
347 break;
348 }
349
350 if (!cmp_c)
351 return NULL;
352 else
353 {
354 /* Since s2 is less than 16 bytes, com_c is definitive
355 determination of full match. */
356 return (char *) p1 + cmp;
357 }
358 }
359
360 /* General case, s2 is at least 16 bytes or more.
361 First, the common case of false-match at first byte of p2. */
362 pt = NULL;
363 kmp_fwd = 0;
364re_trace:
365 while (!cmp_c)
366 {
367 /* frag1 has null. */
368 if (cmp_z)
369 return NULL;
370
371 /* frag 1 has no null, advance 16 bytes. */
372 p1 += 16;
373 /* Load up to 16 bytes of fragment. */
374 frag1 = strloadu (p1);
375 /* Unsigned bytes, equal order, is there a partial match? */
376 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
377 cmp = _mm_cmpistri(frag2, frag1, 0x0c);
378 cmp_z = _mm_cmpistrz(frag2, frag1, 0x0c);
379 }
380
381 /* Next, handle inital positive match as first byte of p2. We have
382 a partial fragment match, make full determination until we reached
383 end of s2. */
384 if (!cmp)
385 {
386 if (cmp_z)
387 return (char *) p1;
388
389 pt = p1;
390 p1 += 16;
391 p2 += 16;
392 /* Load up to 16 bytes of fragment. */
393 frag2 = strloadu(p2);
394 }
395 else
396 {
397 /* Adjust 16B alignment. */
398 p1 += cmp;
399 pt = p1;
400 }
401
402 /* Load up to 16 bytes of fragment. */
403 frag1 = strloadu (p1);
404
405 /* Unsigned bytes, equal order, does frag2 has null? */
406 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
407 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
408 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
409 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
410 while (!(cmp | cmp_z | cmp_s))
411 {
412 p1 += 16;
413 p2 += 16;
414 /* Load up to 16 bytes of fragment. */
415 frag2 = strloadu (p2);
416 /* Load up to 16 bytes of fragment. */
417 frag1 = strloadu (p1);
418 /* Unsigned bytes, equal order, does frag2 has null? */
419 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
420 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
421 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
422 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
423 }
424
425 /* Full determination yielded false result, retrace s1 to next
426 starting position.
427 Zflg 1 0 1 0/1
428 Sflg 0 1 1 0/1
429 cmp na 0 0 >0
430 action done done continue continue if s2 < s1
431 false match retrace s1 else false
432 */
433
434 if(cmp_s & !cmp)
435 return (char *) pt;
436 else if (cmp_z)
437 {
438 if (!cmp_s)
439 return NULL;
440
441 /* Handle both zero and sign flag set and s1 is shorter in
442 length. */
443 zero = _mm_setzero_si128 ();
444 bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
445 bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
446 __asm ("bsfl %[bmsk], %[len]"
447 : [len] "=r" (len) : [bmsk] "r" (bmsk));
448 __asm ("bsfl %[bmsk1], %[len1]"
449 : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
450 if (len >= len1)
451 return NULL;
452 }
453 else if (!cmp)
454 return (char *) pt;
455
456 /* Otherwise, we have to retrace and continue. Default of multiple
457 paths that need to retrace from next byte in s1. */
458 p2 = s2;
459 frag2 = strloadu (p2);
460
461 if (!kmp_fwd)
462 kmp_fwd = KMP16Bovrlap (frag2);
463
464 /* KMP algorithm predicted overlap needs to be corrected for
465 partial fragment compare. */
466 p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
467
468 /* Since s2 is at least 16 bytes long, we're certain there is no
469 match. */
470 if (!p1[0])
471 return NULL;
472 else
473 {
474 /* Load up to 16 bytes of fragment. */
475 frag1 = strloadu (p1);
476 }
477
478 /* Unsigned bytes, equal order, is there a partial match? */
479 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
480 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
481 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
482 goto re_trace;
483}