/* strstr with SSE4.2 intrinsics
   Copyright (C) 2009-2013 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <nmmintrin.h>
#include "varshift.h"

#ifndef STRSTR_SSE42
# define STRSTR_SSE42 __strstr_sse42
#endif

#ifdef USE_AS_STRCASESTR
# include <ctype.h>
# include <locale/localeinfo.h>

# define LOADBYTE(C) tolower (C)
# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
#else
# define LOADBYTE(C) (C)
# define CMPBYTE(C1, C2) ((C1) == (C2))
#endif

/* We use the 0xc ordered-compare:
        _SIDD_UBYTE_OPS
        | _SIDD_CMP_EQUAL_ORDERED
        | _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to do the scanning and string comparison required by
   sub-string matching.  In the scanning phase, we process the CFlag and
   the ECX index to locate the first fragment match; once the first
   fragment match position has been identified, we compare subsequent
   string fragments until we can conclude a false or true match; when
   concluding a false match, we may need to repeat the scanning process
   from the next relevant offset in the target string.

   In the scanning phase we have 4 cases:
     case  ECX  CFlag  ZFlag  SFlag
      1    16   0      0      0
      2a   16   0      0      1
      2b   16   0      1      0
      2c   16   0      1      1

   1. No ordered-comparison match, both 16B fragments are valid, so
      continue to the next fragment.
   2. No ordered-comparison match, there is EOS in either fragment:
      2a. Zflg = 0, Sflg = 1, we continue.
      2b. Zflg = 1, Sflg = 0, we conclude no match and return.
      2c. Zflg = 1, Sflg = 1, length determines match or no match.

   In the string comparison phase, the 1st fragment match is fixed up
   to produce ECX = 0.  A subsequent fragment compare with a nonzero
   index and no match concludes a false match.

     case  ECX    CFlag  ZFlag  SFlag
      3    X      1      0      0/1
      4a   0      1      0      0
      4b   0      1      0      1
      4c   0 < X  1      0      0/1
      5    16     0      1      0

   3. An initial ordered-comparison fragment match; we fix up to do
      subsequent string comparison.
   4a. Continuation of fragment comparison of a string compare.
   4b. EOS reached in the reference string, we conclude a true match and
       return.
   4c. String compare failed if the index is nonzero, we need to go back
       to scanning.
   5. Failed string compare, go back to scanning.
 */
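
/* For illustration (values are hypothetical): suppose s2 begins "abc"
   and the current 16B fragment of s1 is "xxxxabcyyyyyyyyy".  The
   ordered compare of the two fragments sets the CFlag and returns
   ECX = 4, the offset at which the needle fragment starts to match
   inside the s1 fragment.  The code below then advances p1 by ECX, so
   the next ordered compare is expected to report ECX = 0, which is how
   the comparison phase verifies the candidate match.  */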

/* Simple replacement for movdqu to address the 4KB boundary-crossing
   issue.  If EOS occurs less than 16B before a 4KB boundary, we don't
   cross to the next page.  */

static inline __m128i
__m128i_strloadu (const unsigned char *p, __m128i zero)
{
  if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
    {
      size_t offset = ((size_t) p & (16 - 1));
      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
      if ((bmsk >> offset) != 0)
        return __m128i_shift_right (a, offset);
    }
  return _mm_loadu_si128 ((__m128i *) p);
}
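
/* For illustration (addresses are hypothetical): with a page ending at
   0x2000, a 16-byte unaligned load from p == 0x1ff8 would read 8 bytes
   past the page boundary.  Here ((size_t) p & 0xfff) == 0xff8 > 0xff0,
   so the slow path loads the aligned 16 bytes at 0x1ff0 instead; if
   that block contains a NUL at or beyond offset 8 (i.e. at or after p),
   the fragment is synthesized by shifting and the next page is never
   touched.  Otherwise the string continues past the boundary, so the
   plain unaligned load is safe.  */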

#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII

/* Similar to __m128i_strloadu.  Convert to lower case for the POSIX/C
   locale and others which have single-byte letters only in the ASCII
   range.  */
static inline __m128i
__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
                          __m128i uchigh, __m128i lcqword)
{
  __m128i frag = __m128i_strloadu (p, zero);

  /* Compare 'Z' + 1 > byte; inverted way to get a mask for byte <= 'Z'.  */
  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
  /* Compare if bytes are > 'A' - 1.  */
  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
  /* Mask byte == 0xff if the byte is <= 'Z' (r2) and > 'A' - 1 (r1).  */
  __m128i mask = _mm_and_si128 (r2, r1);
  /* OR in the lowercase bit (0x20) for the bytes whose mask byte is 0xff.  */
  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
}
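
/* Worked example: 'G' is 0x47; it is greater than 0x40 ('A' - 1) and
   less than 0x5b ('Z' + 1), so its mask byte is 0xff and the OR with
   0x20 produces 0x67, i.e. 'g'.  A non-letter such as '7' (0x37) fails
   the range check, gets a zero mask byte, and passes through unchanged.
   Bytes with the high bit set compare as negative and also fall outside
   the range, which is why this path is ASCII-only.  */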

#endif

/* Calculate the Knuth-Morris-Pratt (KMP) string-searching algorithm
   overlap for a fully populated 16B vector.
   Input parameter: the 1st 16 bytes loaded from the reference string of
   a strstr call.
   We don't use the KMP algorithm if the reference string is shorter
   than 16B.  */
static int
__inline__ __attribute__ ((__always_inline__))
KMP16Bovrlap (__m128i s2)
{
  /* Broadcast byte 0 of s2 into all 16 lanes of a.  */
  __m128i b = _mm_unpacklo_epi8 (s2, s2);
  __m128i a = _mm_unpacklo_epi8 (b, b);
  a = _mm_shuffle_epi32 (a, 0);
  /* b[i] = s2 byte i + 1; the last lane becomes 0.  */
  b = _mm_srli_si128 (s2, sizeof (char));
  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));

  /* _BitScanForward (&k1, bmsk);  */
  int k1;
  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
  if (!bmsk)
    return 16;
  else if (bmsk == 0x7fff)
    return 1;
  else if (!k1)
    {
      /* There are at least two distinct chars in s2.  If bytes 0 and 1
         are identical and the distinct value lies farther down, we can
         deduce that the next byte offset to restart a full compare is
         no earlier than byte 3.  */
      return 3;
    }
  else
    {
      /* Byte 1 differs from byte 0.  */
      return k1 + 1;
    }
}
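
/* Illustrative return values: if s2 begins "abcdefghijklmnop" with no
   further 'a' in the first 16 bytes, bmsk is 0 and the overlap is 16;
   if s2 begins with sixteen identical bytes, bmsk is 0x7fff and the
   overlap is 1; if s2 begins "abcabc...", the first repeat of byte 0 is
   at offset 3, so k1 == 2 and the overlap is k1 + 1 == 3.  */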

char *
__attribute__ ((section (".text.sse4.2")))
STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
{
#define p1 s1
  const unsigned char *p2 = s2;

#ifndef STRCASESTR_NONASCII
  if (__builtin_expect (p2[0] == '\0', 0))
    return (char *) p1;

  if (__builtin_expect (p1[0] == '\0', 0))
    return NULL;

  /* Check if p1 is only 1 byte long.  */
  if (__builtin_expect (p1[1] == '\0', 0))
    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
#endif

#ifdef USE_AS_STRCASESTR
# ifndef STRCASESTR_NONASCII
  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
                        != 0, 0))
    return __strcasestr_sse42_nonascii (s1, s2);

  const __m128i uclow = _mm_set1_epi8 (0x40);
  const __m128i uchigh = _mm_set1_epi8 (0x5b);
  const __m128i lcqword = _mm_set1_epi8 (0x20);
  const __m128i zero = _mm_setzero_si128 ();
#  define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
# else
#  define strloadu __m128i_strloadu_tolower
#  define zero _mm_setzero_si128 ()
# endif
#else
# define strloadu(p) __m128i_strloadu (p, zero)
  const __m128i zero = _mm_setzero_si128 ();
#endif

  /* p1 is > 1 byte long.  Load up to 16 bytes of fragment.  */
  __m128i frag1 = strloadu (p1);

  __m128i frag2;
  if (p2[1] != '\0')
    /* p2 is > 1 byte long.  */
    frag2 = strloadu (p2);
  else
    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);

  /* Unsigned bytes, equal order, does frag2 have a null?  */
  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
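  /* Here cmp_c (CFlag) reports whether an ordered, possibly partial,
     match of frag2 was found inside frag1; cmp (ECX) is the offset of
     the least significant such match, or 16 if there is none; cmp_z
     (ZFlag) reports a null byte in frag1, and cmp_s (SFlag) a null
     byte in frag2.  */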
  if (cmp_s & cmp_c)
    {
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
      int len;
      __asm ("bsfl %[bmsk], %[len]"
             : [len] "=r" (len) : [bmsk] "r" (bmsk));
      p1 += cmp;
      if ((len + cmp) <= 16)
        return (char *) p1;

      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      if ((len + cmp) <= 16)
        return (char *) p1 + cmp;
    }

  if (cmp_s)
    {
      /* Adjust addr for 16B alignment in the ensuing loop.  */
      while (!cmp_z)
        {
          p1 += cmp;
          /* Load up to 16 bytes of fragment.  */
          frag1 = strloadu (p1);
          cmp = _mm_cmpistri (frag2, frag1, 0x0c);
          cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
          cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
          /* Because s2 < 16 bytes and we adjusted p1 by a non-zero cmp
             once already, this time cmp will be zero and we can exit.  */
          if ((!cmp) & cmp_c)
            break;
        }

      if (!cmp_c)
        return NULL;

      /* Since s2 is less than 16 bytes, cmp_c is the definitive
         determination of a full match.  */
      return (char *) p1 + cmp;
    }

  /* General case, s2 is at least 16 bytes long.
     First, the common case of a false match at the first byte of p2.  */
  const unsigned char *pt = NULL;
  int kmp_fwd = 0;
re_trace:
  while (!cmp_c)
    {
      /* frag1 has a null.  */
      if (cmp_z)
        return NULL;

      /* frag1 has no null, advance 16 bytes.  */
      p1 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      /* Unsigned bytes, equal order, is there a partial match?  */
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
    }

  /* Next, handle an initial positive match at the first byte of p2.  We
     have a partial fragment match; make the full determination until we
     reach the end of s2.  */
  if (!cmp)
    {
      if (cmp_z)
        return (char *) p1;

      pt = p1;
      p1 += 16;
      p2 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag2 = strloadu (p2);
    }
  else
    {
      /* Adjust 16B alignment.  */
      p1 += cmp;
      pt = p1;
    }

  /* Load up to 16 bytes of fragment.  */
  frag1 = strloadu (p1);

  /* Unsigned bytes, equal order, does frag2 have a null?  */
  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
  while (!(cmp | cmp_z | cmp_s))
    {
      p1 += 16;
      p2 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag2 = strloadu (p2);
      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      /* Unsigned bytes, equal order, does frag2 have a null?  */
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
    }

  /* Full determination yielded a false result; retrace s1 to the next
     starting position.
     Zflg    1      0      1           0/1
     Sflg    0      1      1           0/1
     cmp     na     0      0           >0
     action  done   done   continue    continue if s2 < s1
             false  match  retrace s1  else false
   */

  if (cmp_s & !cmp)
    return (char *) pt;
  if (cmp_z)
    {
      if (!cmp_s)
        return NULL;

      /* Handle both the zero and sign flags being set with s1 shorter
         in length.  */
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
      int len;
      int len1;
      __asm ("bsfl %[bmsk], %[len]"
             : [len] "=r" (len) : [bmsk] "r" (bmsk));
      __asm ("bsfl %[bmsk1], %[len1]"
             : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
      if (len >= len1)
        return NULL;
    }
  else if (!cmp)
    return (char *) pt;

  /* Otherwise, we have to retrace and continue.  This is the default for
     the multiple paths that need to retrace from the next byte in s1.  */
  p2 = s2;
  frag2 = strloadu (p2);

  if (!kmp_fwd)
    kmp_fwd = KMP16Bovrlap (frag2);

  /* The overlap predicted by the KMP algorithm needs to be corrected
     for the partial fragment compare.  */
  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);

  /* Since s2 is at least 16 bytes long, we're certain there is no
     match.  */
  if (p1[0] == '\0')
    return NULL;

  /* Load up to 16 bytes of fragment.  */
  frag1 = strloadu (p1);

  /* Unsigned bytes, equal order, is there a partial match?  */
  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  goto re_trace;
}