]>
Commit | Line | Data |
---|---|---|
2b7a8664 | 1 | /* strstr with SSE4.2 intrinsics |
568035b7 | 2 | Copyright (C) 2009-2013 Free Software Foundation, Inc. |
2b7a8664 L |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
2b7a8664 L |
19 | |
20 | #include <nmmintrin.h> | |
73f27d5e | 21 | #include "varshift.h" |
2b7a8664 L |
22 | |
23 | #ifndef STRSTR_SSE42 | |
24 | # define STRSTR_SSE42 __strstr_sse42 | |
25 | #endif | |
26 | ||
27 | #ifdef USE_AS_STRCASESTR | |
28 | # include <ctype.h> | |
a8f895eb | 29 | # include <locale/localeinfo.h> |
2b7a8664 L |
30 | |
31 | # define LOADBYTE(C) tolower (C) | |
ae612b04 | 32 | # define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2)) |
2b7a8664 L |
33 | #else |
34 | # define LOADBYTE(C) (C) | |
35 | # define CMPBYTE(C1, C2) ((C1) == (C2)) | |
36 | #endif | |
37 | ||
38 | /* We use 0xe ordered-compare: | |
39 | _SIDD_SBYTE_OPS | |
40 | | _SIDD_CMP_EQUAL_ORDER | |
41 | | _SIDD_LEAST_SIGNIFICANT | |
42 | on pcmpistri to do the scanning and string comparison requirements of | |
43 | sub-string match. In the scanning phase, we process Cflag and ECX | |
44 | index to locate the first fragment match; once the first fragment | |
45 | match position has been identified, we do comparison of subsequent | |
46 | string fragments until we can conclude a false or true match; when | |
47 | concluding a false match, we may need to repeat the scanning process | |
48 | from next relevant offset in the target string. | |
49 | ||
50 | In the scanning phase we have 4 cases: | |
51 | case ECX CFlag ZFlag SFlag | |
52 | 1 16 0 0 0 | |
53 | 2a 16 0 0 1 | |
54 | 2b 16 0 1 0 | |
55 | 2c 16 0 1 1 | |
56 | ||
57 | 1. No ordered-comparison match, both 16B fragments are valid, so | |
58 | continue to next fragment. | |
59 | 2. No ordered-comparison match, there is EOS in either fragment, | |
60 | 2a. Zflg = 0, Sflg = 1, we continue | |
61 | 2b. Zflg = 1, Sflg = 0, we conclude no match and return. | |
62 | 2c. Zflg = 1, Sflg = 1, length determines match or no match | |
63 | ||
64 | In the string comparison phase, the 1st fragment match is fixed up | |
65 | to produce ECX = 0. Subsequent fragment compare of nonzero index | |
66 | and no match conclude a false match. | |
67 | ||
68 | case ECX CFlag ZFlag SFlag | |
69 | 3 X 1 0 0/1 | |
cc9f2e47 UD |
70 | 4a 0 1 0 0 |
71 | 4b 0 1 0 1 | |
72 | 4c 0 < X 1 0 0/1 | |
73 | 5 16 0 1 0 | |
2b7a8664 L |
74 | |
75 | 3. An initial ordered-comparison fragment match, we fix up to do | |
76 | subsequent string comparison | |
77 | 4a. Continuation of fragment comparison of a string compare. | |
78 | 4b. EOS reached in the reference string, we conclude true match and | |
79 | return | |
80 | 4c. String compare failed if index is nonzero, we need to go back to | |
81 | scanning | |
82 | 5. failed string compare, go back to scanning | |
83 | */ | |
84 | ||
2b7a8664 L |
85 | /* Simple replacement of movdqu to address 4KB boundary cross issue. |
86 | If EOS occurs within less than 16B before 4KB boundary, we don't | |
87 | cross to next page. */ | |
88 | ||
cc9f2e47 | 89 | static inline __m128i |
52e4b9eb | 90 | __m128i_strloadu (const unsigned char * p, __m128i zero) |
2b7a8664 | 91 | { |
52e4b9eb | 92 | if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0)) |
2b7a8664 | 93 | { |
52e4b9eb | 94 | size_t offset = ((size_t) p & (16 - 1)); |
2b7a8664 | 95 | __m128i a = _mm_load_si128 ((__m128i *) (p - offset)); |
2b7a8664 L |
96 | int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero)); |
97 | if ((bmsk >> offset) != 0) | |
98 | return __m128i_shift_right (a, offset); | |
99 | } | |
100 | return _mm_loadu_si128 ((__m128i *) p); | |
101 | } | |
102 | ||
cc9f2e47 | 103 | #if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII |
2b7a8664 L |
104 | |
105 | /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C | |
fd52bc6d UD |
106 | locale and other which have single-byte letters only in the ASCII |
107 | range. */ | |
cc9f2e47 | 108 | static inline __m128i |
52e4b9eb | 109 | __m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow, |
fd52bc6d | 110 | __m128i uchigh, __m128i lcqword) |
2b7a8664 | 111 | { |
52e4b9eb | 112 | __m128i frag = __m128i_strloadu (p, zero); |
2b7a8664 | 113 | |
8e96b93a | 114 | /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */ |
fd52bc6d | 115 | __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag); |
8e96b93a | 116 | /* Compare if bytes are > 'A' - 1. */ |
fd52bc6d | 117 | __m128i r1 = _mm_cmpgt_epi8 (frag, uclow); |
8e96b93a UD |
118 | /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */ |
119 | __m128i mask = _mm_and_si128 (r2, r1); | |
120 | /* Apply lowercase bit 6 mask for above mask bytes == ff. */ | |
fd52bc6d | 121 | return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword)); |
2b7a8664 L |
122 | } |
123 | ||
2b7a8664 L |
124 | #endif |
125 | ||
126 | /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP | |
127 | algorithm) overlap for a fully populated 16B vector. | |
128 | Input parameter: 1st 16Byte loaded from the reference string of a | |
129 | strstr function. | |
cc9f2e47 | 130 | We don't use KMP algorithm if reference string is less than 16B. */ |
2b7a8664 L |
131 | static int |
132 | __inline__ __attribute__ ((__always_inline__,)) | |
133 | KMP16Bovrlap (__m128i s2) | |
134 | { | |
ae612b04 UD |
135 | __m128i b = _mm_unpacklo_epi8 (s2, s2); |
136 | __m128i a = _mm_unpacklo_epi8 (b, b); | |
2b7a8664 L |
137 | a = _mm_shuffle_epi32 (a, 0); |
138 | b = _mm_srli_si128 (s2, sizeof (char)); | |
ae612b04 | 139 | int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a)); |
2b7a8664 L |
140 | |
141 | /* _BitScanForward(&k1, bmsk); */ | |
ae612b04 | 142 | int k1; |
2b7a8664 L |
143 | __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk)); |
144 | if (!bmsk) | |
145 | return 16; | |
146 | else if (bmsk == 0x7fff) | |
147 | return 1; | |
148 | else if (!k1) | |
149 | { | |
cc9f2e47 | 150 | /* There are al least two distinct chars in s2. If byte 0 and 1 are |
2b7a8664 L |
151 | idential and the distinct value lies farther down, we can deduce |
152 | the next byte offset to restart full compare is least no earlier | |
153 | than byte 3. */ | |
154 | return 3; | |
155 | } | |
156 | else | |
157 | { | |
158 | /* Byte 1 is not degenerated to byte 0. */ | |
159 | return k1 + 1; | |
160 | } | |
161 | } | |
162 | ||
163 | char * | |
164 | __attribute__ ((section (".text.sse4.2"))) | |
165 | STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2) | |
166 | { | |
ae612b04 | 167 | #define p1 s1 |
2b7a8664 | 168 | const unsigned char *p2 = s2; |
ae612b04 | 169 | |
cc9f2e47 UD |
170 | #ifndef STRCASESTR_NONASCII |
171 | if (__builtin_expect (p2[0] == '\0', 0)) | |
2b7a8664 L |
172 | return (char *) p1; |
173 | ||
cc9f2e47 | 174 | if (__builtin_expect (p1[0] == '\0', 0)) |
2b7a8664 L |
175 | return NULL; |
176 | ||
177 | /* Check if p1 length is 1 byte long. */ | |
cc9f2e47 | 178 | if (__builtin_expect (p1[1] == '\0', 0)) |
ae612b04 | 179 | return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL; |
cc9f2e47 | 180 | #endif |
2b7a8664 L |
181 | |
182 | #ifdef USE_AS_STRCASESTR | |
d02dc4ba | 183 | # ifndef STRCASESTR_NONASCII |
cc9f2e47 UD |
184 | if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) |
185 | != 0, 0)) | |
186 | return __strcasestr_sse42_nonascii (s1, s2); | |
2b7a8664 | 187 | |
fd52bc6d UD |
188 | const __m128i uclow = _mm_set1_epi8 (0x40); |
189 | const __m128i uchigh = _mm_set1_epi8 (0x5b); | |
190 | const __m128i lcqword = _mm_set1_epi8 (0x20); | |
52e4b9eb UD |
191 | const __m128i zero = _mm_setzero_si128 (); |
192 | # define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword) | |
f6a31e0e AS |
193 | # else |
194 | # define strloadu __m128i_strloadu_tolower | |
52e4b9eb | 195 | # define zero _mm_setzero_si128 () |
f6a31e0e | 196 | # endif |
2b7a8664 | 197 | #else |
52e4b9eb UD |
198 | # define strloadu(p) __m128i_strloadu (p, zero) |
199 | const __m128i zero = _mm_setzero_si128 (); | |
2b7a8664 L |
200 | #endif |
201 | ||
202 | /* p1 > 1 byte long. Load up to 16 bytes of fragment. */ | |
ae612b04 | 203 | __m128i frag1 = strloadu (p1); |
2b7a8664 | 204 | |
ae612b04 UD |
205 | __m128i frag2; |
206 | if (p2[1] != '\0') | |
207 | /* p2 is > 1 byte long. */ | |
208 | frag2 = strloadu (p2); | |
2b7a8664 | 209 | else |
52e4b9eb | 210 | frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0); |
2b7a8664 L |
211 | |
212 | /* Unsigned bytes, equal order, does frag2 has null? */ | |
ae612b04 UD |
213 | int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); |
214 | int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
215 | int cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
216 | int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
2b7a8664 L |
217 | if (cmp_s & cmp_c) |
218 | { | |
52e4b9eb | 219 | int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero)); |
ae612b04 | 220 | int len; |
2b7a8664 L |
221 | __asm ("bsfl %[bmsk], %[len]" |
222 | : [len] "=r" (len) : [bmsk] "r" (bmsk)); | |
223 | p1 += cmp; | |
224 | if ((len + cmp) <= 16) | |
225 | return (char *) p1; | |
ae612b04 UD |
226 | |
227 | /* Load up to 16 bytes of fragment. */ | |
228 | frag1 = strloadu (p1); | |
229 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
230 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
231 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
232 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
233 | if ((len + cmp) <= 16) | |
234 | return (char *) p1 + cmp; | |
2b7a8664 L |
235 | } |
236 | ||
237 | if (cmp_s) | |
238 | { | |
239 | /* Adjust addr for 16B alginment in ensuing loop. */ | |
240 | while (!cmp_z) | |
241 | { | |
242 | p1 += cmp; | |
243 | /* Load up to 16 bytes of fragment. */ | |
244 | frag1 = strloadu (p1); | |
245 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
246 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
247 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
248 | /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp | |
249 | once already, this time cmp will be zero and we can exit. */ | |
250 | if ((!cmp) & cmp_c) | |
251 | break; | |
252 | } | |
253 | ||
254 | if (!cmp_c) | |
255 | return NULL; | |
ae612b04 UD |
256 | |
257 | /* Since s2 is less than 16 bytes, com_c is definitive | |
258 | determination of full match. */ | |
259 | return (char *) p1 + cmp; | |
2b7a8664 L |
260 | } |
261 | ||
262 | /* General case, s2 is at least 16 bytes or more. | |
263 | First, the common case of false-match at first byte of p2. */ | |
ae612b04 UD |
264 | const unsigned char *pt = NULL; |
265 | int kmp_fwd = 0; | |
2b7a8664 L |
266 | re_trace: |
267 | while (!cmp_c) | |
268 | { | |
269 | /* frag1 has null. */ | |
270 | if (cmp_z) | |
271 | return NULL; | |
272 | ||
273 | /* frag 1 has no null, advance 16 bytes. */ | |
274 | p1 += 16; | |
275 | /* Load up to 16 bytes of fragment. */ | |
276 | frag1 = strloadu (p1); | |
277 | /* Unsigned bytes, equal order, is there a partial match? */ | |
278 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
ae612b04 UD |
279 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); |
280 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
2b7a8664 L |
281 | } |
282 | ||
ae612b04 | 283 | /* Next, handle initial positive match as first byte of p2. We have |
2b7a8664 L |
284 | a partial fragment match, make full determination until we reached |
285 | end of s2. */ | |
286 | if (!cmp) | |
287 | { | |
288 | if (cmp_z) | |
289 | return (char *) p1; | |
290 | ||
291 | pt = p1; | |
292 | p1 += 16; | |
293 | p2 += 16; | |
294 | /* Load up to 16 bytes of fragment. */ | |
ae612b04 | 295 | frag2 = strloadu (p2); |
2b7a8664 L |
296 | } |
297 | else | |
298 | { | |
299 | /* Adjust 16B alignment. */ | |
300 | p1 += cmp; | |
301 | pt = p1; | |
302 | } | |
303 | ||
304 | /* Load up to 16 bytes of fragment. */ | |
305 | frag1 = strloadu (p1); | |
306 | ||
307 | /* Unsigned bytes, equal order, does frag2 has null? */ | |
308 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
309 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
310 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
311 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
312 | while (!(cmp | cmp_z | cmp_s)) | |
313 | { | |
314 | p1 += 16; | |
315 | p2 += 16; | |
316 | /* Load up to 16 bytes of fragment. */ | |
317 | frag2 = strloadu (p2); | |
318 | /* Load up to 16 bytes of fragment. */ | |
319 | frag1 = strloadu (p1); | |
320 | /* Unsigned bytes, equal order, does frag2 has null? */ | |
321 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
322 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
323 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
324 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
325 | } | |
326 | ||
ae612b04 | 327 | /* Full determination yielded a false result, retrace s1 to next |
2b7a8664 L |
328 | starting position. |
329 | Zflg 1 0 1 0/1 | |
330 | Sflg 0 1 1 0/1 | |
331 | cmp na 0 0 >0 | |
332 | action done done continue continue if s2 < s1 | |
333 | false match retrace s1 else false | |
334 | */ | |
a8f895eb | 335 | |
ae612b04 | 336 | if (cmp_s & !cmp) |
2b7a8664 | 337 | return (char *) pt; |
ae612b04 | 338 | if (cmp_z) |
2b7a8664 L |
339 | { |
340 | if (!cmp_s) | |
341 | return NULL; | |
342 | ||
343 | /* Handle both zero and sign flag set and s1 is shorter in | |
344 | length. */ | |
ae612b04 UD |
345 | int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2)); |
346 | int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1)); | |
347 | int len; | |
348 | int len1; | |
2b7a8664 L |
349 | __asm ("bsfl %[bmsk], %[len]" |
350 | : [len] "=r" (len) : [bmsk] "r" (bmsk)); | |
351 | __asm ("bsfl %[bmsk1], %[len1]" | |
352 | : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1)); | |
353 | if (len >= len1) | |
354 | return NULL; | |
355 | } | |
356 | else if (!cmp) | |
357 | return (char *) pt; | |
358 | ||
359 | /* Otherwise, we have to retrace and continue. Default of multiple | |
360 | paths that need to retrace from next byte in s1. */ | |
361 | p2 = s2; | |
362 | frag2 = strloadu (p2); | |
363 | ||
364 | if (!kmp_fwd) | |
365 | kmp_fwd = KMP16Bovrlap (frag2); | |
366 | ||
367 | /* KMP algorithm predicted overlap needs to be corrected for | |
368 | partial fragment compare. */ | |
369 | p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd); | |
370 | ||
371 | /* Since s2 is at least 16 bytes long, we're certain there is no | |
372 | match. */ | |
ae612b04 | 373 | if (p1[0] == '\0') |
2b7a8664 | 374 | return NULL; |
ae612b04 UD |
375 | |
376 | /* Load up to 16 bytes of fragment. */ | |
377 | frag1 = strloadu (p1); | |
2b7a8664 L |
378 | |
379 | /* Unsigned bytes, equal order, is there a partial match? */ | |
380 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
381 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
382 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
383 | goto re_trace; | |
384 | } |