]>
Commit | Line | Data |
---|---|---|
2b7a8664 L |
1 | /* strstr with SSE4.2 intrinsics |
2 | Copyright (C) 2009 Free Software Foundation, Inc. | |
3 | Contributed by Intel Corporation. | |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
17 | License along with the GNU C Library; if not, write to the Free | |
18 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
19 | 02111-1307 USA. */ | |
20 | ||
21 | #include <nmmintrin.h> | |
22 | ||
23 | #ifndef STRSTR_SSE42 | |
24 | # define STRSTR_SSE42 __strstr_sse42 | |
25 | #endif | |
26 | ||
27 | #ifdef USE_AS_STRCASESTR | |
28 | # include <ctype.h> | |
29 | # include <locale.h> | |
30 | # include <string.h> | |
31 | ||
32 | # define LOADBYTE(C) tolower (C) | |
33 | # define CMPBYTE(C1, C2) \ | |
34 | ((C1) == (C2) || tolower (C1) == tolower (C2)) | |
35 | #else | |
36 | # define LOADBYTE(C) (C) | |
37 | # define CMPBYTE(C1, C2) ((C1) == (C2)) | |
38 | #endif | |
39 | ||
40 | /* We use 0xe ordered-compare: | |
41 | _SIDD_SBYTE_OPS | |
42 | | _SIDD_CMP_EQUAL_ORDER | |
43 | | _SIDD_LEAST_SIGNIFICANT | |
44 | on pcmpistri to do the scanning and string comparsion requirements of | |
45 | sub-string match. In the scanning phase, we process Cflag and ECX | |
46 | index to locate the first fragment match; once the first fragment | |
47 | match position has been identified, we do comparison of subsequent | |
48 | string fragments until we can conclude false or true match; whe | |
49 | n concluding a false match, we may need to repeat scanning process | |
50 | from next relevant offset in the target string. | |
51 | ||
52 | In the scanning phase we have 4 cases: | |
53 | case ECX CFlag ZFlag SFlag | |
54 | 1 16 0 0 0 | |
55 | 2a 16 0 0 1 | |
56 | 2b 16 0 1 0 | |
57 | 2c 16 0 1 1 | |
58 | ||
59 | 1. No ordered-comparison match, both 16B fragments are valid, so | |
60 | continue to next fragment. | |
61 | 2. No ordered-comparison match, there is EOS in either fragment, | |
62 | 2a. Zflg = 0, Sflg = 1, we continue | |
63 | 2b. Zflg = 1, Sflg = 0, we conclude no match and return. | |
64 | 2c. Zflg = 1, sflg = 1, lenth determine match or no match | |
65 | ||
66 | In the string comparison phase, the 1st fragment match is fixed up | |
67 | to produce ECX = 0. Subsequent fragment compare of nonzero index | |
68 | and no match conclude a false match. | |
69 | ||
70 | case ECX CFlag ZFlag SFlag | |
71 | 3 X 1 0 0/1 | |
72 | 4a 0 1 0 0 | |
73 | 4b 0 1 0 1 | |
74 | 4c 0 < X 1 0 0/1 | |
75 | 5 16 0 1 0 | |
76 | ||
77 | 3. An initial ordered-comparison fragment match, we fix up to do | |
78 | subsequent string comparison | |
79 | 4a. Continuation of fragment comparison of a string compare. | |
80 | 4b. EOS reached in the reference string, we conclude true match and | |
81 | return | |
82 | 4c. String compare failed if index is nonzero, we need to go back to | |
83 | scanning | |
84 | 5. failed string compare, go back to scanning | |
85 | */ | |
86 | ||
87 | /* Fix-up of removal of unneeded data due to 16B aligned load | |
88 | parameters: | |
89 | value: 16B data loaded from 16B aligned address. | |
90 | offset: Offset of target data address relative to 16B aligned load | |
91 | address. | |
92 | */ | |
93 | ||
94 | static __inline__ __m128i | |
95 | __m128i_shift_right (__m128i value, int offset) | |
96 | { | |
97 | switch (offset) | |
98 | { | |
99 | case 1: | |
100 | value = _mm_srli_si128 (value, 1); | |
101 | break; | |
102 | case 2: | |
103 | value = _mm_srli_si128 (value, 2); | |
104 | break; | |
105 | case 3: | |
106 | value = _mm_srli_si128 (value, 3); | |
107 | break; | |
108 | case 4: | |
109 | value = _mm_srli_si128 (value, 4); | |
110 | break; | |
111 | case 5: | |
112 | value = _mm_srli_si128 (value, 5); | |
113 | break; | |
114 | case 6: | |
115 | value = _mm_srli_si128 (value, 6); | |
116 | break; | |
117 | case 7: | |
118 | value = _mm_srli_si128 (value, 7); | |
119 | break; | |
120 | case 8: | |
121 | value = _mm_srli_si128 (value, 8); | |
122 | break; | |
123 | case 9: | |
124 | value = _mm_srli_si128 (value, 9); | |
125 | break; | |
126 | case 10: | |
127 | value = _mm_srli_si128 (value, 10); | |
128 | break; | |
129 | case 11: | |
130 | value = _mm_srli_si128 (value, 11); | |
131 | break; | |
132 | case 12: | |
133 | value = _mm_srli_si128 (value, 12); | |
134 | break; | |
135 | case 13: | |
136 | value = _mm_srli_si128 (value, 13); | |
137 | break; | |
138 | case 14: | |
139 | value = _mm_srli_si128 (value, 14); | |
140 | break; | |
141 | case 15: | |
142 | value = _mm_srli_si128 (value, 15); | |
143 | break; | |
144 | } | |
145 | return value; | |
146 | } | |
147 | ||
148 | /* Simple replacement of movdqu to address 4KB boundary cross issue. | |
149 | If EOS occurs within less than 16B before 4KB boundary, we don't | |
150 | cross to next page. */ | |
151 | ||
152 | static __m128i | |
153 | __attribute__ ((section (".text.sse4.2"))) | |
154 | __m128i_strloadu (const unsigned char * p) | |
155 | { | |
156 | int offset = ((size_t) p & (16 - 1)); | |
157 | ||
158 | if (offset && (int) ((size_t) p & 0xfff) > 0xff0) | |
159 | { | |
160 | __m128i a = _mm_load_si128 ((__m128i *) (p - offset)); | |
161 | __m128i zero = _mm_setzero_si128 (); | |
162 | int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero)); | |
163 | if ((bmsk >> offset) != 0) | |
164 | return __m128i_shift_right (a, offset); | |
165 | } | |
166 | return _mm_loadu_si128 ((__m128i *) p); | |
167 | } | |
168 | ||
169 | #ifdef USE_AS_STRCASESTR | |
170 | ||
171 | /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C | |
172 | locale. */ | |
173 | ||
174 | static __m128i | |
175 | __attribute__ ((section (".text.sse4.2"))) | |
176 | __m128i_strloadu_tolower_posix (const unsigned char * p) | |
177 | { | |
178 | __m128i frag = __m128i_strloadu (p); | |
179 | ||
180 | /* Convert frag to lower case for POSIX/C locale. */ | |
181 | __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41); | |
182 | __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0); | |
183 | __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44); | |
184 | __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1); | |
185 | mask2 = _mm_sub_epi8 (mask2, u2ldelta); | |
186 | return _mm_blendv_epi8 (frag, mask2, mask1); | |
187 | } | |
188 | ||
189 | /* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C | |
190 | locale. */ | |
191 | ||
192 | static __m128i | |
193 | __attribute__ ((section (".text.sse4.2"))) | |
194 | __m128i_strloadu_tolower (const unsigned char * p) | |
195 | { | |
196 | union | |
197 | { | |
198 | char b[16]; | |
199 | __m128i x; | |
200 | } u; | |
201 | ||
202 | for (int i = 0; i < 16; i++) | |
203 | if (p[i] == 0) | |
204 | { | |
205 | u.b[i] = 0; | |
206 | break; | |
207 | } | |
208 | else | |
209 | u.b[i] = tolower (p[i]); | |
210 | ||
211 | return u.x; | |
212 | } | |
213 | #endif | |
214 | ||
215 | /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP | |
216 | algorithm) overlap for a fully populated 16B vector. | |
217 | Input parameter: 1st 16Byte loaded from the reference string of a | |
218 | strstr function. | |
219 | We don't use KMP algorithm if reference string is less than 16B. | |
220 | */ | |
221 | ||
222 | static int | |
223 | __inline__ __attribute__ ((__always_inline__,)) | |
224 | KMP16Bovrlap (__m128i s2) | |
225 | { | |
226 | __m128i a, b; | |
227 | int bmsk, k1; | |
228 | ||
229 | b = _mm_unpacklo_epi8 (s2, s2); | |
230 | a = _mm_unpacklo_epi8 (b, b); | |
231 | a = _mm_shuffle_epi32 (a, 0); | |
232 | b = _mm_srli_si128 (s2, sizeof (char)); | |
233 | bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a)); | |
234 | ||
235 | /* _BitScanForward(&k1, bmsk); */ | |
236 | __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk)); | |
237 | if (!bmsk) | |
238 | return 16; | |
239 | else if (bmsk == 0x7fff) | |
240 | return 1; | |
241 | else if (!k1) | |
242 | { | |
243 | /* There are al least two ditinct char in s2. If byte 0 and 1 are | |
244 | idential and the distinct value lies farther down, we can deduce | |
245 | the next byte offset to restart full compare is least no earlier | |
246 | than byte 3. */ | |
247 | return 3; | |
248 | } | |
249 | else | |
250 | { | |
251 | /* Byte 1 is not degenerated to byte 0. */ | |
252 | return k1 + 1; | |
253 | } | |
254 | } | |
255 | ||
256 | char * | |
257 | __attribute__ ((section (".text.sse4.2"))) | |
258 | STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2) | |
259 | { | |
260 | int len, len1; | |
261 | const unsigned char *p1 = s1; | |
262 | const unsigned char *p2 = s2; | |
263 | __m128i frag1, frag2, zero; | |
264 | int cmp, cmp_c, cmp_z, cmp_s; | |
265 | int kmp_fwd, bmsk, bmsk1; | |
266 | const unsigned char *pt; | |
267 | ||
268 | if (!p2[0]) | |
269 | return (char *) p1; | |
270 | ||
271 | if (!p1[0]) | |
272 | return NULL; | |
273 | ||
274 | /* Check if p1 length is 1 byte long. */ | |
275 | if (!p1[1]) | |
276 | return !p2[1] && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL; | |
277 | ||
278 | #ifdef USE_AS_STRCASESTR | |
279 | __m128i (*strloadu) (const unsigned char *); | |
280 | const char *used_locale = setlocale (LC_CTYPE, NULL); | |
281 | ||
282 | if (!used_locale | |
283 | || (used_locale[0] == 'C' && used_locale[1] == '\0') | |
284 | || strcmp (used_locale, "POSIX") == 0) | |
285 | strloadu = __m128i_strloadu_tolower_posix; | |
286 | else | |
287 | strloadu = __m128i_strloadu_tolower; | |
288 | #else | |
289 | # define strloadu __m128i_strloadu | |
290 | #endif | |
291 | ||
292 | /* p1 > 1 byte long. Load up to 16 bytes of fragment. */ | |
293 | frag1 = strloadu (p1); | |
294 | ||
295 | if (p2[1]) | |
296 | { | |
297 | /* p2 is > 1 byte long. */ | |
298 | frag2 = strloadu (p2); | |
299 | } | |
300 | else | |
301 | { | |
302 | zero = _mm_setzero_si128 (); | |
303 | frag2 = _mm_insert_epi8 (zero, LOADBYTE(p2[0]), 0); | |
304 | } | |
305 | ||
306 | /* Unsigned bytes, equal order, does frag2 has null? */ | |
307 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
308 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
309 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
310 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
311 | if (cmp_s & cmp_c) | |
312 | { | |
313 | zero = _mm_setzero_si128 (); | |
314 | bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero)); | |
315 | __asm ("bsfl %[bmsk], %[len]" | |
316 | : [len] "=r" (len) : [bmsk] "r" (bmsk)); | |
317 | p1 += cmp; | |
318 | if ((len + cmp) <= 16) | |
319 | return (char *) p1; | |
320 | else | |
321 | { | |
322 | /* Load up to 16 bytes of fragment. */ | |
323 | frag1 = strloadu (p1); | |
324 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
325 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
326 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
327 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
328 | if ((len + cmp) <= 16) | |
329 | return (char *) p1 + cmp; | |
330 | } | |
331 | } | |
332 | ||
333 | if (cmp_s) | |
334 | { | |
335 | /* Adjust addr for 16B alginment in ensuing loop. */ | |
336 | while (!cmp_z) | |
337 | { | |
338 | p1 += cmp; | |
339 | /* Load up to 16 bytes of fragment. */ | |
340 | frag1 = strloadu (p1); | |
341 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
342 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
343 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
344 | /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp | |
345 | once already, this time cmp will be zero and we can exit. */ | |
346 | if ((!cmp) & cmp_c) | |
347 | break; | |
348 | } | |
349 | ||
350 | if (!cmp_c) | |
351 | return NULL; | |
352 | else | |
353 | { | |
354 | /* Since s2 is less than 16 bytes, com_c is definitive | |
355 | determination of full match. */ | |
356 | return (char *) p1 + cmp; | |
357 | } | |
358 | } | |
359 | ||
360 | /* General case, s2 is at least 16 bytes or more. | |
361 | First, the common case of false-match at first byte of p2. */ | |
362 | pt = NULL; | |
363 | kmp_fwd = 0; | |
364 | re_trace: | |
365 | while (!cmp_c) | |
366 | { | |
367 | /* frag1 has null. */ | |
368 | if (cmp_z) | |
369 | return NULL; | |
370 | ||
371 | /* frag 1 has no null, advance 16 bytes. */ | |
372 | p1 += 16; | |
373 | /* Load up to 16 bytes of fragment. */ | |
374 | frag1 = strloadu (p1); | |
375 | /* Unsigned bytes, equal order, is there a partial match? */ | |
376 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
377 | cmp = _mm_cmpistri(frag2, frag1, 0x0c); | |
378 | cmp_z = _mm_cmpistrz(frag2, frag1, 0x0c); | |
379 | } | |
380 | ||
381 | /* Next, handle inital positive match as first byte of p2. We have | |
382 | a partial fragment match, make full determination until we reached | |
383 | end of s2. */ | |
384 | if (!cmp) | |
385 | { | |
386 | if (cmp_z) | |
387 | return (char *) p1; | |
388 | ||
389 | pt = p1; | |
390 | p1 += 16; | |
391 | p2 += 16; | |
392 | /* Load up to 16 bytes of fragment. */ | |
393 | frag2 = strloadu(p2); | |
394 | } | |
395 | else | |
396 | { | |
397 | /* Adjust 16B alignment. */ | |
398 | p1 += cmp; | |
399 | pt = p1; | |
400 | } | |
401 | ||
402 | /* Load up to 16 bytes of fragment. */ | |
403 | frag1 = strloadu (p1); | |
404 | ||
405 | /* Unsigned bytes, equal order, does frag2 has null? */ | |
406 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
407 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
408 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
409 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
410 | while (!(cmp | cmp_z | cmp_s)) | |
411 | { | |
412 | p1 += 16; | |
413 | p2 += 16; | |
414 | /* Load up to 16 bytes of fragment. */ | |
415 | frag2 = strloadu (p2); | |
416 | /* Load up to 16 bytes of fragment. */ | |
417 | frag1 = strloadu (p1); | |
418 | /* Unsigned bytes, equal order, does frag2 has null? */ | |
419 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
420 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
421 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
422 | cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); | |
423 | } | |
424 | ||
425 | /* Full determination yielded false result, retrace s1 to next | |
426 | starting position. | |
427 | Zflg 1 0 1 0/1 | |
428 | Sflg 0 1 1 0/1 | |
429 | cmp na 0 0 >0 | |
430 | action done done continue continue if s2 < s1 | |
431 | false match retrace s1 else false | |
432 | */ | |
433 | ||
434 | if(cmp_s & !cmp) | |
435 | return (char *) pt; | |
436 | else if (cmp_z) | |
437 | { | |
438 | if (!cmp_s) | |
439 | return NULL; | |
440 | ||
441 | /* Handle both zero and sign flag set and s1 is shorter in | |
442 | length. */ | |
443 | zero = _mm_setzero_si128 (); | |
444 | bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2)); | |
445 | bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1)); | |
446 | __asm ("bsfl %[bmsk], %[len]" | |
447 | : [len] "=r" (len) : [bmsk] "r" (bmsk)); | |
448 | __asm ("bsfl %[bmsk1], %[len1]" | |
449 | : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1)); | |
450 | if (len >= len1) | |
451 | return NULL; | |
452 | } | |
453 | else if (!cmp) | |
454 | return (char *) pt; | |
455 | ||
456 | /* Otherwise, we have to retrace and continue. Default of multiple | |
457 | paths that need to retrace from next byte in s1. */ | |
458 | p2 = s2; | |
459 | frag2 = strloadu (p2); | |
460 | ||
461 | if (!kmp_fwd) | |
462 | kmp_fwd = KMP16Bovrlap (frag2); | |
463 | ||
464 | /* KMP algorithm predicted overlap needs to be corrected for | |
465 | partial fragment compare. */ | |
466 | p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd); | |
467 | ||
468 | /* Since s2 is at least 16 bytes long, we're certain there is no | |
469 | match. */ | |
470 | if (!p1[0]) | |
471 | return NULL; | |
472 | else | |
473 | { | |
474 | /* Load up to 16 bytes of fragment. */ | |
475 | frag1 = strloadu (p1); | |
476 | } | |
477 | ||
478 | /* Unsigned bytes, equal order, is there a partial match? */ | |
479 | cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); | |
480 | cmp = _mm_cmpistri (frag2, frag1, 0x0c); | |
481 | cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); | |
482 | goto re_trace; | |
483 | } |