]>
Commit | Line | Data |
---|---|---|
06e51c8f | 1 | /* strcspn with SSE4.2 intrinsics |
bfff8b1b | 2 | Copyright (C) 2009-2017 Free Software Foundation, Inc. |
06e51c8f L |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
06e51c8f L |
19 | |
20 | #include <nmmintrin.h> | |
21 | #include <string.h> | |
73f27d5e | 22 | #include "varshift.h" |
06e51c8f L |
23 | |
24 | /* We use 0x2: | |
25 | _SIDD_SBYTE_OPS | |
26 | | _SIDD_CMP_EQUAL_ANY | |
27 | | _SIDD_POSITIVE_POLARITY | |
28 | | _SIDD_LEAST_SIGNIFICANT | |
29 | on pcmpistri to compare xmm/mem128 | |
30 | ||
31 | 0 1 2 3 4 5 6 7 8 9 A B C D E F | |
32 | X X X X X X X X X X X X X X X X | |
33 | ||
34 | against xmm | |
35 | ||
36 | 0 1 2 3 4 5 6 7 8 9 A B C D E F | |
37 | A A A A A A A A A A A A A A A A | |
38 | ||
39 | to find out whether the first 16-byte data element contains any byte A | |
40 | and, if so, the offset of the first such byte. There are 3 cases: | |
41 | ||
42 | 1. The first 16-byte data element has the byte A at the offset X. | |
43 | 2. The first 16-byte data element has EOS and doesn't have the byte A. | |
44 | 3. The first 16-byte data element has no EOS and doesn't have the byte A. | |
45 | ||
46 | Here is the table of case, ECX, CFlag, ZFlag and SFlag for the 3 cases: | |
47 | ||
48 | 1 X 1 0/1 0 | |
49 | 2 16 0 1 0 | |
50 | 3 16 0 0 0 | |
51 | ||
52 | We exit from the loop for cases 1 and 2 with jbe which branches | |
53 | when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset | |
54 | X for case 1. */ | |
55 | ||
56 | #ifndef STRCSPN_SSE2 | |
cea43295 UD |
57 | # define STRCSPN_SSE2 __strcspn_sse2 |
58 | # define STRCSPN_SSE42 __strcspn_sse42 | |
59 | #endif | |
60 | ||
61 | #ifdef USE_AS_STRPBRK | |
62 | # define RETURN(val1, val2) return val1 | |
63 | #else | |
64 | # define RETURN(val1, val2) return val2 | |
06e51c8f L |
65 | #endif |
66 | ||
67 | extern | |
68 | #ifdef USE_AS_STRPBRK | |
69 | char * | |
70 | #else | |
71 | size_t | |
72 | #endif | |
7550717e | 73 | STRCSPN_SSE2 (const char *, const char *) attribute_hidden; |
06e51c8f | 74 | |
cea43295 | 75 | |
06e51c8f L |
76 | #ifdef USE_AS_STRPBRK |
77 | char * | |
78 | #else | |
79 | size_t | |
80 | #endif | |
81 | __attribute__ ((section (".text.sse4.2"))) | |
82 | STRCSPN_SSE42 (const char *s, const char *a) | |
83 | { | |
06e51c8f | 84 | if (*a == 0) |
cea43295 | 85 | RETURN (NULL, strlen (s)); |
06e51c8f | 86 | |
cea43295 UD |
87 | const char *aligned; |
88 | __m128i mask; | |
89 | int offset = (int) ((size_t) a & 15); | |
06e51c8f L |
90 | if (offset != 0) |
91 | { | |
92 | /* Load masks. */ | |
02cea471 | 93 | aligned = (const char *) ((size_t) a & -16L); |
cea43295 | 94 | __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); |
06e51c8f | 95 | |
73f27d5e | 96 | mask = __m128i_shift_right (mask0, offset); |
06e51c8f L |
97 | |
98 | /* Find where the NUL terminator is. */ | |
cea43295 | 99 | int length = _mm_cmpistri (mask, mask, 0x3a); |
06e51c8f L |
100 | if (length == 16 - offset) |
101 | { | |
102 | /* There is no NUL terminator. */ | |
cea43295 UD |
103 | __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); |
104 | int index = _mm_cmpistri (mask1, mask1, 0x3a); | |
06e51c8f L |
105 | length += index; |
106 | ||
107 | /* Don't use SSE4.2 if the length of A > 16. */ | |
108 | if (length > 16) | |
109 | return STRCSPN_SSE2 (s, a); | |
110 | ||
111 | if (index != 0) | |
112 | { | |
73f27d5e RH |
113 | /* Combine mask0 and mask1. We could play games with |
114 | palignr, but frankly this data should be in L1 now | |
115 | so do the merge via an unaligned load. */ | |
116 | mask = _mm_loadu_si128 ((__m128i *) a); | |
06e51c8f L |
117 | } |
118 | } | |
119 | } | |
120 | else | |
121 | { | |
122 | /* A is aligned. */ | |
123 | mask = _mm_load_si128 ((__m128i *) a); | |
124 | ||
125 | /* Find where the NUL terminator is. */ | |
cea43295 | 126 | int length = _mm_cmpistri (mask, mask, 0x3a); |
06e51c8f L |
127 | if (length == 16) |
128 | { | |
129 | /* There is no NUL terminator. Don't use SSE4.2 if the length | |
130 | of A > 16. */ | |
131 | if (a[16] != 0) | |
132 | return STRCSPN_SSE2 (s, a); | |
133 | } | |
134 | } | |
135 | ||
136 | offset = (int) ((size_t) s & 15); | |
137 | if (offset != 0) | |
138 | { | |
139 | /* Check partial string. */ | |
02cea471 | 140 | aligned = (const char *) ((size_t) s & -16L); |
cea43295 | 141 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
06e51c8f | 142 | |
73f27d5e | 143 | value = __m128i_shift_right (value, offset); |
06e51c8f | 144 | |
cea43295 | 145 | int length = _mm_cmpistri (mask, value, 0x2); |
06e51c8f | 146 | /* No need to check ZFlag since ZFlag is always 1. */ |
cea43295 | 147 | int cflag = _mm_cmpistrc (mask, value, 0x2); |
06e51c8f | 148 | if (cflag) |
cea43295 | 149 | RETURN ((char *) (s + length), length); |
06e51c8f | 150 | /* Find where the NUL terminator is. */ |
cea43295 | 151 | int index = _mm_cmpistri (value, value, 0x3a); |
06e51c8f | 152 | if (index < 16 - offset) |
cea43295 | 153 | RETURN (NULL, index); |
06e51c8f L |
154 | aligned += 16; |
155 | } | |
156 | else | |
157 | aligned = s; | |
158 | ||
cea43295 | 159 | while (1) |
06e51c8f | 160 | { |
cea43295 UD |
161 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
162 | int index = _mm_cmpistri (mask, value, 0x2); | |
163 | int cflag = _mm_cmpistrc (mask, value, 0x2); | |
164 | int zflag = _mm_cmpistrz (mask, value, 0x2); | |
165 | if (cflag) | |
166 | RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); | |
167 | if (zflag) | |
168 | RETURN (NULL, | |
169 | /* Find where the NUL terminator is. */ | |
170 | (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); | |
171 | aligned += 16; | |
06e51c8f | 172 | } |
06e51c8f | 173 | } |