/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* int [r3] strlen (char *s [r3])  */
#ifndef STRLEN
# define STRLEN strlen
#endif
	.machine  power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	/* Register roles on entry/exit (grounded in the code below):
	   r3 = s (string pointer) on entry; the length is returned in r3.
	   r4 = doubleword-aligned cursor, r0 = all-zero doubleword for cmpb,
	   r10/r11 = cmpb results used to locate the null byte.  */
	dcbt	0,r3
	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding (bit offset of s within
				 its doubleword).  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6      /* MASK = MASK << padding (LE byte order).  */
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string:
				 bytes before s read as 0xff, never null.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
	bne	cr7,L(done)

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
	mtcrf	0x01,r4	      /* CR7 gets the low nibble of r4; bit 28 tests
				 quadword (16-byte) alignment.  */
	mr	r9,r4	      /* r9 tracks the address of the NEXT dword to
				 be read (r4 + 8 from here on).  */
	addi	r9,r9,8
	bt	28,L(align64)

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11    /* Non-zero iff either dword holds a null.  */
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r10,r9,63     /* Alignment test on r9 (next read address).  */
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
	andc	r9, r9,r10
	popcntd	r0, r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4      /* Bytes from s to the current doubleword.  */
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0      /* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align  4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9	      /* r4 = 64-byte-aligned read address.  */
	li	r7, 16	      /* Load required offsets.  */
	li	r8, 32
	li	r9, 48
	li	r12, 8
	vxor	v0,v0,v0      /* VR with null chars to use with
				 vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
	.p2align  5
L(loop):
	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed;
				 the min is 0 iff any byte is 0.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs (CR6 set by the dot form).  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb v1,v1,v0
	vcmpequb v2,v2,v0
	vcmpequb v3,v3,v0
	vcmpequb v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  */
	vspltisb v10,3
	lvsl	v11,r0,r0     /* v11 = {0,1,2,...,15} (identity permute).  */
	vslb	v10,v11,v10   /* v10 = {0,8,16,...,120}: bit indices selecting
				 the top bit of each byte for vbpermq.  */

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4	      /* r10 = one bit per byte of the 64-byte block,
				 set where a null byte was found.  */

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
	andc	r9, r9,r10
	popcntd	r0, r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0      /* Compute final length.  Note: no srdi here —
				 each mask bit already corresponds to one
				 byte of the block.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)