/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   size_t [r3] strlen (const void *s [r3])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */

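/* Note on safety: every vector load below is 16B aligned (lvx ignores
   the low address bits, and r4 is explicitly aligned), and an aligned
   16B or 64B access never crosses a 64B boundary.  Since page sizes
   are multiples of 64B, reading past the terminating null this way
   always stays within a mapped page.  */
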
.machine power9
ENTRY_TOCLESS (STRLEN, 4)
        CALL_MCOUNT 2

        vspltisb  v18,0
        vspltisb  v19,-1

        neg       r5,r3
        rldicl    r9,r5,0,60    /* How many bytes to get source 16B aligned?  */
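
        /* E.g. if r3 ends in 0x7, r5 = -r3 ends in 0x9 and rldicl keeps
           only its low 4 bits, so r9 = 9, the distance to the next 16B
           boundary; r9 = 0 when r3 is already 16B aligned.  */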

        /* Align data and fill bytes not loaded with a non-matching char.  */
        lvx       v0,0,r3
        lvsr      v1,0,r3
        vperm     v0,v19,v0,v1
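
        /* lvx ignores the low 4 bits of the address, so this is an
           aligned load of the 16B block holding the string start.  The
           lvsr/vperm pair shifts that block so the string's first byte
           is the first byte vctzlsbb will count, with the vacated bytes
           filled from v19 (0xff), which can never compare equal to the
           zero byte we search for.  */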

        vcmpequb. v6,v0,v18
        beq       cr6,L(aligned)

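        /* A null was found in the first 16B: vctzlsbb returns the
           number of bytes, in string order, before the first byte of v6
           with its least-significant bit set, i.e. the index of the
           first null, which is exactly the length.  */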
        vctzlsbb  r3,v6
        blr

        /* Test 64B, 16B at a time.  The 64B vector loop is optimized
           for longer strings.  Likewise, we check a multiple of 64B to
           avoid breaking the alignment calculation below.  */
L(aligned):
        add       r4,r3,r9
        rldicl.   r5,r4,60,62   /* Determine the number of 48B loops needed
                                   for alignment to 64B.  And test for zero.  */

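        /* Why 48B steps reach 64B alignment: r5 = (r4 >> 4) & 3 is the
           number of 16B blocks by which r4 sits past a 64B boundary,
           and adding 48 is the same as subtracting 16 modulo 64.  E.g.
           for r4 % 64 == 0x30, r5 = 3 and 0x30 + 3*48 = 0xc0, a
           multiple of 64.  */
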
        lxv       v0+32,0(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail1)

        lxv       v0+32,16(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail2)

        lxv       v0+32,32(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail3)

        lxv       v0+32,48(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail4)
        addi      r4,r4,64

        /* Speculatively generate a fake 16B aligned address to generate
           the vector byte constant 0,1,..,15 using lvsl during
           reduction.  */
        li        r0,0

        /* Skip the alignment if already 64B aligned.  */
        beq       L(loop_64b)
        mtctr     r5

        /* Test 48B per iteration until 64B aligned.  */
        .p2align 5
L(loop):
        lxv       v0+32,0(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail1)

        lxv       v0+32,16(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail2)

        lxv       v0+32,32(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail3)

        addi      r4,r4,48
        bdnz      L(loop)

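        /* Main 64B loop, unrolled three times.  vminub folds four 16B
           blocks into one: the unsigned byte minimum is zero in a given
           position iff at least one input holds a null byte there, so a
           single vcmpequb. tests all 64 bytes per iteration.  */
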
        .p2align 5
L(loop_64b):
        lxv       v1+32,0(r4)   /* Load 4 quadwords.  */
        lxv       v2+32,16(r4)
        lxv       v3+32,32(r4)
        lxv       v4+32,48(r4)
        vminub    v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub    v6,v3,v4
        vminub    v7,v5,v6
        vcmpequb. v7,v7,v18     /* Check for NULLs.  */
        addi      r4,r4,64      /* Adjust address for the next iteration.  */
        bne       cr6,L(vmx_zero)

        lxv       v1+32,0(r4)   /* Load 4 quadwords.  */
        lxv       v2+32,16(r4)
        lxv       v3+32,32(r4)
        lxv       v4+32,48(r4)
        vminub    v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub    v6,v3,v4
        vminub    v7,v5,v6
        vcmpequb. v7,v7,v18     /* Check for NULLs.  */
        addi      r4,r4,64      /* Adjust address for the next iteration.  */
        bne       cr6,L(vmx_zero)

        lxv       v1+32,0(r4)   /* Load 4 quadwords.  */
        lxv       v2+32,16(r4)
        lxv       v3+32,32(r4)
        lxv       v4+32,48(r4)
        vminub    v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub    v6,v3,v4
        vminub    v7,v5,v6
        vcmpequb. v7,v7,v18     /* Check for NULLs.  */
        addi      r4,r4,64      /* Adjust address for the next iteration.  */
        beq       cr6,L(loop_64b)

L(vmx_zero):
        /* OK, we found a null byte.  Let's look for it in the current
           64-byte block and mark it in its corresponding VR.  */
        vcmpequb  v1,v1,v18
        vcmpequb  v2,v2,v18
        vcmpequb  v3,v3,v18
        vcmpequb  v4,v4,v18

        /* We will now 'compress' the result into a single doubleword,
           so it can be moved to a GPR for the final calculation.  First,
           we generate an appropriate mask for vbpermq, so we can permute
           bits into the first halfword.  */
        vspltisb  v10,3
        lvsl      v11,0,r0
        vslb      v10,v11,v10
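
        /* lvsl with the zero address in r0 yields the byte sequence
           0,1,..,15, and vslb shifts each byte left by 3, giving
           0,8,16,..,120: the bit index of the first bit of every byte,
           which is the selector layout vbpermq expects.  */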

        /* Permute the first bit of each byte into bits 48-63.  */
        vbpermq   v1,v1,v10
        vbpermq   v2,v2,v10
        vbpermq   v3,v3,v10
        vbpermq   v4,v4,v10

        /* Shift each component into its correct position for merging.  */
        vsldoi    v2,v2,v2,2
        vsldoi    v3,v3,v3,4
        vsldoi    v4,v4,v4,6
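
        /* Each vbpermq result is a 16-bit mask with one bit per byte of
           its 16B block.  The rotations above place the four masks in
           distinct halfwords, so after the merge the doubleword moved to
           r10 has bit i set exactly when byte i of the 64B block is
           null.  */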

        /* Merge the results and move to a GPR.  */
        vor       v1,v2,v1
        vor       v2,v3,v4
        vor       v4,v1,v2
        mfvrd     r10,v4

        /* Adjust address to the beginning of the current 64-byte
           block.  */
        addi      r4,r4,-64

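        /* Length = (start of this 64B block - string start) + index of
           the first null within the block, i.e. the count of trailing
           zero bits of the mask in r10.  */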
        cnttzd    r0,r10        /* Count trailing zeros before the match.  */
        subf      r5,r3,r4
        add       r3,r5,r0      /* Compute final length.  */
        blr

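        /* Tail handlers.  On entry r4 holds the base of the group of
           16B blocks being tested and v6 the compare result for the
           block containing the null.  vctzlsbb gives the null's index
           within that block; each L(tailN) adds the block's offset
           (0/16/32/48) within the group, and subtracting the original
           pointer still in r3 yields the length.  */
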
L(tail1):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        subf      r3,r3,r4
        blr

L(tail2):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        addi      r4,r4,16
        subf      r3,r3,r4
        blr

L(tail3):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        addi      r4,r4,32
        subf      r3,r3,r4
        blr

L(tail4):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        addi      r4,r4,48
        subf      r3,r3,r4
        blr

END (STRLEN)

#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif