/* Optimized strncmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef __LITTLE_ENDIAN__
#include <sysdep.h>

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending on data alignment for the first 32 bytes and uses
   vectorised loops after that.  */

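/* For reference only (not part of the build), the semantics implemented
   here are those of ISO C strncmp; the sketch below is the plain scalar
   behaviour this file accelerates:

   #include <stddef.h>

   static int
   strncmp_reference (const char *s1, const char *s2, size_t n)
   {
     for (size_t i = 0; i < n; i++)
       {
         unsigned char c1 = s1[i], c2 = s2[i];
         if (c1 != c2)
           return c1 - c2;          // first differing byte decides
         if (c1 == '\0')
           return 0;                // common terminator: equal
       }
     return 0;                      // equal within the first n bytes
   }
*/
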
/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
# define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

# define VEXTUBRX(t,a,b) .long (0x1000070d \
        | ((t)<<(32-11))  \
        | ((a)<<(32-16))  \
        | ((b)<<(32-21)) )

# define VCMPNEZB(t,a,b) .long (0x10000507 \
        | ((t)<<(32-11))  \
        | ((a)<<(32-16))  \
        | ((b)<<(32-21)) )

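/* As a worked example of the encoding arithmetic above (illustrative
   only): VCMPNEZB(7, 5, 4) expands to 0x10000507 | 7<<21 | 5<<16 | 4<<11
   = 0x10e52507, i.e. the record form vcmpnezb. v7, v5, v4 whose cr6
   result the branches below test.  A small C check:

   #include <stdint.h>
   #include <stdio.h>

   int main (void)
   {
     uint32_t t = 7, a = 5, b = 4;
     uint32_t insn = 0x10000507u
                     | (t << (32 - 11)) | (a << (32 - 16)) | (b << (32 - 21));
     printf ("%#x\n", insn);        // prints 0x10e52507
     return 0;
   }
*/
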
/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
# define GET16BYTES(reg1, reg2, reg3) \
        lvx     reg1, 0, reg2; \
        vperm   v8, v2, reg1, reg3; \
        vcmpequb.       v8, v0, v8; \
        beq     cr6, 1f; \
        vspltisb        v9, 0; \
        b       2f; \
        .align 4; \
1: \
        cmplw   cr6, r5, r11; \
        ble     cr6, 2f; \
        addi    r6, reg2, 16; \
        lvx     v9, 0, r6; \
2: \
        vperm   reg1, v9, reg1, reg3;

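/* Illustrative C sketch of GET16BYTES (not part of the build): read the
   aligned quadword containing ADDR, and touch the following quadword
   only when no terminator was seen in the valid bytes and the remaining
   length (r5 vs. r11 above) requires it, so the load never strays into
   a page the string does not reach.  memcpy/memchr stand in for the
   lvx/vperm/vcmpequb. sequence and struct v16 is a hypothetical type:

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   typedef struct { unsigned char b[16]; } v16;

   static v16
   get16bytes (const unsigned char *addr, size_t remaining)
   {
     const unsigned char *aligned
       = (const unsigned char *) ((uintptr_t) addr & ~(uintptr_t) 15);
     size_t avail = 16 - (size_t) (addr - aligned);  // bytes left in quadword
     v16 out = { { 0 } };
     memcpy (out.b, addr, avail);                    // first quadword's tail
     if (memchr (out.b, 0, avail) == NULL && remaining > avail)
       memcpy (out.b + avail, aligned + 16, 16 - avail);  // second quadword
     return out;
   }
*/
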
/* TODO: change this to .machine power9 when minimum binutils
   is upgraded to 2.27.  */
        .machine power7
EALIGN (strncmp, 4, 0)
        /* Check if size is 0.  */
        cmpdi   cr0, r5, 0
        beq     cr0, L(ret0)
        li      r0, 0

        /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
           the code:

            (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

           with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
        rldicl  r8, r3, 0, 52
        cmpldi  cr7, r8, 4096-32
        bgt     cr7, L(pagecross)
        rldicl  r9, r4, 0, 52
        cmpldi  cr7, r9, 4096-32
        bgt     cr7, L(pagecross)

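/* A worked example of the check above (illustrative values): for
   s1 = 0x1000ff5, s1 % 4096 = 0xff5 = 4085 > 4064 = 4096 - 32, so a
   32-byte iteration starting at s1 could run past the page and the
   L(pagecross) path is taken:

   #include <stddef.h>
   #include <stdio.h>

   int main (void)
   {
     size_t s1 = 0x1000ff5;                  // hypothetical address
     int cross = (s1 % 4096) > (4096 - 32);  // 4085 > 4064 -> true
     printf ("%d\n", cross);                 // prints 1
     return 0;
   }
*/
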
        /* For short strings up to 32 bytes, load both s1 and s2 using
           unaligned dwords and compare.  */

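/* Illustrative C sketch (not part of the build) of the cmpb/orc test
   used below: cmpb writes 0xff into every byte position where the
   operands' bytes match, so with a zero mask it marks null bytes, and
   the complement of a cmpb against the s2 dword marks mismatches.  The
   OR of the two is nonzero exactly when the dwords differ or contain a
   terminator:

   #include <stdint.h>

   static uint64_t
   cmpb_model (uint64_t a, uint64_t b)       // models the cmpb insn
   {
     uint64_t r = 0;
     for (int i = 0; i < 8; i++)
       if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
         r |= 0xffull << (8 * i);
     return r;
   }

   static int
   dword_has_null_or_diff (uint64_t w1, uint64_t w2)
   {
     uint64_t nulls = cmpb_model (w1, 0);    // cmpb r8, r7, r8
     uint64_t eq = cmpb_model (w1, w2);      // cmpb r6, r7, r9
     return (nulls | ~eq) != 0;              // orc. r8, r8, r6
   }
*/
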
        ld      r7, 0(r3)
        ld      r9, 0(r4)
        li      r8, 0
        cmpb    r8, r7, r8
        cmpb    r6, r7, r9
        orc.    r8, r8, r6
        bne     cr0, L(different1)

        /* If the strings are equal up to this point and the size is less
           than or equal to 8, return 0.  */
        cmpldi  cr7, r5, 8
        li      r9, 0
        ble     cr7, L(ret1)
        addi    r5, r5, -8

        ld      r7, 8(r3)
        ld      r9, 8(r4)
        cmpb    r8, r7, r8
        cmpb    r6, r7, r9
        orc.    r8, r8, r6
        bne     cr0, L(different1)
        cmpldi  cr7, r5, 8
        mr      r9, r8
        ble     cr7, L(ret1)
        /* Update pointers and size.  */
        addi    r5, r5, -8
        addi    r3, r3, 16
        addi    r4, r4, 16

        ld      r7, 0(r3)
        ld      r9, 0(r4)
        li      r8, 0
        cmpb    r8, r7, r8
        cmpb    r6, r7, r9
        orc.    r8, r8, r6
        bne     cr0, L(different1)
        cmpldi  cr7, r5, 8
        li      r9, 0
        ble     cr7, L(ret1)
        addi    r5, r5, -8

        ld      r7, 8(r3)
        ld      r9, 8(r4)
        cmpb    r8, r7, r8
        cmpb    r6, r7, r9
        orc.    r8, r8, r6
        bne     cr0, L(different1)
        cmpldi  cr7, r5, 8
        mr      r9, r8
        ble     cr7, L(ret1)

        /* Update pointers and size.  */
        addi    r5, r5, -8
        addi    r3, r3, 16
        addi    r4, r4, 16
L(align):
        /* Now that the first 32 bytes have been checked, align source1
           to doubleword and adjust the source2 address.  */
        vspltisb        v0, 0
        vspltisb        v2, -1
        or      r6, r4, r3
        andi.   r6, r6, 0xF
        beq     cr0, L(aligned)
        lvsr    v6, 0, r4       /* Compute mask.  */
        clrldi  r6, r4, 60
        subfic  r11, r6, 16
        andi.   r6, r3, 0xF
        beq     cr0, L(s1_align)
        /* Both s1 and s2 are unaligned.  */
        GET16BYTES(v5, r4, v6)
        lvsr    v10, 0, r3      /* Compute mask.  */
        clrldi  r6, r3, 60
        subfic  r11, r6, 16
        GET16BYTES(v4, r3, v10)
        VCMPNEZB(v7, v5, v4)
        beq     cr6, L(match)
        b       L(different)

        /* Align s1 to qw and adjust s2 address.  */
        .align  4
L(match):
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        subf    r5, r11, r5
        add     r3, r3, r11
        add     r4, r4, r11
        andi.   r11, r4, 0xF
        beq     cr0, L(aligned)
        lvsr    v6, 0, r4
        clrldi  r6, r4, 60
        subfic  r11, r6, 16
        /* There are two loops, depending on the input alignment.
           Each loop iteration gets 16 bytes from s1 and s2, checks for
           null and compares them, until a mismatch or null occurs.  */
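/* Illustrative C model of the loop structure below (not part of the
   build).  block_neq_or_null is a hypothetical stand-in for VCMPNEZB
   plus its cr6 test, true when any byte pair mismatches or is null:

   #include <stddef.h>

   static int
   block_neq_or_null (const unsigned char *a, const unsigned char *b)
   {
     for (int i = 0; i < 16; i++)
       if (a[i] != b[i] || a[i] == 0)
         return 1;
     return 0;
   }

   static int
   loop_shape (const unsigned char *s1, const unsigned char *s2, size_t n)
   {
     for (;;)
       {
         if (block_neq_or_null (s1, s2))
           return 1;                    // -> L(different)
         if (n <= 16)
           return 0;                    // -> L(ret0): equal within n
         n -= 16;
         s1 += 16;
         s2 += 16;
       }
   }
*/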
L(s1_align):
        lvx     v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16

        lvx     v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16

        lvx     v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16

        lvx     v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16
        b       L(s1_align)
        .align  4
L(aligned):
        lvx     v4, 0, r3
        lvx     v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16

        lvx     v4, 0, r3
        lvx     v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16

        lvx     v4, 0, r3
        lvx     v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16

        lvx     v4, 0, r3
        lvx     v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne     cr6, L(different)
        cmpldi  cr7, r5, 16
        ble     cr7, L(ret0)
        addi    r5, r5, -16
        addi    r3, r3, 16
        addi    r4, r4, 16
        b       L(aligned)
        /* Calculate and return the difference.  VCTZLSBB gives the byte
           index of the first mismatch or null; if that index is past the
           remaining length, the strings are equal within n bytes.  */
L(different):
        VCTZLSBB(r6, v7)
        cmplw   cr7, r5, r6
        ble     cr7, L(ret0)
        VEXTUBRX(r5, r6, v4)
        VEXTUBRX(r4, r6, v5)
        subf    r3, r4, r5
        extsw   r3, r3
        blr

        .align  4
L(ret0):
        li      r9, 0
L(ret1):
        mr      r3, r9
        blr

        /* r8 now holds the cmpb mask that marks null bytes in r7 and
           bytes where r7 and r9 differ (z1 below).  Find the first marked
           byte, clamp its bit position to the last byte allowed by the
           remaining length n (r5), then extract and subtract that byte
           of r7 (r1) and r9 (r2):

           leadzero = (__builtin_ffsl (z1) - 1);
           leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
           r1 = (r1 >> leadzero) & 0xFFUL;
           r2 = (r2 >> leadzero) & 0xFFUL;
           return r1 - r2;  */

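/* A self-contained C rendering of the pseudocode above (illustrative
   only), with z1 standing for the mask in r8 (nonzero on entry), r1/r2
   for the data dwords in r7/r9, and n for the remaining length in r5.
   Because the cmpb mask sets whole bytes, the bit index below is always
   a multiple of 8:

   #include <stdint.h>

   static int
   different1_model (uint64_t z1, uint64_t r1, uint64_t r2, uint64_t n)
   {
     int leadzero = __builtin_ffsll (z1) - 1;   // lowest marked bit (LE)
     if (leadzero > (int) ((n - 1) * 8))
       leadzero = (n - 1) * 8;                  // clamp to last byte in range
     int b1 = (r1 >> leadzero) & 0xff;
     int b2 = (r2 >> leadzero) & 0xff;
     return b1 - b2;
   }
*/
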
        .align  4
L(different1):
        neg     r11, r8
        sldi    r5, r5, 3
        and     r8, r11, r8
        addi    r5, r5, -8
        cntlzd  r8, r8
        subfic  r8, r8, 63
        extsw   r8, r8
        cmpld   cr7, r8, r5
        ble     cr7, L(different2)
        mr      r8, r5
L(different2):
        extsw   r8, r8
        srd     r7, r7, r8
        srd     r9, r9, r8
        rldicl  r3, r7, 0, 56
        rldicl  r9, r9, 0, 56
        subf    r9, r9, r3
        extsw   r9, r9
        mr      r3, r9
        blr

        /* If an unaligned 16-byte read would cross a 4K page boundary,
           use a simple byte-by-byte comparison until the page alignment
           for s1 is reached.  */
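/* Illustrative C shape of the byte loop below (not part of the build):
   compare at most the bytes up to the next 4K boundary of s1 (and never
   more than n); *done reports whether the comparison was settled here
   or the main path resumes at L(align):

   #include <stddef.h>
   #include <stdint.h>

   static int
   pagecross_model (const unsigned char *s1, const unsigned char *s2,
                    size_t n, int *done)
   {
     size_t to_boundary = 4096 - ((uintptr_t) s1 % 4096);
     size_t m = to_boundary < n ? to_boundary : n;
     *done = 1;
     for (size_t i = 0; i < m; i++)
       {
         if (s1[i] != s2[i])
           return s1[i] - s2[i];        // -> L(byte_ne_2) / L(byte_ne_3)
         if (s1[i] == 0)
           return 0;                    // -> L(byte_ne_0)
       }
     *done = (m == n);                  // else continue at L(align)
     return 0;
   }
*/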
        .align  4
L(pagecross):
        lbz     r7, 0(r3)
        lbz     r9, 0(r4)
        subfic  r8, r8, 4095
        cmplw   cr7, r9, r7
        bne     cr7, L(byte_ne_3)
        cmpdi   cr7, r9, 0
        beq     cr7, L(byte_ne_0)
        addi    r5, r5, -1
        subf    r7, r8, r5
        subf    r9, r7, r5
        addi    r9, r9, 1
        mtctr   r9
        b       L(pagecross_loop1)

        .align  4
L(pagecross_loop0):
        beq     cr7, L(ret0)
        lbz     r9, 0(r3)
        lbz     r8, 0(r4)
        addi    r5, r5, -1
        cmplw   cr7, r9, r8
        cmpdi   cr5, r9, 0
        bne     cr7, L(byte_ne_2)
        beq     cr5, L(byte_ne_0)
L(pagecross_loop1):
        cmpdi   cr7, r5, 0
        addi    r3, r3, 1
        addi    r4, r4, 1
        bdnz    L(pagecross_loop0)
        cmpdi   cr7, r7, 0
        li      r9, 0
        bne+    cr7, L(align)
        b       L(ret1)

        .align  4
L(byte_ne_0):
        li      r7, 0
L(byte_ne_1):
        subf    r9, r9, r7
        extsw   r9, r9
        b       L(ret1)

        .align  4
L(byte_ne_2):
        extsw   r7, r9
        mr      r9, r8
        b       L(byte_ne_1)
L(byte_ne_3):
        extsw   r7, r7
        b       L(byte_ne_1)
END (strncmp)
libc_hidden_builtin_def (strncmp)
#else
#include <sysdeps/powerpc/powerpc64/power8/strncmp.S>
#endif