/* Optimized strncmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t n [r5])

   The implementation uses unaligned doubleword accesses to avoid
   specialized code paths depending on data alignment for the first 32
   bytes, and uses vectorised loops after that.  */
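
/* For reference, the semantics implemented here match this plain C
   version (an illustrative sketch only, not how the routine works):

     int strncmp (const char *s1, const char *s2, size_t n)
     {
       for (; n > 0; s1++, s2++, n--)
         {
           if (*s1 != *s2)
             return (unsigned char) *s1 - (unsigned char) *s2;
           if (*s1 == '\0')
             return 0;
         }
       return 0;
     }  */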

#ifndef STRNCMP
# define STRNCMP strncmp
#endif

/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

#define VEXTUBRX(t,a,b) .long (0x1000070d \
                | ((t)<<(32-11)) \
                | ((a)<<(32-16)) \
                | ((b)<<(32-21)) )

#define VCMPNEZB(t,a,b) .long (0x10000507 \
                | ((t)<<(32-11)) \
                | ((a)<<(32-16)) \
                | ((b)<<(32-21)) )

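/* The macros above hand-assemble VX-form instructions: primary opcode
   4 in bits 0:5, VRT in bits 6:10, VRA in 11:15, VRB in 16:20, and the
   extended opcode in bits 21:31.  A C sketch of the same packing
   (vx_encode is a hypothetical helper, shown only for illustration):

     #include <stdint.h>

     static uint32_t
     vx_encode (uint32_t xo, uint32_t vrt, uint32_t vra, uint32_t vrb)
     {
       return (4u << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo;
     }

   so vx_encode (0x507, t, a, b) reproduces VCMPNEZB (t, a, b).  */
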
/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
#define GET16BYTES(reg1, reg2, reg3) \
        lvx reg1, 0, reg2; \
        vperm v8, v2, reg1, reg3; \
        vcmpequb. v8, v0, v8; \
        beq cr6, 1f; \
        vspltisb v9, 0; \
        b 2f; \
        .align 4; \
1: \
        cmplw cr6, r5, r11; \
        ble cr6, 2f; \
        addi r6, reg2, 16; \
        lvx v9, 0, r6; \
2: \
        vperm reg1, v9, reg1, reg3;

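/* In C terms GET16BYTES is roughly the following sketch (load16 and
   shuffle are hypothetical stand-ins for lvx and vperm):

     head = load16 (p & ~15);                 // aligned block holding p
     if (head has a '\0' at or after p        // vcmpequb. against v0
         || n <= bytes_left_in_block)         // cmplw r5, r11
       tail = zeros;                          // skip the second load
     else
       tail = load16 ((p & ~15) + 16);
     reg1 = shuffle (head, tail, lvsr_mask);  // vperm

   Loading the second block only when it is really needed keeps a read
   near the end of a string from faulting on an unmapped next page.  */
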
/* TODO: change this to .machine power9 when minimum binutils
   is upgraded to 2.27.  */
        .machine power7
ENTRY_TOCLESS (STRNCMP, 4)
        /* Check if size is 0.  */
        cmpdi cr0, r5, 0
        beq cr0, L(ret0)
        li r0, 0

        /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
           the code:

            (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

           with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
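        /* rldicl rN, rM, 0, 52 below clears the upper 52 bits, i.e. it
           computes rM % 4096, so the two checks are exactly the C
           expression above applied to s1 and s2.  */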
        rldicl r8, r3, 0, 52
        cmpldi cr7, r8, 4096-32
        bgt cr7, L(pagecross)
        rldicl r9, r4, 0, 52
        cmpldi cr7, r9, 4096-32
        bgt cr7, L(pagecross)

        /* For short strings up to 32 bytes, load both s1 and s2 using
           unaligned dwords and compare.  */

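        /* The dword checks below use the cmpb trick: cmpb sets each
           result byte to 0xff where the corresponding source bytes are
           equal and to 0x00 where they differ.  One 8-byte step in C
           (byte_mask_eq is a hypothetical stand-in for cmpb):

             uint64_t zeroes = byte_mask_eq (w1, 0);  // 0xff where w1 byte == 0
             uint64_t eq = byte_mask_eq (w1, w2);     // 0xff where bytes match
             if ((zeroes | ~eq) != 0)                 // terminator or mismatch
               goto different;  */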
        ld r7, 0(r3)
        ld r9, 0(r4)
        li r8, 0
        cmpb r8, r7, r8
        cmpb r6, r7, r9
        orc. r8, r8, r6
        bne cr0, L(different1)

        /* If the strings compared are equal, but size is less than or
           equal to 8, return 0.  */
        cmpldi cr7, r5, 8
        li r9, 0
        ble cr7, L(ret1)
        addi r5, r5, -8

        ld r7, 8(r3)
        ld r9, 8(r4)
        cmpb r8, r7, r8
        cmpb r6, r7, r9
        orc. r8, r8, r6
        bne cr0, L(different1)
        cmpldi cr7, r5, 8
        mr r9, r8
        ble cr7, L(ret1)
        /* Update pointers and size.  */
        addi r5, r5, -8
        addi r3, r3, 16
        addi r4, r4, 16

        ld r7, 0(r3)
        ld r9, 0(r4)
        li r8, 0
        cmpb r8, r7, r8
        cmpb r6, r7, r9
        orc. r8, r8, r6
        bne cr0, L(different1)
        cmpldi cr7, r5, 8
        li r9, 0
        ble cr7, L(ret1)
        addi r5, r5, -8

        ld r7, 8(r3)
        ld r9, 8(r4)
        cmpb r8, r7, r8
        cmpb r6, r7, r9
        orc. r8, r8, r6
        bne cr0, L(different1)
        cmpldi cr7, r5, 8
        mr r9, r8
        ble cr7, L(ret1)

        /* Update pointers and size.  */
        addi r5, r5, -8
        addi r3, r3, 16
        addi r4, r4, 16
L(align):
        /* Now that the first 32 bytes have been checked, align s1 to a
           quadword boundary and adjust the s2 address accordingly.  */
        vspltisb v0, 0
        vspltisb v2, -1
        or r6, r4, r3
        andi. r6, r6, 0xF
        beq cr0, L(aligned)
        lvsr v6, 0, r4   /* Compute mask.  */
        clrldi r6, r4, 60
        subfic r11, r6, 16
        andi. r6, r3, 0xF
        beq cr0, L(s1_align)
        /* Both s1 and s2 are unaligned.  */
        GET16BYTES(v5, r4, v6)
        lvsr v10, 0, r3   /* Compute mask.  */
        clrldi r6, r3, 60
        subfic r11, r6, 16
        GET16BYTES(v4, r3, v10)
        VCMPNEZB(v7, v5, v4)
        beq cr6, L(match)
        b L(different)

        /* Align s1 to qw and adjust s2 address.  */
        .align 4
L(match):
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        subf r5, r11, r5
        add r3, r3, r11
        add r4, r4, r11
        andi. r11, r4, 0xF
        beq cr0, L(aligned)
        lvsr v6, 0, r4
        clrldi r6, r4, 60
        subfic r11, r6, 16
        /* There are two loops, chosen by the input alignment.  Each
           loop reads 16 bytes from s1 and s2, checks for a null byte,
           and compares them, iterating until a mismatch or a null
           occurs.  */
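        /* One unrolled iteration in C-like terms (a sketch; vcmpnezb
           flags any byte pair that differs or contains a zero byte):

             v4 = load16 (s1);
             v5 = get16bytes (s2);    // GET16BYTES, or plain lvx below
             if (vcmpnezb (v4, v5))   // mismatch or terminator found
               goto different;
             if (n <= 16)
               return 0;              // limit reached, strings equal
             n -= 16;  s1 += 16;  s2 += 16;  */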
L(s1_align):
        lvx v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16

        lvx v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16

        lvx v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16

        lvx v4, 0, r3
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16
        b L(s1_align)
        .align 4
L(aligned):
        lvx v4, 0, r3
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16

        lvx v4, 0, r3
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16

        lvx v4, 0, r3
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16

        lvx v4, 0, r3
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        bne cr6, L(different)
        cmpldi cr7, r5, 16
        ble cr7, L(ret0)
        addi r5, r5, -16
        addi r3, r3, 16
        addi r4, r4, 16
        b L(aligned)
        /* Calculate and return the difference.  */
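        /* In C terms (a sketch): i is the byte index of the first
           flagged lane, counted from the low end:

             i = vctzlsbb (v7);               // first mismatch/terminator
             if (n <= i)
               return 0;                      // difference lies past the limit
             return s1_byte[i] - s2_byte[i];  // vextubrx on v4 and v5  */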
L(different):
        VCTZLSBB(r6, v7)
        cmplw cr7, r5, r6
        ble cr7, L(ret0)
        VEXTUBRX(r5, r6, v4)
        VEXTUBRX(r4, r6, v5)
        subf r3, r4, r5
        extsw r3, r3
        blr

        .align 4
L(ret0):
        li r9, 0
L(ret1):
        mr r3, r9
        blr

        /* At this point r8 holds the mask built from the cmpb/orc
           sequence above, marking the bytes that are null or differ.
           Locate the first such byte, clamp its bit offset to the last
           byte covered by n, then extract and subtract the bytes at
           that position:

          leadzero = (__builtin_ffsl (z1) - 1);
          leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
          r1 = (r1 >> leadzero) & 0xFFUL;
          r2 = (r2 >> leadzero) & 0xFFUL;
          return r1 - r2;  */
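
        /* Note that ffsl is open-coded below: neg/and isolates the
           lowest set bit of the mask, and cntlzd/subfic turns it into
           a bit index, using the identity
           __builtin_ffsl (z) - 1 == 63 - __builtin_clzll (z & -z).  */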

        .align 4
L(different1):
        neg r11, r8
        sldi r5, r5, 3
        and r8, r11, r8
        addi r5, r5, -8
        cntlzd r8, r8
        subfic r8, r8, 63
        extsw r8, r8
        cmpld cr7, r8, r5
        ble cr7, L(different2)
        mr r8, r5
L(different2):
        extsw r8, r8
        srd r7, r7, r8
        srd r9, r9, r8
        rldicl r3, r7, 0, 56
        rldicl r9, r9, 0, 56
        subf r9, r9, r3
        extsw r9, r9
        mr r3, r9
        blr

        /* If an unaligned 16-byte read would cross a 4K page boundary,
           use a simple byte-by-byte comparison until s1 reaches page
           alignment.  */
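        /* Equivalent C for this fallback (a sketch; the actual loop is
           driven by the counter set up below):

             while (n > 0 && s1 has not reached the page boundary)
               {
                 if (*s1 != *s2)
                   return *s1 - *s2;   // L(byte_ne_*)
                 if (*s1 == '\0')
                   return 0;           // terminator found
                 s1++;  s2++;  n--;
               }
             // then resume at L(align) with the vector loops  */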
        .align 4
L(pagecross):
        lbz r7, 0(r3)
        lbz r9, 0(r4)
        subfic r8, r8, 4095
        cmplw cr7, r9, r7
        bne cr7, L(byte_ne_3)
        cmpdi cr7, r9, 0
        beq cr7, L(byte_ne_0)
        addi r5, r5, -1
        subf r7, r8, r5
        subf r9, r7, r5
        addi r9, r9, 1
        mtctr r9
        b L(pagecross_loop1)

        .align 4
L(pagecross_loop0):
        beq cr7, L(ret0)
        lbz r9, 0(r3)
        lbz r8, 0(r4)
        addi r5, r5, -1
        cmplw cr7, r9, r8
        cmpdi cr5, r9, 0
        bne cr7, L(byte_ne_2)
        beq cr5, L(byte_ne_0)
L(pagecross_loop1):
        cmpdi cr7, r5, 0
        addi r3, r3, 1
        addi r4, r4, 1
        bdnz L(pagecross_loop0)
        cmpdi cr7, r7, 0
        li r9, 0
        bne+ cr7, L(align)
        b L(ret1)

        .align 4
L(byte_ne_0):
        li r7, 0
L(byte_ne_1):
        subf r9, r9, r7
        extsw r9, r9
        b L(ret1)

        .align 4
L(byte_ne_2):
        extsw r7, r9
        mr r9, r8
        b L(byte_ne_1)
L(byte_ne_3):
        extsw r7, r7
        b L(byte_ne_1)
END (STRNCMP)
libc_hidden_builtin_def (strncmp)