/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef __LITTLE_ENDIAN__
#include <sysdep.h>

#ifndef STRCMP
# define STRCMP strcmp
#endif

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses for the first
   16 bytes, as in the POWER8 version, and vectorised loops after that.  */

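/* Outline of the code below: if either pointer is within 16 bytes of
   the end of a 4K page, compare byte by byte until both pointers have
   crossed onto their next pages (L(pagecross)).  Otherwise compare the
   first 16 bytes with unaligned doubleword loads, then run one of the
   16-byte vector loops selected by the alignment of s1 and s2
   (L(aligned), L(s1_align), or the fully unaligned path).  */
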
/* TODO: Change these to the actual instructions once the minimum
   required binutils is 2.27.  The macros below encode the newer
   instructions in order to stay compatible with older assemblers.  */
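/* A note on the encodings, assuming the ISA 3.0 VX-form layout (primary
   opcode in bits 0:5, target in bits 6:10, sources in bits 11:15 and
   16:20, extended opcode in the low bits): e.g. VCMPNEZB(v7, v5, v4)
   expands to .long 0x10e52507, i.e. vcmpnezb. v7, v5, v4, which sets a
   byte of v7 to 0xff wherever the corresponding bytes of v5 and v4
   differ or either is zero, and updates cr6 (record form).  vctzlsbb
   counts the trailing bytes of a vector whose least-significant bit is
   zero, and vextubrx extracts the right-indexed byte of a vector into
   a GPR.  */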
# define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

# define VEXTUBRX(t,a,b) .long (0x1000070d \
	| ((t)<<(32-11)) \
	| ((a)<<(32-16)) \
	| ((b)<<(32-21)) )

# define VCMPNEZB(t,a,b) .long (0x10000507 \
	| ((t)<<(32-11)) \
	| ((a)<<(32-16)) \
	| ((b)<<(32-21)) )

/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
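/* The intent of the null check below: lvx can only load 16-byte-aligned
   quadwords, so the first lvx never crosses a page.  The vperm against
   v2 (all ones) exposes just the bytes that belong to the string, and
   vcmpequb. looks for a terminating null among them; the following
   quadword is loaded only when no null was found, so the macro avoids
   reading past the end of the string into a possibly unmapped page.  */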
# define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;

/* TODO: change this to .machine power9 when the minimum required binutils
   allows it.  */

	.machine  power7
ENTRY_TOCLESS (STRCMP, 4)
	li	r0, 0

	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
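	/* For example, an address ending in 0xff8 has page offset 4088;
	   4088 > 4096-16, so a 16-byte load from it would spill onto the
	   next, possibly unmapped, page.  */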

	rldicl	r7, r3, 0, 52	/* r7 = s1 % 4096.  */
	rldicl	r9, r4, 0, 52	/* r9 = s2 % 4096.  */
	cmpldi	cr7, r7, 4096-16
	bgt	cr7, L(pagecross_check)
	cmpldi	cr5, r9, 4096-16
	bgt	cr5, L(pagecross_check)

	/* For short strings up to 16 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */
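	/* cmpb sets a byte of r12 to 0xff where the corresponding byte
	   of r8 is 0x00 (a terminator) and a byte of r11 to 0xff where
	   r8 and r10 agree; orc. computes r12 | ~r11, so r9 is nonzero
	   exactly when some byte differs or a null was seen, letting one
	   branch test both exit conditions.  */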
	ld	r8, 0(r3)
	ld	r10, 0(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 8(r3)
	ld	r10, 8(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	addi	r7, r3, 16
	addi	r4, r4, 16

L(align):
	/* The first 16 bytes have now been checked.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	lvsr	v6, 0, r4	/* Compute mask.  */
	or	r5, r4, r7
	andi.	r5, r5, 0xF	/* Are both s1 and s2 16B aligned?  */
	beq	cr0, L(aligned)
	andi.	r5, r7, 0xF	/* Is only s1 aligned?  */
	beq	cr0, L(s1_align)
	lvsr	v10, 0, r7	/* Compute mask.  */

	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v4, r7, v10)
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to qw and adjust s2 address.  */
	.align 4
L(match):
	clrldi	r6, r7, 60	/* r6 = s1 % 16.  */
	subfic	r5, r6, 16	/* r5 = bytes until s1 is 16B aligned.  */
	add	r7, r7, r5
	add	r4, r4, r5
	andi.	r5, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4	/* Recompute mask for the advanced s2.  */
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2 and compares, unrolled
	   four times; loop until a mismatch or null occurs.  */
L(s1_align):
	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(s1_align)
	b	L(different)

	.align 4
L(aligned):
	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(aligned)

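	/* v7 holds the vcmpnezb result: 0xff in each byte position that
	   differed or held a null.  VCTZLSBB counts the trailing zero
	   bytes of that mask, which on little-endian is the index of the
	   first such position in string order, and VEXTUBRX extracts the
	   byte at that index from each source vector.  */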
	/* Calculate and return the difference.  */
L(different):
	VCTZLSBB(r6, v7)
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

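	/* Worked example for the sequence below: if the first difference
	   is in byte 2 (little-endian), r9 holds 0xff0000; r9 & -r9
	   isolates bit 16, 63 - cntlzd gives 16, and shifting r8 and r10
	   right by 16 moves byte 2 into the low byte, where it is masked
	   off and subtracted.  */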
	.align 4
L(different_nocmpb):
	neg	r3, r9
	and	r9, r9, r3	/* Isolate the lowest set bit of r9.  */
	cntlzd	r9, r9
	subfic	r9, r9, 63	/* r9 = bit index of that bit.  */
	srd	r3, r8, r9	/* Shift the first differing/null byte  */
	srd	r10, r10, r9	/* of each word into the low byte.  */
	rldicl	r10, r10, 0, 56
	rldicl	r3, r3, 0, 56	/* Mask off everything above that byte.  */
	subf	r3, r10, r3
	extsw	r3, r3
	blr

	.align 4
L(pagecross_check):
	subfic	r9, r9, 4096	/* r9 = bytes left on s2's page.  */
	subfic	r7, r7, 4096	/* r7 = bytes left on s1's page.  */
	cmpld	cr7, r7, r9
	bge	cr7, L(pagecross)
	mr	r7, r9		/* r7 = the larger of the two counts.  */

	/* If an unaligned 16-byte read would cross a 4K page boundary,
	   use a simple byte-by-byte comparison until both pointers have
	   moved past their page boundaries.  */
L(pagecross):
	add	r7, r3, r7
	subf	r9, r3, r7	/* r9 = number of bytes to compare.  */
	mtctr	r9

	.align 4
L(pagecross_loop):
	/* Load a byte from s1 and s2, check whether *s1 equals *s2
	   and whether *s1 is '\0'.  */
	lbz	r9, 0(r3)
	lbz	r10, 0(r4)
	addi	r3, r3, 1
	addi	r4, r4, 1
	cmplw	cr7, r9, r10
	cmpdi	cr5, r9, r0
	bne	cr7, L(pagecross_ne)
	beq	cr5, L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align)

	.align 4
L(pagecross_ne):
	extsw	r3, r9
	mr	r9, r10
L(pagecross_retdiff):
	subf	r9, r9, r3
	extsw	r3, r9
	blr

	.align 4
L(pagecross_nullfound):
	li	r3, 0
	b	L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)
#else
#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
#endif