/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef __LITTLE_ENDIAN__
#include <sysdep.h>

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses for the first
   32 bytes, as in the POWER8 version, and vectorised loops after that.  */

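/* For reference, the semantics being implemented are those of ISO C
   strcmp.  A minimal, unoptimised C sketch of the contract, not part of
   the build:

     int
     strcmp (const char *s1, const char *s2)
     {
       while (*s1 != '\0' && *s1 == *s2)
	 s1++, s2++;
       return (unsigned char) *s1 - (unsigned char) *s2;
     }

   The assembly below performs this byte-wise comparison 8 bytes at a
   time for the first 32 bytes and 16 bytes at a time afterwards.  */
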
/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
# define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

# define VEXTUBRX(t,a,b) .long (0x1000070d \
	| ((t)<<(32-11)) \
	| ((a)<<(32-16)) \
	| ((b)<<(32-21)) )

# define VCMPNEZB(t,a,b) .long (0x10000507 \
	| ((t)<<(32-11)) \
	| ((a)<<(32-16)) \
	| ((b)<<(32-21)) )

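/* These macros hand-encode the POWER9 VX-form instructions vctzlsbb,
   vextubrx and vcmpnezb: the base .long value is the opcode word with
   all register fields zero, and the shifts place the target (bits 6-10),
   A (bits 11-15) and B (bits 16-20) register numbers in IBM bit order.
   As a worked example (an illustration, not part of the build),
   VCMPNEZB(7, 5, 4) expands to

     .long (0x10000507 | (7 << 21) | (5 << 16) | (4 << 11))

   i.e. .long 0x10e52507, the encoding of the record form
   "vcmpnezb. v7, v5, v4", which sets CR6 for the beq/bne cr6 tests
   used throughout this file.  */
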
/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
# define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;

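/* How GET16BYTES reads 16 unaligned bytes without faulting: lvx fetches
   the aligned quadword containing reg2, and the first vperm (with v2 set
   to all ones) replaces the bytes that lie before reg2 with 0xff so they
   can never look like a null.  Only if that fragment contains no null
   byte is the next quadword loaded; the aligned load there is safe,
   since the string must then extend past it.  A hedged C sketch of the
   idea (vec_t and the helper names are hypothetical, not part of this
   file):

     static inline vec_t load16_unaligned (const unsigned char *p)
     {
       uintptr_t base = (uintptr_t) p & ~(uintptr_t) 15;
       uintptr_t off = (uintptr_t) p & 15;
       vec_t head = aligned_load (base);	// lvx reg1, 0, reg2
       vec_t tail;
       if (!has_null_from (head, off))		// vperm + vcmpequb.
	 tail = aligned_load (base + 16);	// lvx v9, 0, r6
       else
	 tail = zeros ();			// vspltisb v9, 0
       return merge_from (head, tail, off);	// final vperm
     }
  */
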
/* TODO: change this to .machine power9 when the minimum required binutils
   allows it.  */

	.machine  power7
EALIGN (strcmp, 4, 0)
	li	r0, 0

	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */

	rldicl	r7, r3, 0, 52
	rldicl	r9, r4, 0, 52
	cmpldi	cr7, r7, 4096-32
	bgt	cr7, L(pagecross_check)
	cmpldi	cr5, r9, 4096-32
	bgt	cr5, L(pagecross_check)

	/* For short strings up to 32 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */
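	/* The cmpb/orc. idiom below finds a mismatch or a null in one
	   pass: cmpb sets each result byte to 0xff where the
	   corresponding input bytes are equal, else to 0x00.  A hedged C
	   sketch of one 8-byte step, with dword_cmpb standing in for the
	   cmpb instruction:

	     uint64_t nulls  = dword_cmpb (w1, 0);	// 0xff where byte == 0
	     uint64_t equals = dword_cmpb (w1, w2);	// 0xff where bytes match
	     if ((nulls | ~equals) != 0)		// orc. r9, r12, r11
	       goto different_nocmpb;

	   r9 is zero only while every byte pair matches and none is
	   null.  */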
	ld	r8, 0(r3)
	ld	r10, 0(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 8(r3)
	ld	r10, 8(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 16(r3)
	ld	r10, 16(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 24(r3)
	ld	r10, 24(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	addi	r7, r3, 32
	addi	r4, r4, 32

L(align):
	/* The first 32 bytes have been checked at this point.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	lvsr	v6, 0, r4	/* Compute mask.  */
	or	r5, r4, r7
	andi.	r5, r5, 0xF
	beq	cr0, L(aligned)
	andi.	r5, r7, 0xF
	beq	cr0, L(s1_align)
	lvsr	v10, 0, r7	/* Compute mask.  */

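	/* lvsr builds the permute control vector consumed by GET16BYTES:
	   for an address with sh = addr & 15, it yields the byte indices
	   { 16-sh, 17-sh, ..., 31-sh }, so a vperm of two consecutive
	   aligned quadwords produces the 16 bytes starting at the
	   unaligned address.  (This describes the classic lvsr/vperm
	   realignment idiom; the exact index ordering here follows the
	   little-endian layout this file is compiled for.)  */
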
	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v4, r7, v10)
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to a quadword and adjust the s2 address.  */
	.align  4
L(match):
	clrldi	r6, r7, 60
	subfic	r5, r6, 16
	add	r7, r7, r5
	add	r4, r4, r5
	andi.	r5, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2 and compares.
	   Loop until a mismatch or null occurs.  */
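	/* In L(s1_align) only s2 still needs the GET16BYTES realignment,
	   since s1 was just advanced to a 16-byte boundary; in L(aligned)
	   both pointers are aligned and plain lvx loads suffice.  Both
	   loops are unrolled four times to reduce branch overhead.  */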
L(s1_align):
	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(s1_align)
	b	L(different)

	.align  4
L(aligned):
	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(aligned)

	/* Calculate and return the difference.  */
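	/* VCMPNEZB left 0xff in v7 at every byte position where the
	   inputs differ or one of them is null.  VCTZLSBB counts the
	   low-index bytes that are clear, giving the index of the first
	   such position in string order, and VEXTUBRX extracts the byte
	   at that index from each source vector.  A hedged C sketch of
	   this ending (helper names are illustrative only):

	     int i  = first_marked_byte_index (v7);	// VCTZLSBB
	     int b1 = byte_at (v4, i);			// VEXTUBRX
	     int b2 = byte_at (v5, i);			// VEXTUBRX
	     return b1 - b2;
	  */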
L(different):
	VCTZLSBB(r6, v7)
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

	.align  4
L(different_nocmpb):
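	/* r9 holds the cmpb/orc. result: 0xff bytes mark the first
	   mismatch or null.  neg/and isolates its lowest set bit, cntlzd
	   locates it, and 63 - cntlzd gives the shift that brings the
	   interesting byte into the low 8 bits of both words, where
	   rldicl masks it and subf forms the result.  A worked example
	   (illustrative only): for r9 = 0x0000ff0000000000, the isolated
	   bit is 1 << 40, cntlzd returns 23, the shift is 63 - 23 = 40,
	   and byte 5 of the current doubleword, i.e. offset 5 of the
	   8-byte block in little-endian string order, is compared.  */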
	neg	r3, r9
	and	r9, r9, r3
	cntlzd	r9, r9
	subfic	r9, r9, 63
	srd	r3, r8, r9
	srd	r10, r10, r9
	rldicl	r10, r10, 0, 56
	rldicl	r3, r3, 0, 56
	subf	r3, r10, r3
	extsw	r3, r3
	blr

	.align  4
L(pagecross_check):
	subfic	r9, r9, 4096
	subfic	r7, r7, 4096
	cmpld	cr7, r7, r9
	bge	cr7, L(pagecross)
	mr	r7, r9

	/* If an unaligned load would read across a 4K page boundary, use
	   a simple byte-by-byte comparison until the page boundary is
	   passed, then continue with the vector loops at L(align).  */
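	/* Single-byte loads can never fault on a partially mapped
	   object, so the scalar loop below is always safe; r7 was set
	   above to the larger of the two distances to the next page
	   boundary, which bounds the number of byte iterations before
	   the vector code can resume.  */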
L(pagecross):
	add	r7, r3, r7
	subf	r9, r3, r7
	mtctr	r9

	.align  4
L(pagecross_loop):
	/* Load a byte from s1 and s2, then check whether *s1 equals *s2
	   and whether *s1 is '\0'.  */
	lbz	r9, 0(r3)
	lbz	r10, 0(r4)
	addi	r3, r3, 1
	addi	r4, r4, 1
	cmplw	cr7, r9, r10
	cmpdi	cr5, r9, r0
	bne	cr7, L(pagecross_ne)
	beq	cr5, L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align)

	.align  4
L(pagecross_ne):
	extsw	r3, r9
	mr	r9, r10
L(pagecross_retdiff):
	subf	r9, r9, r3
	extsw	r3, r9
	blr

	.align  4
L(pagecross_nullfound):
	li	r3, 0
	b	L(pagecross_retdiff)
END (strcmp)
libc_hidden_builtin_def (strcmp)
#else
#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
#endif