/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#ifndef STRCMP
# define STRCMP strcmp
#endif

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses for the first
   32 bytes, as in the POWER8 version, and vectorised loops after
   that.  */

/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

#define VEXTUBRX(t,a,b) .long (0x1000070d \
                               | ((t)<<(32-11)) \
                               | ((a)<<(32-16)) \
                               | ((b)<<(32-21)) )

#define VCMPNEZB(t,a,b) .long (0x10000507 \
                               | ((t)<<(32-11)) \
                               | ((a)<<(32-16)) \
                               | ((b)<<(32-21)) )
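
/* The encodings above correspond to the POWER9 (ISA 3.0) instructions
   vctzlsbb (count trailing bytes whose least-significant bit is clear,
   which after a vcmpnezb yields the index of the first mismatching or
   zero byte), vextubrx (extract into a GPR the unsigned byte at a
   right-to-left index) and vcmpnezb. (record form: set a result byte
   to 0xff where the input bytes differ or either byte is zero, and
   update CR6).  */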

/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
#define GET16BYTES(reg1, reg2, reg3) \
        lvx reg1, 0, reg2; \
        vperm v8, v2, reg1, reg3; \
        vcmpequb. v8, v0, v8; \
        beq cr6, 1f; \
        vspltisb v9, 0; \
        b 2f; \
        .align 4; \
1: \
        addi r6, reg2, 16; \
        lvx v9, 0, r6; \
2: \
        vperm reg1, v9, reg1, reg3;
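
/* In rough C terms the macro behaves like the following sketch
   (illustrative only; 'off' stands for reg2 % 16 and 'q' for the
   aligned quadword containing reg2):

     first = q[0];                // lvx: never crosses a page boundary
     if (no '\0' among the 16 - off valid tail bytes of first)
       second = q[1];             // safe: the string continues
     else
       second = all zeroes;       // avoid touching the next page
     reg1 = the 16 bytes starting at reg2, merged via vperm;  */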

/* TODO: change this to .machine power9 when the minimum required binutils
   allows it.  */

        .machine power7
ENTRY_TOCLESS (STRCMP, 4)
        li r0, 0

        /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
           the code:

            (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

           with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
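
        /* For example, if s1 % 4096 == 4088, then 4088 > 4096 - 16, so a
           16-byte read starting at s1 would touch the next page and must
           take the byte-by-byte startup path below.  */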

        rldicl r7, r3, 0, 52
        rldicl r9, r4, 0, 52
        cmpldi cr7, r7, 4096-16
        bgt cr7, L(pagecross_check)
        cmpldi cr5, r9, 4096-16
        bgt cr5, L(pagecross_check)

        /* For short strings up to 16 bytes, load both s1 and s2 using
           unaligned dwords and compare.  */
        ld r8, 0(r3)
        ld r10, 0(r4)
        cmpb r12, r8, r0
        cmpb r11, r8, r10
        orc. r9, r12, r11
        bne cr0, L(different_nocmpb)

        ld r8, 8(r3)
        ld r10, 8(r4)
        cmpb r12, r8, r0
        cmpb r11, r8, r10
        orc. r9, r12, r11
        bne cr0, L(different_nocmpb)

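        /* The two doubleword checks above are, in C-like terms
           (illustrative only):

             w1 = 8 bytes of s1;  w2 = 8 bytes of s2;
             zeroes = cmpb (w1, 0);   // 0xff in each byte of w1 that is 0
             eqs    = cmpb (w1, w2);  // 0xff in each byte where w1 == w2
             if ((zeroes | ~eqs) != 0)  // a terminator or a mismatch
               goto different_nocmpb;  */
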
        addi r7, r3, 16
        addi r4, r4, 16

L(align):
        /* The first 16 bytes have now been checked.  */
        vspltisb v0, 0
        vspltisb v2, -1
        lvsr v6, 0, r4   /* Compute mask.  */
        or r5, r4, r7
        andi. r5, r5, 0xF
        beq cr0, L(aligned)
        andi. r5, r7, 0xF
        beq cr0, L(s1_align)
        lvsr v10, 0, r7   /* Compute mask.  */

        /* Both s1 and s2 are unaligned.  */
        GET16BYTES(v4, r7, v10)
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        beq cr6, L(match)
        b L(different)

        /* Align s1 to a quadword and adjust s2 by the same amount:
           both pointers advance by 16 - (s1 % 16).  */
        .align 4
L(match):
        clrldi r6, r7, 60
        subfic r5, r6, 16
        add r7, r7, r5
        add r4, r4, r5
        andi. r5, r4, 0xF
        beq cr0, L(aligned)
        lvsr v6, 0, r4
        /* There are 2 loops depending on the input alignment.
           Each loop gets 16 bytes from s1 and s2 and compares.
           Loop until a mismatch or null occurs.  */
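        /* One iteration of either loop, in C-like terms (illustrative
           only):

             v4 = 16 bytes of s1;  v5 = 16 bytes of s2;
             v7 = vcmpnezb (v5, v4);  // 0xff where bytes differ or are 0
             s1 += 16;  s2 += 16;
             if (any byte of v7 is set) goto different;

           Both loops are unrolled four times.  */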
L(s1_align):
        lvx v4, r7, r0
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        bne cr6, L(different)

        lvx v4, r7, r0
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        bne cr6, L(different)

        lvx v4, r7, r0
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        bne cr6, L(different)

        lvx v4, r7, r0
        GET16BYTES(v5, r4, v6)
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        beq cr6, L(s1_align)
        b L(different)

        .align 4
L(aligned):
        lvx v4, 0, r7
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        bne cr6, L(different)

        lvx v4, 0, r7
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        bne cr6, L(different)

        lvx v4, 0, r7
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        bne cr6, L(different)

        lvx v4, 0, r7
        lvx v5, 0, r4
        VCMPNEZB(v7, v5, v4)
        addi r7, r7, 16
        addi r4, r4, 16
        beq cr6, L(aligned)

        /* Calculate and return the difference.  */
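        /* In C-like terms (illustrative only): let i be the index of the
           first 0xff byte of v7, in effect the memory-order index of the
           first mismatching or zero byte (vctzlsbb); then return
           (int) ((unsigned char) s1[i] - (unsigned char) s2[i]) using
           the bytes still held in v4/v5 (vextubrx).  */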
L(different):
        VCTZLSBB(r6, v7)
        VEXTUBRX(r5, r6, v4)
        VEXTUBRX(r4, r6, v5)
        subf r3, r4, r5
        extsw r3, r3
        blr

        .align 4
L(different_nocmpb):
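        /* r9 has 0xff in each byte that is a terminator or a mismatch.
           In C-like terms (illustrative only; w1/w2 are the dwords
           loaded above):

             bit = 63 - clz (r9 & -r9);  // lowest set bit, i.e. bit 8*k
                                         // of the first marked byte k
             c1 = (w1 >> bit) & 0xff;    // the differing byte of s1
             c2 = (w2 >> bit) & 0xff;    // the corresponding byte of s2
             return (int) (c1 - c2);  */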
        neg r3, r9
        and r9, r9, r3
        cntlzd r9, r9
        subfic r9, r9, 63
        srd r3, r8, r9
        srd r10, r10, r9
        rldicl r10, r10, 0, 56
        rldicl r3, r3, 0, 56
        subf r3, r10, r3
        extsw r3, r3
        blr

        .align 4
L(pagecross_check):
        subfic r9, r9, 4096
        subfic r7, r7, 4096
        cmpld cr7, r7, r9
        bge cr7, L(pagecross)
        mr r7, r9

        /* If an unaligned 16-byte read would cross a 4K page boundary, use
           a simple byte-by-byte comparison until the page boundary of both
           s1 and s2 has been crossed.  */
L(pagecross):
        add r7, r3, r7
        subf r9, r3, r7
        mtctr r9

        .align 4
L(pagecross_loop):
        /* Load a byte from s1 and s2, then check whether *s1 equals *s2
           and whether *s1 is '\0'.  */
        lbz r9, 0(r3)
        lbz r10, 0(r4)
        addi r3, r3, 1
        addi r4, r4, 1
        cmplw cr7, r9, r10
        cmpdi cr5, r9, r0
        bne cr7, L(pagecross_ne)
        beq cr5, L(pagecross_nullfound)
        bdnz L(pagecross_loop)
        b L(align)

        .align 4
L(pagecross_ne):
        extsw r3, r9
        mr r9, r10
L(pagecross_retdiff):
        subf r9, r9, r3
        extsw r3, r9
        blr

        .align 4
L(pagecross_nullfound):
        li r3, 0
        b L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)