/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
18 | #ifdef __LITTLE_ENDIAN__ | |
19 | #include <sysdep.h> | |
20 | ||
3bc426e1 WSM |
21 | #ifndef STRCMP |
22 | # define STRCMP strcmp | |
23 | #endif | |
24 | ||
80ab6401 RS |
25 | /* Implements the function |
26 | ||
27 | int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) | |
28 | ||
29 | The implementation uses unaligned doubleword access for first 32 bytes | |
30 | as in POWER8 patch and uses vectorised loops after that. */ | |
31 | ||
32 | /* TODO: Change this to actual instructions when minimum binutils is upgraded | |
33 | to 2.27. Macros are defined below for these newer instructions in order | |
34 | to maintain compatibility. */ | |
35 | # define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21))) | |
36 | ||
37 | # define VEXTUBRX(t,a,b) .long (0x1000070d \ | |
38 | | ((t)<<(32-11)) \ | |
39 | | ((a)<<(32-16)) \ | |
40 | | ((b)<<(32-21)) ) | |
41 | ||
42 | # define VCMPNEZB(t,a,b) .long (0x10000507 \ | |
43 | | ((t)<<(32-11)) \ | |
44 | | ((a)<<(32-16)) \ | |
45 | | ((b)<<(32-21)) ) | |
46 | ||
47 | /* Get 16 bytes for unaligned case. | |
48 | reg1: Vector to hold next 16 bytes. | |
49 | reg2: Address to read from. | |
50 | reg3: Permute control vector. */ | |
51 | # define GET16BYTES(reg1, reg2, reg3) \ | |
52 | lvx reg1, 0, reg2; \ | |
53 | vperm v8, v2, reg1, reg3; \ | |
54 | vcmpequb. v8, v0, v8; \ | |
55 | beq cr6, 1f; \ | |
56 | vspltisb v9, 0; \ | |
57 | b 2f; \ | |
58 | .align 4; \ | |
59 | 1: \ | |
60 | addi r6, reg2, 16; \ | |
61 | lvx v9, 0, r6; \ | |
62 | 2: \ | |
63 | vperm reg1, v9, reg1, reg3; | |
64 | ||
65 | /* TODO: change this to .machine power9 when the minimum required binutils | |
66 | allows it. */ | |
67 | ||
68 | .machine power7 | |
d5b41185 | 69 | ENTRY_TOCLESS (STRCMP, 4) |
80ab6401 RS |
70 | li r0, 0 |
71 | ||
04f0fd64 | 72 | /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using |
80ab6401 RS |
73 | the code: |
74 | ||
75 | (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) | |
76 | ||
04f0fd64 | 77 | with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */ |
80ab6401 RS |
78 | |
79 | rldicl r7, r3, 0, 52 | |
80 | rldicl r9, r4, 0, 52 | |
04f0fd64 | 81 | cmpldi cr7, r7, 4096-16 |
80ab6401 | 82 | bgt cr7, L(pagecross_check) |
04f0fd64 | 83 | cmpldi cr5, r9, 4096-16 |
80ab6401 RS |
84 | bgt cr5, L(pagecross_check) |
85 | ||
04f0fd64 | 86 | /* For short strings up to 16 bytes, load both s1 and s2 using |
80ab6401 RS |
87 | unaligned dwords and compare. */ |
88 | ld r8, 0(r3) | |
89 | ld r10, 0(r4) | |
90 | cmpb r12, r8, r0 | |
91 | cmpb r11, r8, r10 | |
92 | orc. r9, r12, r11 | |
93 | bne cr0, L(different_nocmpb) | |
94 | ||
95 | ld r8, 8(r3) | |
96 | ld r10, 8(r4) | |
97 | cmpb r12, r8, r0 | |
98 | cmpb r11, r8, r10 | |
99 | orc. r9, r12, r11 | |
100 | bne cr0, L(different_nocmpb) | |
101 | ||
04f0fd64 RS |
102 | addi r7, r3, 16 |
103 | addi r4, r4, 16 | |
80ab6401 RS |
104 | |
105 | L(align): | |
04f0fd64 | 106 | /* Now it has checked for first 16 bytes. */ |
80ab6401 RS |
107 | vspltisb v0, 0 |
108 | vspltisb v2, -1 | |
109 | lvsr v6, 0, r4 /* Compute mask. */ | |
110 | or r5, r4, r7 | |
111 | andi. r5, r5, 0xF | |
112 | beq cr0, L(aligned) | |
113 | andi. r5, r7, 0xF | |
114 | beq cr0, L(s1_align) | |
115 | lvsr v10, 0, r7 /* Compute mask. */ | |
116 | ||
117 | /* Both s1 and s2 are unaligned. */ | |
118 | GET16BYTES(v4, r7, v10) | |
119 | GET16BYTES(v5, r4, v6) | |
120 | VCMPNEZB(v7, v5, v4) | |
121 | beq cr6, L(match) | |
122 | b L(different) | |
123 | ||
124 | /* Align s1 to qw and adjust s2 address. */ | |
125 | .align 4 | |
126 | L(match): | |
127 | clrldi r6, r7, 60 | |
128 | subfic r5, r6, 16 | |
129 | add r7, r7, r5 | |
130 | add r4, r4, r5 | |
131 | andi. r5, r4, 0xF | |
132 | beq cr0, L(aligned) | |
133 | lvsr v6, 0, r4 | |
134 | /* There are 2 loops depending on the input alignment. | |
135 | Each loop gets 16 bytes from s1 and s2 and compares. | |
136 | Loop until a mismatch or null occurs. */ | |
137 | L(s1_align): | |
138 | lvx v4, r7, r0 | |
139 | GET16BYTES(v5, r4, v6) | |
140 | VCMPNEZB(v7, v5, v4) | |
141 | addi r7, r7, 16 | |
142 | addi r4, r4, 16 | |
143 | bne cr6, L(different) | |
144 | ||
145 | lvx v4, r7, r0 | |
146 | GET16BYTES(v5, r4, v6) | |
147 | VCMPNEZB(v7, v5, v4) | |
148 | addi r7, r7, 16 | |
149 | addi r4, r4, 16 | |
150 | bne cr6, L(different) | |
151 | ||
152 | lvx v4, r7, r0 | |
153 | GET16BYTES(v5, r4, v6) | |
154 | VCMPNEZB(v7, v5, v4) | |
155 | addi r7, r7, 16 | |
156 | addi r4, r4, 16 | |
157 | bne cr6, L(different) | |
158 | ||
159 | lvx v4, r7, r0 | |
160 | GET16BYTES(v5, r4, v6) | |
161 | VCMPNEZB(v7, v5, v4) | |
162 | addi r7, r7, 16 | |
163 | addi r4, r4, 16 | |
164 | beq cr6, L(s1_align) | |
165 | b L(different) | |
166 | ||
167 | .align 4 | |
168 | L(aligned): | |
169 | lvx v4, 0, r7 | |
170 | lvx v5, 0, r4 | |
171 | VCMPNEZB(v7, v5, v4) | |
172 | addi r7, r7, 16 | |
173 | addi r4, r4, 16 | |
174 | bne cr6, L(different) | |
175 | ||
176 | lvx v4, 0, r7 | |
177 | lvx v5, 0, r4 | |
178 | VCMPNEZB(v7, v5, v4) | |
179 | addi r7, r7, 16 | |
180 | addi r4, r4, 16 | |
181 | bne cr6, L(different) | |
182 | ||
183 | lvx v4, 0, r7 | |
184 | lvx v5, 0, r4 | |
185 | VCMPNEZB(v7, v5, v4) | |
186 | addi r7, r7, 16 | |
187 | addi r4, r4, 16 | |
188 | bne cr6, L(different) | |
189 | ||
190 | lvx v4, 0, r7 | |
191 | lvx v5, 0, r4 | |
192 | VCMPNEZB(v7, v5, v4) | |
193 | addi r7, r7, 16 | |
194 | addi r4, r4, 16 | |
195 | beq cr6, L(aligned) | |
196 | ||
197 | /* Calculate and return the difference. */ | |
198 | L(different): | |
199 | VCTZLSBB(r6, v7) | |
200 | VEXTUBRX(r5, r6, v4) | |
201 | VEXTUBRX(r4, r6, v5) | |
202 | subf r3, r4, r5 | |
203 | extsw r3, r3 | |
204 | blr | |
205 | ||
206 | .align 4 | |
207 | L(different_nocmpb): | |
208 | neg r3, r9 | |
209 | and r9, r9, r3 | |
210 | cntlzd r9, r9 | |
211 | subfic r9, r9, 63 | |
212 | srd r3, r8, r9 | |
213 | srd r10, r10, r9 | |
214 | rldicl r10, r10, 0, 56 | |
215 | rldicl r3, r3, 0, 56 | |
216 | subf r3, r10, r3 | |
217 | extsw r3, r3 | |
218 | blr | |
219 | ||
220 | .align 4 | |
221 | L(pagecross_check): | |
222 | subfic r9, r9, 4096 | |
223 | subfic r7, r7, 4096 | |
224 | cmpld cr7, r7, r9 | |
225 | bge cr7, L(pagecross) | |
226 | mr r7, r9 | |
227 | ||
228 | /* If unaligned 16 bytes reads across a 4K page boundary, it uses | |
229 | a simple byte a byte comparison until the page alignment for s1 | |
230 | is reached. */ | |
231 | L(pagecross): | |
232 | add r7, r3, r7 | |
233 | subf r9, r3, r7 | |
234 | mtctr r9 | |
235 | ||
236 | .align 4 | |
237 | L(pagecross_loop): | |
238 | /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 | |
239 | and if *s1 is '\0'. */ | |
240 | lbz r9, 0(r3) | |
241 | lbz r10, 0(r4) | |
242 | addi r3, r3, 1 | |
243 | addi r4, r4, 1 | |
244 | cmplw cr7, r9, r10 | |
245 | cmpdi cr5, r9, r0 | |
246 | bne cr7, L(pagecross_ne) | |
247 | beq cr5, L(pagecross_nullfound) | |
248 | bdnz L(pagecross_loop) | |
249 | b L(align) | |
250 | ||
251 | .align 4 | |
252 | L(pagecross_ne): | |
253 | extsw r3, r9 | |
254 | mr r9, r10 | |
255 | L(pagecross_retdiff): | |
256 | subf r9, r9, r3 | |
257 | extsw r3, r9 | |
258 | blr | |
259 | ||
260 | .align 4 | |
261 | L(pagecross_nullfound): | |
262 | li r3, 0 | |
263 | b L(pagecross_retdiff) | |
3bc426e1 | 264 | END (STRCMP) |
80ab6401 RS |
265 | libc_hidden_builtin_def (strcmp) |
266 | #else | |
267 | #include <sysdeps/powerpc/powerpc64/power8/strcmp.S> | |
268 | #endif |