]>
Commit | Line | Data |
---|---|---|
8bedcb5f | 1 | /* Optimized strcmp implementation for PowerPC64/POWER8. |
04277e02 | 2 | Copyright (C) 2015-2019 Free Software Foundation, Inc. |
8bedcb5f AZ |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | #include <sysdep.h> | |
20 | ||
3bc426e1 WSM |
21 | #ifndef STRCMP |
22 | # define STRCMP strcmp | |
23 | #endif | |
24 | ||
8bedcb5f AZ |
25 | /* Implements the function |
26 | ||
27 | int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) | |
28 | ||
29 | The implementation uses unaligned doubleword access to avoid specialized | |
30 | code paths depending on data alignment. Although recent powerpc64 uses | |
31 | 64K pages by default, the page cross handling assumes a minimum page size | |
32 | of 4K. */ | |
33 | ||
9250e661 | 34 | .machine power8 |
d5b41185 | 35 | ENTRY_TOCLESS (STRCMP, 4) |
8bedcb5f AZ |
36 | li r0,0 /* r0 = 0: the all-zero dword that cmpb uses to find NUL bytes. */ |
37 | ||
04f0fd64 | 38 | /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
8bedcb5f AZ |
39 | the code: |
40 | ||
41 | (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) | |
42 | ||
04f0fd64 | 43 | with PAGE_SIZE being 4096 and ITER_SIZE being 16. */ |
8bedcb5f AZ |
44 | |
45 | rldicl r7,r3,0,52 /* r7 = s1 % 4096 (low 12 bits). */ | |
46 | rldicl r9,r4,0,52 /* r9 = s2 % 4096 (low 12 bits). */ | |
04f0fd64 | 47 | cmpldi cr7,r7,4096-16 |
8bedcb5f | 48 | bgt cr7,L(pagecross_check) |
04f0fd64 | 49 | cmpldi cr5,r9,4096-16 |
8bedcb5f AZ |
50 | bgt cr5,L(pagecross_check) |
51 | ||
04f0fd64 | 52 | /* For short string up to 16 bytes, load both s1 and s2 using
8bedcb5f AZ |
53 | unaligned dwords and compare. */ |
54 | ld r8,0(r3) | |
55 | ld r10,0(r4) | |
56 | cmpb r12,r8,r0 /* r12: 0xff in each byte of r8 that is NUL. */ | |
57 | cmpb r11,r8,r10 /* r11: 0xff in each byte where r8 equals r10. */ | |
58 | orc. r9,r12,r11 /* r9 = r12 OR NOT r11: non-zero on NUL or mismatch. */ | |
59 | bne cr0,L(different_nocmpb) | |
60 | ||
61 | ld r8,8(r3) /* Second dword: same test on bytes 8..15. */ | |
62 | ld r10,8(r4) | |
63 | cmpb r12,r8,r0 | |
64 | cmpb r11,r8,r10 | |
65 | orc. r9,r12,r11 | |
66 | bne cr0,L(different_nocmpb) | |
67 | ||
04f0fd64 RS |
68 | addi r7,r3,16 /* r7 = s1 + 16; r7 is the s1 cursor from here on. */ |
69 | addi r4,r4,16 /* r4 = s2 + 16. */ | |
8bedcb5f AZ |
70 | |
71 | L(align_8b): | |
04f0fd64 | 72 | /* The leading bytes have been compared (the first 16, or the byte-wise
8bedcb5f AZ |
73 | page-cross prefix); align source1 to doubleword and adjust source2. */ |
74 | rldicl r9,r7,0,61 /* source1 alignment to doubleword */ | |
75 | subf r4,r9,r4 /* Adjust source2 address based on source1 | |
76 | alignment. */ | |
77 | rldicr r7,r7,0,60 /* Align source1 to doubleword. */ | |
78 | ||
79 | /* At this point, source1 alignment is 0 and source2 alignment is | |
80 | between 0 and 7. Check if source2 alignment is 0, meaning both | |
81 | sources have the same alignment. */ | |
82 | andi. r9,r4,0x7 | |
83 | bne cr0,L(loop_diff_align) | |
84 | ||
85 | /* If both source1 and source2 are doubleword aligned, there is no | |
86 | need for page boundary cross checks. */ | |
87 | ||
88 | ld r8,0(r7) /* Both cursors moved back together, so up to 7 bytes are re-compared. */ | |
89 | ld r10,0(r4) | |
90 | cmpb r12,r8,r0 /* NUL bytes of the s1 dword (r0 is zero). */ | |
91 | cmpb r11,r8,r10 /* Bytes that are equal in both dwords. */ | |
92 | orc. r9,r12,r11 /* Non-zero when a NUL or a mismatch is present. */ | |
93 | bne cr0,L(different_nocmpb) | |
94 | ||
95 | .align 4 | |
96 | L(loop_equal_align): /* Three dword compares (24 bytes) per iteration. */ | |
97 | ld r8,8(r7) /* The dword at offset 0 was already checked. */ | |
98 | ld r10,8(r4) | |
99 | cmpb r12,r8,r0 | |
100 | cmpb r11,r8,r10 | |
101 | orc. r9,r12,r11 | |
102 | bne cr0,L(different_nocmpb) | |
103 | ||
104 | ld r8,16(r7) | |
105 | ld r10,16(r4) | |
106 | cmpb r12,r8,r0 | |
107 | cmpb r11,r8,r10 | |
108 | orc. r9,r12,r11 | |
109 | bne cr0,L(different_nocmpb) | |
110 | ||
111 | ldu r8,24(r7) /* Update form: r7 is also advanced by 24. */ | |
112 | ldu r10,24(r4) /* Update form: r4 is also advanced by 24. */ | |
113 | cmpb r12,r8,r0 | |
114 | cmpb r11,r8,r10 | |
115 | orc. r9,r12,r11 | |
116 | bne cr0,L(different_nocmpb) | |
117 | ||
118 | b L(loop_equal_align) | |
119 | ||
120 | /* A NUL byte or a mismatch was found. r8 holds the s1 dword, r10 | |
121 | the s2 dword, and r9 the orc. result: 0xff in every byte position | |
122 | that is NUL in s1 or differs between s1 and s2. The code extracts | |
123 | the first such byte (lowest address) from both dwords and returns | |
124 | their difference: | |
125 | #if __LITTLE_ENDIAN__ | |
126 | r9 = r9 & -r9 (isolate the lowest set bit) | |
127 | r9 = 63 - __builtin_clzl (r9) (bit index of that bit) | |
128 | #else | |
129 | r9 = 56 - __builtin_clzl (r9) | |
130 | #endif | |
131 | r3 = (r8 >> r9) & 0xff | |
132 | r10 = (r10 >> r9) & 0xff | |
133 | r3 = r3 - r10 | |
134 | r3 = (int) r3 */ | |
135 | ||
136 | #ifdef __LITTLE_ENDIAN__ | |
137 | nor r9,r9,r9 /* Unlabeled and preceded by an unconditional branch: not reached in this file. */ | |
138 | L(different_nocmpb): | |
139 | neg r3,r9 /* r3 = -r9 (scratch; r3 is overwritten below). */ | |
140 | and r9,r9,r3 /* Keep only the lowest set bit of r9. */ | |
141 | cntlzd r9,r9 | |
142 | subfic r9,r9,63 /* r9 = bit index of the first flagged byte. */ | |
143 | #else | |
144 | not r9,r9 /* Unlabeled and preceded by an unconditional branch: not reached in this file. */ | |
145 | L(different_nocmpb): | |
146 | cntlzd r9,r9 | |
147 | subfic r9,r9,56 /* r9 = shift that moves the first flagged byte to bits 0..7. */ | |
148 | #endif | |
149 | srd r3,r8,r9 /* Move the selected s1 byte to the low 8 bits. */ | |
150 | srd r10,r10,r9 /* Same for the s2 byte. */ | |
151 | rldicl r10,r10,0,56 /* Keep only the low 8 bits. */ | |
152 | rldicl r3,r3,0,56 | |
153 | subf r3,r10,r3 /* r3 = s1 byte - s2 byte. */ | |
154 | extsw r3,r3 /* Sign-extend the 32-bit result into r3. */ | |
155 | blr | |
156 | ||
157 | .align 4 | |
158 | L(pagecross_check): | |
159 | subfic r9,r9,4096 /* r9 = 4096 - (s2 % 4096): bytes until s2 reaches a 4K boundary. */ | |
160 | subfic r7,r7,4096 /* r7 = 4096 - (s1 % 4096): bytes until s1 reaches a 4K boundary. */ | |
161 | cmpld cr7,r7,r9 | |
162 | bge cr7,L(pagecross) /* Keep r7 when it is already the larger distance. */ | |
163 | mr r7,r9 /* Otherwise use the s2 distance: r7 = max of the two. */ | |
164 | ||
165 | /* If an unaligned 16-byte read would cross a 4K page boundary, use | |
166 | a simple byte-by-byte comparison for as many bytes as the farther | |
167 | of s1 and s2 is from its next 4K boundary (the count in r7). */ | |
168 | L(pagecross): | |
169 | add r7,r3,r7 /* r7 = s1 + count: the s1 address where the byte loop ends. */ | |
170 | subf r9,r3,r7 /* r9 = r7 - s1 = count. */ | |
171 | mtctr r9 /* CTR = number of bytes to compare one at a time. */ | |
172 | ||
173 | .align 4 | |
174 | L(pagecross_loop): | |
175 | /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 | |
176 | and if *s1 is '\0'. CTR holds the number of bytes left to check. */ | |
177 | lbz r9,0(r3) /* r9 = *s1 */ | |
178 | lbz r10,0(r4) /* r10 = *s2 */ | |
179 | addi r3,r3,1 | |
180 | addi r4,r4,1 | |
181 | cmplw cr7,r9,r10 /* cr7: *s1 against *s2. */ | |
182 | cmpdi cr5,r9,0 /* cr5: *s1 against the immediate 0 (cmpdi takes an immediate, not a register). */ | |
183 | bne cr7,L(pagecross_ne) | |
184 | beq cr5,L(pagecross_nullfound) | |
185 | bdnz L(pagecross_loop) | |
186 | b L(align_8b) /* Count exhausted with all bytes equal: resume dword compares. */ | |
187 | ||
188 | .align 4 | |
189 | /* The unaligned read of source2 will cross a 4K page boundary, | |
190 | and the different byte or NUL may be in the remaining page | |
191 | bytes. Since it can not use the unaligned load, the algorithm | |
192 | reads and compares 8 bytes to keep source1 doubleword aligned. */ | |
193 | L(check_source2_byte): | |
194 | li r9,8 | |
195 | mtctr r9 /* CTR = 8: one doubleword's worth of single-byte compares. */ | |
196 | ||
197 | .align 4 | |
198 | L(check_source2_byte_loop): | |
199 | lbz r9,0(r7) /* r9 = byte from source1 */ | |
200 | lbz r10,0(r4) /* r10 = byte from source2 */ | |
201 | addi r7,r7,1 | |
202 | addi r4,r4,1 | |
203 | cmplw cr7,r9,r10 /* cr7: source1 byte against source2 byte (register r10). */ | |
204 | cmpdi cr5,r9,0 /* cr5: source1 byte against 0; cr5 is the field the beq below tests. */ | |
205 | bne cr7,L(pagecross_ne) | |
206 | beq cr5,L(pagecross_nullfound) | |
207 | bdnz L(check_source2_byte_loop) | |
208 | ||
209 | /* If source2 is unaligned to doubleword, the code needs to check | |
210 | on each iteration if the unaligned doubleword access will cross | |
211 | a 4k page boundary. */ | |
212 | .align 5 | |
213 | L(loop_unaligned): | |
214 | ld r8,0(r7) /* source1 dword. */ | |
215 | ld r10,0(r4) /* source2 dword; r4 may not be doubleword aligned here. */ | |
216 | cmpb r12,r8,r0 /* NUL bytes of the source1 dword (r0 is zero). */ | |
217 | cmpb r11,r8,r10 /* Bytes that are equal in both dwords. */ | |
218 | orc. r9,r12,r11 /* Non-zero when a NUL or a mismatch is present. */ | |
219 | bne cr0,L(different_nocmpb) | |
220 | addi r7,r7,8 | |
221 | addi r4,r4,8 | |
222 | ||
223 | L(loop_diff_align): | |
224 | /* Check if [src2]+8 cross a 4k page boundary: | |
225 | ||
226 | srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) | |
227 | ||
228 | with PAGE_SIZE being 4096. */ | |
229 | rldicl r9,r4,0,52 /* r9 = source2 % 4096 (low 12 bits). */ | |
230 | cmpldi cr7,r9,4088 | |
231 | ble cr7,L(loop_unaligned) /* The 8-byte load stays inside the page. */ | |
232 | b L(check_source2_byte) /* Otherwise compare these 8 bytes one at a time. */ | |
233 | ||
234 | .align 4 | |
235 | L(pagecross_ne): /* Byte loops found differing bytes: r9 = s1 byte, r10 = s2 byte. */ | |
236 | extsw r3,r9 /* r3 = s1 byte. */ | |
237 | mr r9,r10 /* r9 = s2 byte. */ | |
238 | L(pagecross_retdiff): | |
239 | subf r9,r9,r3 /* r9 = r3 - r9. */ | |
240 | extsw r3,r9 /* Return the difference sign-extended from 32 bits. */ | |
241 | blr | |
242 | ||
243 | .align 4 | |
244 | L(pagecross_nullfound): /* Reached with equal bytes and s1 byte (r9) == 0. */ | |
245 | li r3,0 | |
246 | b L(pagecross_retdiff) /* Computes 0 - r9, which is 0 here. */ | |
3bc426e1 | 247 | END (STRCMP) |
8bedcb5f | 248 | libc_hidden_builtin_def (strcmp) |