]>
Commit | Line | Data |
---|---|---|
3af48cbd | 1 | /* memcpy with SSSE3 and REP string. |
d614a753 | 2 | Copyright (C) 2010-2020 Free Software Foundation, Inc. |
3af48cbd L |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 17 | License along with the GNU C Library; if not, see |
5a82c748 | 18 | <https://www.gnu.org/licenses/>. */ |
3af48cbd L |
19 | |
20 | #include <sysdep.h> | |
21 | ||
4f41c682 | 22 | #if IS_IN (libc) \ |
3af48cbd | 23 | && (defined SHARED \ |
f9a97dda | 24 | || defined USE_AS_MEMMOVE \ |
3af48cbd L |
25 | || !defined USE_MULTIARCH) |
26 | ||
27 | #include "asm-syntax.h" | |
28 | ||
29 | #ifndef MEMCPY /* Supply the default symbol names only when MEMCPY is not already defined. */ | |
30 | # define MEMCPY __memcpy_ssse3_rep /* Name used by ENTRY (MEMCPY) for the main copy routine. */ | |
31 | # define MEMCPY_CHK __memcpy_chk_ssse3_rep /* Name used by ENTRY (MEMCPY_CHK) for the length-checking entry point. */ | |
32 | #endif | |
33 | ||
34 | #ifdef USE_AS_BCOPY /* SRC, DEST and LEN are %esp-relative offsets of the three stack arguments. */ | |
35 | # define SRC PARMS /* With USE_AS_BCOPY the source pointer is the first argument... */ | |
36 | # define DEST SRC+4 /* ...the destination pointer is the second... */ | |
37 | # define LEN DEST+4 /* ...and the byte count is the third. */ | |
38 | #else | |
39 | # define DEST PARMS /* Otherwise the destination pointer is the first argument... */ | |
40 | # define SRC DEST+4 /* ...the source pointer is the second... */ | |
41 | # define LEN SRC+4 /* ...and the byte count is the third. */ | |
42 | #endif | |
43 | ||
44 | #define CFI_PUSH(REG) /* Unwind bookkeeping that accompanies a 4-byte push of REG. */ \ | |
45 | cfi_adjust_cfa_offset (4); \ | |
46 | cfi_rel_offset (REG, 0) | |
47 | ||
48 | #define CFI_POP(REG) /* Unwind bookkeeping that accompanies a 4-byte pop of REG. */ \ | |
49 | cfi_adjust_cfa_offset (-4); \ | |
50 | cfi_restore (REG) | |
51 | ||
52 | #define PUSH(REG) pushl REG; CFI_PUSH (REG) /* Push REG and emit the matching CFI annotations. */ | |
53 | #define POP(REG) popl REG; CFI_POP (REG) /* Pop REG and emit the matching CFI annotations. */ | |
54 | ||
dfc93c41 | 55 | #ifdef PIC /* PIC build: EBX is saved on entry and used as the scratch register for table dispatch. */ |
3af48cbd L |
56 | # define PARMS 8 /* The return address and the EBX pushed by ENTRANCE precede the first argument. */ |
57 | # define ENTRANCE PUSH (%ebx); | |
58 | # define RETURN_END POP (%ebx); ret | |
59 | # define RETURN RETURN_END; CFI_PUSH (%ebx) /* Return, then re-assert the saved-EBX unwind state for the code that follows. */ | |
60 | # define JMPTBL(I, B) I - B /* Table entries are offsets relative to B. */ | |
61 | ||
62 | /* Load an entry in a jump table into EBX and branch to it. TABLE is a | |
63 | jump table with relative offsets. INDEX is a register that contains the | |
64 | index into the jump table. SCALE is the scale of INDEX. */ | |
65 | # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ | |
66 | /* We first load PC into EBX. */ \ | |
9a1d9254 | 67 | SETUP_PIC_REG(bx); \ |
3af48cbd L |
68 | /* Get the address of the jump table. */ \ |
69 | addl $(TABLE - .), %ebx; \ | |
70 | /* Get the entry and convert the relative offset to the \ | |
71 | absolute address. */ \ | |
72 | addl (%ebx,INDEX,SCALE), %ebx; \ | |
73 | /* EBX now holds the absolute target address. Go. */ \ | |
0a899af0 | 74 | _CET_NOTRACK jmp *%ebx |
3af48cbd L |
75 | |
76 | # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) /* First half of a split dispatch: yields the address of TABLE in EBX, provided EBX equals the address of this addl on entry. */ \ | |
f9a97dda UD |
77 | addl $(TABLE - .), %ebx |
78 | ||
3af48cbd L |
79 | # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) /* Second half of a split dispatch: EBX must already hold the address of TABLE. */ \ |
80 | addl (%ebx,INDEX,SCALE), %ebx; \ | |
81 | /* EBX now holds the absolute target address. Go. */ \ | |
0a899af0 | 82 | _CET_NOTRACK jmp *%ebx |
3af48cbd L |
83 | #else |
84 | # define PARMS 4 /* Only the return address precedes the first argument. */ | |
85 | # define ENTRANCE | |
86 | # define RETURN_END ret | |
87 | # define RETURN RETURN_END | |
88 | # define JMPTBL(I, B) I /* Table entries are absolute addresses. */ | |
89 | ||
90 | /* Branch to an entry in a jump table. TABLE is a jump table with | |
91 | absolute offsets. INDEX is a register that contains the index into the | |
92 | jump table. SCALE is the scale of INDEX. */ | |
93 | # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ | |
0a899af0 | 94 | _CET_NOTRACK jmp *TABLE(,INDEX,SCALE) |
3af48cbd | 95 | |
f9a97dda | 96 | # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) /* Expands to nothing: the table is addressed absolutely. */ |
3af48cbd L |
97 | |
98 | # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) /* Same indirect jump as BRANCH_TO_JMPTBL_ENTRY. */ \ | |
0a899af0 | 99 | _CET_NOTRACK jmp *TABLE(,INDEX,SCALE) |
3af48cbd L |
100 | #endif |
101 | ||
102 | .section .text.ssse3,"ax",@progbits | |
b358255f | 103 | #if !defined USE_AS_BCOPY && defined SHARED |
3af48cbd L |
104 | ENTRY (MEMCPY_CHK) /* Checked entry point: validates the length, then continues into the code that follows END. */ | |
105 | movl 12(%esp), %eax /* EAX = third stack argument (the byte count; nothing has been pushed yet). */ | |
106 | cmpl %eax, 16(%esp) /* Compare the fourth stack argument (presumably the destination object size) with the count. */ | |
107 | jb HIDDEN_JUMPTARGET (__chk_fail) /* Unsigned: fourth argument below the count, so branch to __chk_fail. */ | |
108 | END (MEMCPY_CHK) /* No ret or jmp above: a passing check falls through to whatever is assembled next. */ | |
109 | #endif | |
110 | ENTRY (MEMCPY) | |
111 | ENTRANCE | |
112 | movl LEN(%esp), %ecx | |
113 | movl SRC(%esp), %eax | |
114 | movl DEST(%esp), %edx | |
115 | ||
116 | #ifdef USE_AS_MEMMOVE | |
117 | cmp %eax, %edx | |
118 | jb L(copy_forward) | |
119 | je L(fwd_write_0bytes) | |
3093e0c7 L |
120 | cmp $48, %ecx |
121 | jb L(bk_write_less48bytes) | |
3af48cbd L |
122 | add %ecx, %eax |
123 | cmp %eax, %edx | |
124 | movl SRC(%esp), %eax | |
125 | jb L(copy_backward) | |
126 | ||
127 | L(copy_forward): | |
128 | #endif | |
129 | cmp $48, %ecx | |
3093e0c7 | 130 | jae L(48bytesormore) |
3af48cbd L |
131 | |
132 | L(fwd_write_less32bytes): | |
133 | #ifndef USE_AS_MEMMOVE | |
134 | cmp %dl, %al | |
3093e0c7 | 135 | jb L(bk_write) |
3af48cbd L |
136 | #endif |
137 | add %ecx, %edx | |
138 | add %ecx, %eax | |
139 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) | |
140 | #ifndef USE_AS_MEMMOVE | |
141 | L(bk_write): | |
142 | BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) | |
143 | #endif | |
144 | ||
145 | ALIGN (4) | |
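146 | /* ECX >= 48 when reached from the dispatch above. EDX (destination) is not aligned yet; it is rounded up to the next 16-byte boundary below, and the first 16 bytes are copied unaligned through XMM0. */ | |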
147 | L(48bytesormore): | |
148 | movdqu (%eax), %xmm0 | |
149 | PUSH (%edi) | |
150 | movl %edx, %edi | |
151 | and $-16, %edx | |
152 | PUSH (%esi) | |
3093e0c7 | 153 | cfi_remember_state |
3af48cbd L |
154 | add $16, %edx |
155 | movl %edi, %esi | |
156 | sub %edx, %edi | |
157 | add %edi, %ecx | |
158 | sub %edi, %eax | |
159 | ||
160 | #ifdef SHARED_CACHE_SIZE_HALF | |
161 | cmp $SHARED_CACHE_SIZE_HALF, %ecx | |
162 | #else | |
dfc93c41 | 163 | # ifdef PIC |
9a1d9254 | 164 | SETUP_PIC_REG(bx) |
3af48cbd L |
165 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
166 | cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx | |
167 | # else | |
168 | cmp __x86_shared_cache_size_half, %ecx | |
169 | # endif | |
170 | #endif | |
171 | ||
172 | mov %eax, %edi | |
3093e0c7 | 173 | jae L(large_page) |
3af48cbd L |
174 | and $0xf, %edi |
175 | jz L(shl_0) | |
176 | ||
177 | BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) | |
178 | ||
179 | ALIGN (4) | |
180 | L(shl_0): | |
181 | movdqu %xmm0, (%esi) | |
182 | xor %edi, %edi | |
183 | cmp $127, %ecx | |
184 | ja L(shl_0_gobble) | |
185 | lea -32(%ecx), %ecx | |
186 | L(shl_0_loop): | |
187 | movdqa (%eax, %edi), %xmm0 | |
188 | movdqa 16(%eax, %edi), %xmm1 | |
189 | sub $32, %ecx | |
190 | movdqa %xmm0, (%edx, %edi) | |
191 | movdqa %xmm1, 16(%edx, %edi) | |
192 | lea 32(%edi), %edi | |
3093e0c7 | 193 | jb L(shl_0_end) |
3af48cbd L |
194 | |
195 | movdqa (%eax, %edi), %xmm0 | |
196 | movdqa 16(%eax, %edi), %xmm1 | |
197 | sub $32, %ecx | |
198 | movdqa %xmm0, (%edx, %edi) | |
199 | movdqa %xmm1, 16(%edx, %edi) | |
200 | lea 32(%edi), %edi | |
3093e0c7 | 201 | jb L(shl_0_end) |
3af48cbd L |
202 | |
203 | movdqa (%eax, %edi), %xmm0 | |
204 | movdqa 16(%eax, %edi), %xmm1 | |
205 | sub $32, %ecx | |
206 | movdqa %xmm0, (%edx, %edi) | |
207 | movdqa %xmm1, 16(%edx, %edi) | |
208 | lea 32(%edi), %edi | |
3093e0c7 | 209 | jb L(shl_0_end) |
3af48cbd L |
210 | |
211 | movdqa (%eax, %edi), %xmm0 | |
212 | movdqa 16(%eax, %edi), %xmm1 | |
213 | sub $32, %ecx | |
214 | movdqa %xmm0, (%edx, %edi) | |
215 | movdqa %xmm1, 16(%edx, %edi) | |
216 | lea 32(%edi), %edi | |
217 | L(shl_0_end): | |
218 | lea 32(%ecx), %ecx | |
219 | add %ecx, %edi | |
220 | add %edi, %edx | |
221 | add %edi, %eax | |
222 | POP (%esi) | |
223 | POP (%edi) | |
224 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) | |
225 | ||
3093e0c7 L |
226 | cfi_restore_state |
227 | cfi_remember_state | |
3af48cbd L |
228 | L(shl_0_gobble): /* Source and destination are 16-byte aligned and ECX > 127: choose between the cached loop and the large-copy path. */ | |
229 | ||
230 | #ifdef DATA_CACHE_SIZE_HALF | |
231 | mov $DATA_CACHE_SIZE_HALF, %edi /* Load the constant into EDI so the threshold below is derived from it, exactly as in the #else branch. A bare cmp here would be overwritten by the sub/cmp that follow and leave EDI stale. */ | |
232 | #else | |
dfc93c41 | 233 | # ifdef PIC |
9a1d9254 | 234 | SETUP_PIC_REG(bx) |
3af48cbd L |
235 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
236 | mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi | |
237 | # else | |
238 | mov __x86_data_cache_size_half, %edi | |
239 | # endif | |
240 | #endif | |
241 | mov %edi, %esi /* ESI is free as scratch here. */ | |
242 | shr $3, %esi /* ESI = size / 8. */ | |
243 | sub %esi, %edi /* EDI = size - size / 8, the switch-over threshold. */ | |
244 | cmp %edi, %ecx | |
3093e0c7 L |
245 | jae L(shl_0_gobble_mem_start) /* At or above the threshold (unsigned): take the prefetching / rep path. */ |
246 | sub $128, %ecx /* Pre-bias the count for the 128-byte loop that follows. */ | |
3af48cbd L |
247 | ALIGN (4) |
248 | L(shl_0_gobble_cache_loop): | |
249 | movdqa (%eax), %xmm0 | |
250 | movaps 0x10(%eax), %xmm1 | |
251 | movaps 0x20(%eax), %xmm2 | |
252 | movaps 0x30(%eax), %xmm3 | |
253 | movaps 0x40(%eax), %xmm4 | |
254 | movaps 0x50(%eax), %xmm5 | |
255 | movaps 0x60(%eax), %xmm6 | |
256 | movaps 0x70(%eax), %xmm7 | |
257 | lea 0x80(%eax), %eax | |
258 | sub $128, %ecx | |
259 | movdqa %xmm0, (%edx) | |
260 | movaps %xmm1, 0x10(%edx) | |
261 | movaps %xmm2, 0x20(%edx) | |
262 | movaps %xmm3, 0x30(%edx) | |
263 | movaps %xmm4, 0x40(%edx) | |
264 | movaps %xmm5, 0x50(%edx) | |
265 | movaps %xmm6, 0x60(%edx) | |
266 | movaps %xmm7, 0x70(%edx) | |
267 | lea 0x80(%edx), %edx | |
268 | ||
3093e0c7 L |
269 | jae L(shl_0_gobble_cache_loop) |
270 | add $0x80, %ecx | |
271 | cmp $0x40, %ecx | |
272 | jb L(shl_0_cache_less_64bytes) | |
3af48cbd L |
273 | |
274 | movdqa (%eax), %xmm0 | |
275 | sub $0x40, %ecx | |
276 | movdqa 0x10(%eax), %xmm1 | |
277 | ||
278 | movdqa %xmm0, (%edx) | |
279 | movdqa %xmm1, 0x10(%edx) | |
280 | ||
281 | movdqa 0x20(%eax), %xmm0 | |
282 | movdqa 0x30(%eax), %xmm1 | |
283 | add $0x40, %eax | |
284 | ||
285 | movdqa %xmm0, 0x20(%edx) | |
286 | movdqa %xmm1, 0x30(%edx) | |
287 | add $0x40, %edx | |
288 | L(shl_0_cache_less_64bytes): | |
289 | cmp $0x20, %ecx | |
3093e0c7 | 290 | jb L(shl_0_cache_less_32bytes) |
3af48cbd L |
291 | movdqa (%eax), %xmm0 |
292 | sub $0x20, %ecx | |
293 | movdqa 0x10(%eax), %xmm1 | |
294 | add $0x20, %eax | |
295 | movdqa %xmm0, (%edx) | |
296 | movdqa %xmm1, 0x10(%edx) | |
297 | add $0x20, %edx | |
298 | L(shl_0_cache_less_32bytes): | |
299 | cmp $0x10, %ecx | |
3093e0c7 | 300 | jb L(shl_0_cache_less_16bytes) |
3af48cbd L |
301 | sub $0x10, %ecx |
302 | movdqa (%eax), %xmm0 | |
303 | add $0x10, %eax | |
304 | movdqa %xmm0, (%edx) | |
305 | add $0x10, %edx | |
306 | L(shl_0_cache_less_16bytes): | |
307 | add %ecx, %edx | |
308 | add %ecx, %eax | |
309 | POP (%esi) | |
310 | POP (%edi) | |
311 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) | |
312 | ||
3093e0c7 L |
313 | cfi_restore_state |
314 | cfi_remember_state | |
3af48cbd L |
315 | ALIGN (4) |
316 | L(shl_0_gobble_mem_start): | |
317 | cmp %al, %dl | |
318 | je L(copy_page_by_rep) | |
3093e0c7 | 319 | sub $128, %ecx |
3af48cbd L |
320 | L(shl_0_gobble_mem_loop): |
321 | prefetchnta 0x1c0(%eax) | |
322 | prefetchnta 0x280(%eax) | |
323 | prefetchnta 0x1c0(%edx) | |
324 | prefetchnta 0x280(%edx) | |
325 | ||
326 | movdqa (%eax), %xmm0 | |
327 | movaps 0x10(%eax), %xmm1 | |
328 | movaps 0x20(%eax), %xmm2 | |
329 | movaps 0x30(%eax), %xmm3 | |
330 | movaps 0x40(%eax), %xmm4 | |
331 | movaps 0x50(%eax), %xmm5 | |
332 | movaps 0x60(%eax), %xmm6 | |
333 | movaps 0x70(%eax), %xmm7 | |
334 | lea 0x80(%eax), %eax | |
335 | sub $0x80, %ecx | |
336 | movdqa %xmm0, (%edx) | |
337 | movaps %xmm1, 0x10(%edx) | |
338 | movaps %xmm2, 0x20(%edx) | |
339 | movaps %xmm3, 0x30(%edx) | |
340 | movaps %xmm4, 0x40(%edx) | |
341 | movaps %xmm5, 0x50(%edx) | |
342 | movaps %xmm6, 0x60(%edx) | |
343 | movaps %xmm7, 0x70(%edx) | |
344 | lea 0x80(%edx), %edx | |
345 | ||
3093e0c7 L |
346 | jae L(shl_0_gobble_mem_loop) |
347 | add $0x80, %ecx | |
348 | cmp $0x40, %ecx | |
349 | jb L(shl_0_mem_less_64bytes) | |
3af48cbd L |
350 | |
351 | movdqa (%eax), %xmm0 | |
352 | sub $0x40, %ecx | |
353 | movdqa 0x10(%eax), %xmm1 | |
354 | ||
355 | movdqa %xmm0, (%edx) | |
356 | movdqa %xmm1, 0x10(%edx) | |
357 | ||
358 | movdqa 0x20(%eax), %xmm0 | |
359 | movdqa 0x30(%eax), %xmm1 | |
360 | add $0x40, %eax | |
361 | ||
362 | movdqa %xmm0, 0x20(%edx) | |
363 | movdqa %xmm1, 0x30(%edx) | |
364 | add $0x40, %edx | |
365 | L(shl_0_mem_less_64bytes): | |
366 | cmp $0x20, %ecx | |
3093e0c7 | 367 | jb L(shl_0_mem_less_32bytes) |
3af48cbd L |
368 | movdqa (%eax), %xmm0 |
369 | sub $0x20, %ecx | |
370 | movdqa 0x10(%eax), %xmm1 | |
371 | add $0x20, %eax | |
372 | movdqa %xmm0, (%edx) | |
373 | movdqa %xmm1, 0x10(%edx) | |
374 | add $0x20, %edx | |
375 | L(shl_0_mem_less_32bytes): | |
376 | cmp $0x10, %ecx | |
3093e0c7 | 377 | jb L(shl_0_mem_less_16bytes) |
3af48cbd L |
378 | sub $0x10, %ecx |
379 | movdqa (%eax), %xmm0 | |
380 | add $0x10, %eax | |
381 | movdqa %xmm0, (%edx) | |
382 | add $0x10, %edx | |
383 | L(shl_0_mem_less_16bytes): | |
384 | add %ecx, %edx | |
385 | add %ecx, %eax | |
386 | POP (%esi) | |
387 | POP (%edi) | |
388 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) | |
389 | ||
3093e0c7 L |
390 | cfi_restore_state |
391 | cfi_remember_state | |
3af48cbd L |
392 | ALIGN (4) |
393 | L(shl_1): | |
394 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 395 | sub $1, %eax |
3af48cbd L |
396 | movaps (%eax), %xmm1 |
397 | xor %edi, %edi | |
3093e0c7 | 398 | sub $32, %ecx |
3af48cbd L |
399 | movdqu %xmm0, (%esi) |
400 | POP (%esi) | |
401 | L(shl_1_loop): | |
402 | ||
403 | movdqa 16(%eax, %edi), %xmm2 | |
404 | sub $32, %ecx | |
405 | movdqa 32(%eax, %edi), %xmm3 | |
406 | movdqa %xmm3, %xmm4 | |
407 | palignr $1, %xmm2, %xmm3 | |
408 | palignr $1, %xmm1, %xmm2 | |
409 | lea 32(%edi), %edi | |
410 | movdqa %xmm2, -32(%edx, %edi) | |
411 | movdqa %xmm3, -16(%edx, %edi) | |
412 | ||
3093e0c7 | 413 | jb L(shl_1_end) |
3af48cbd L |
414 | |
415 | movdqa 16(%eax, %edi), %xmm2 | |
416 | sub $32, %ecx | |
417 | movdqa 32(%eax, %edi), %xmm3 | |
418 | movdqa %xmm3, %xmm1 | |
419 | palignr $1, %xmm2, %xmm3 | |
420 | palignr $1, %xmm4, %xmm2 | |
421 | lea 32(%edi), %edi | |
422 | movdqa %xmm2, -32(%edx, %edi) | |
423 | movdqa %xmm3, -16(%edx, %edi) | |
424 | ||
425 | jae L(shl_1_loop) | |
426 | ||
427 | L(shl_1_end): | |
3093e0c7 | 428 | add $32, %ecx |
3af48cbd L |
429 | add %ecx, %edi |
430 | add %edi, %edx | |
431 | lea 1(%edi, %eax), %eax | |
432 | POP (%edi) | |
433 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
434 | ||
3093e0c7 L |
435 | cfi_restore_state |
436 | cfi_remember_state | |
3af48cbd L |
437 | ALIGN (4) |
438 | L(shl_2): | |
439 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 440 | sub $2, %eax |
3af48cbd L |
441 | movaps (%eax), %xmm1 |
442 | xor %edi, %edi | |
3093e0c7 | 443 | sub $32, %ecx |
3af48cbd L |
444 | movdqu %xmm0, (%esi) |
445 | POP (%esi) | |
446 | L(shl_2_loop): | |
447 | ||
448 | movdqa 16(%eax, %edi), %xmm2 | |
449 | sub $32, %ecx | |
450 | movdqa 32(%eax, %edi), %xmm3 | |
451 | movdqa %xmm3, %xmm4 | |
452 | palignr $2, %xmm2, %xmm3 | |
453 | palignr $2, %xmm1, %xmm2 | |
454 | lea 32(%edi), %edi | |
455 | movdqa %xmm2, -32(%edx, %edi) | |
456 | movdqa %xmm3, -16(%edx, %edi) | |
457 | ||
3093e0c7 | 458 | jb L(shl_2_end) |
3af48cbd L |
459 | |
460 | movdqa 16(%eax, %edi), %xmm2 | |
461 | sub $32, %ecx | |
462 | movdqa 32(%eax, %edi), %xmm3 | |
463 | movdqa %xmm3, %xmm1 | |
464 | palignr $2, %xmm2, %xmm3 | |
465 | palignr $2, %xmm4, %xmm2 | |
466 | lea 32(%edi), %edi | |
467 | movdqa %xmm2, -32(%edx, %edi) | |
468 | movdqa %xmm3, -16(%edx, %edi) | |
469 | ||
470 | jae L(shl_2_loop) | |
471 | ||
472 | L(shl_2_end): | |
3093e0c7 | 473 | add $32, %ecx |
3af48cbd L |
474 | add %ecx, %edi |
475 | add %edi, %edx | |
476 | lea 2(%edi, %eax), %eax | |
477 | POP (%edi) | |
478 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
479 | ||
3093e0c7 L |
480 | cfi_restore_state |
481 | cfi_remember_state | |
3af48cbd L |
482 | ALIGN (4) |
483 | L(shl_3): | |
484 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 485 | sub $3, %eax |
3af48cbd L |
486 | movaps (%eax), %xmm1 |
487 | xor %edi, %edi | |
3093e0c7 | 488 | sub $32, %ecx |
3af48cbd L |
489 | movdqu %xmm0, (%esi) |
490 | POP (%esi) | |
491 | L(shl_3_loop): | |
492 | ||
493 | movdqa 16(%eax, %edi), %xmm2 | |
494 | sub $32, %ecx | |
495 | movdqa 32(%eax, %edi), %xmm3 | |
496 | movdqa %xmm3, %xmm4 | |
497 | palignr $3, %xmm2, %xmm3 | |
498 | palignr $3, %xmm1, %xmm2 | |
499 | lea 32(%edi), %edi | |
500 | movdqa %xmm2, -32(%edx, %edi) | |
501 | movdqa %xmm3, -16(%edx, %edi) | |
502 | ||
3093e0c7 | 503 | jb L(shl_3_end) |
3af48cbd L |
504 | |
505 | movdqa 16(%eax, %edi), %xmm2 | |
506 | sub $32, %ecx | |
507 | movdqa 32(%eax, %edi), %xmm3 | |
508 | movdqa %xmm3, %xmm1 | |
509 | palignr $3, %xmm2, %xmm3 | |
510 | palignr $3, %xmm4, %xmm2 | |
511 | lea 32(%edi), %edi | |
512 | movdqa %xmm2, -32(%edx, %edi) | |
513 | movdqa %xmm3, -16(%edx, %edi) | |
514 | ||
515 | jae L(shl_3_loop) | |
516 | ||
517 | L(shl_3_end): | |
3093e0c7 | 518 | add $32, %ecx |
3af48cbd L |
519 | add %ecx, %edi |
520 | add %edi, %edx | |
521 | lea 3(%edi, %eax), %eax | |
522 | POP (%edi) | |
523 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
524 | ||
3093e0c7 L |
525 | cfi_restore_state |
526 | cfi_remember_state | |
3af48cbd L |
527 | ALIGN (4) |
528 | L(shl_4): | |
529 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 530 | sub $4, %eax |
3af48cbd L |
531 | movaps (%eax), %xmm1 |
532 | xor %edi, %edi | |
3093e0c7 | 533 | sub $32, %ecx |
3af48cbd L |
534 | movdqu %xmm0, (%esi) |
535 | POP (%esi) | |
536 | L(shl_4_loop): | |
537 | ||
538 | movdqa 16(%eax, %edi), %xmm2 | |
539 | sub $32, %ecx | |
540 | movdqa 32(%eax, %edi), %xmm3 | |
541 | movdqa %xmm3, %xmm4 | |
542 | palignr $4, %xmm2, %xmm3 | |
543 | palignr $4, %xmm1, %xmm2 | |
544 | lea 32(%edi), %edi | |
545 | movdqa %xmm2, -32(%edx, %edi) | |
546 | movdqa %xmm3, -16(%edx, %edi) | |
547 | ||
3093e0c7 | 548 | jb L(shl_4_end) |
3af48cbd L |
549 | |
550 | movdqa 16(%eax, %edi), %xmm2 | |
551 | sub $32, %ecx | |
552 | movdqa 32(%eax, %edi), %xmm3 | |
553 | movdqa %xmm3, %xmm1 | |
554 | palignr $4, %xmm2, %xmm3 | |
555 | palignr $4, %xmm4, %xmm2 | |
556 | lea 32(%edi), %edi | |
557 | movdqa %xmm2, -32(%edx, %edi) | |
558 | movdqa %xmm3, -16(%edx, %edi) | |
559 | ||
560 | jae L(shl_4_loop) | |
561 | ||
562 | L(shl_4_end): | |
3093e0c7 | 563 | add $32, %ecx |
3af48cbd L |
564 | add %ecx, %edi |
565 | add %edi, %edx | |
566 | lea 4(%edi, %eax), %eax | |
567 | POP (%edi) | |
568 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
569 | ||
3093e0c7 L |
570 | cfi_restore_state |
571 | cfi_remember_state | |
3af48cbd L |
572 | ALIGN (4) |
573 | L(shl_5): | |
574 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 575 | sub $5, %eax |
3af48cbd L |
576 | movaps (%eax), %xmm1 |
577 | xor %edi, %edi | |
3093e0c7 | 578 | sub $32, %ecx |
3af48cbd L |
579 | movdqu %xmm0, (%esi) |
580 | POP (%esi) | |
581 | L(shl_5_loop): | |
582 | ||
583 | movdqa 16(%eax, %edi), %xmm2 | |
584 | sub $32, %ecx | |
585 | movdqa 32(%eax, %edi), %xmm3 | |
586 | movdqa %xmm3, %xmm4 | |
587 | palignr $5, %xmm2, %xmm3 | |
588 | palignr $5, %xmm1, %xmm2 | |
589 | lea 32(%edi), %edi | |
590 | movdqa %xmm2, -32(%edx, %edi) | |
591 | movdqa %xmm3, -16(%edx, %edi) | |
592 | ||
3093e0c7 | 593 | jb L(shl_5_end) |
3af48cbd L |
594 | |
595 | movdqa 16(%eax, %edi), %xmm2 | |
596 | sub $32, %ecx | |
597 | movdqa 32(%eax, %edi), %xmm3 | |
598 | movdqa %xmm3, %xmm1 | |
599 | palignr $5, %xmm2, %xmm3 | |
600 | palignr $5, %xmm4, %xmm2 | |
601 | lea 32(%edi), %edi | |
602 | movdqa %xmm2, -32(%edx, %edi) | |
603 | movdqa %xmm3, -16(%edx, %edi) | |
604 | ||
605 | jae L(shl_5_loop) | |
606 | ||
607 | L(shl_5_end): | |
3093e0c7 | 608 | add $32, %ecx |
3af48cbd L |
609 | add %ecx, %edi |
610 | add %edi, %edx | |
611 | lea 5(%edi, %eax), %eax | |
612 | POP (%edi) | |
613 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
614 | ||
3093e0c7 L |
615 | cfi_restore_state |
616 | cfi_remember_state | |
3af48cbd L |
617 | ALIGN (4) |
618 | L(shl_6): | |
619 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 620 | sub $6, %eax |
3af48cbd L |
621 | movaps (%eax), %xmm1 |
622 | xor %edi, %edi | |
3093e0c7 | 623 | sub $32, %ecx |
3af48cbd L |
624 | movdqu %xmm0, (%esi) |
625 | POP (%esi) | |
626 | L(shl_6_loop): | |
627 | ||
628 | movdqa 16(%eax, %edi), %xmm2 | |
629 | sub $32, %ecx | |
630 | movdqa 32(%eax, %edi), %xmm3 | |
631 | movdqa %xmm3, %xmm4 | |
632 | palignr $6, %xmm2, %xmm3 | |
633 | palignr $6, %xmm1, %xmm2 | |
634 | lea 32(%edi), %edi | |
635 | movdqa %xmm2, -32(%edx, %edi) | |
636 | movdqa %xmm3, -16(%edx, %edi) | |
637 | ||
3093e0c7 | 638 | jb L(shl_6_end) |
3af48cbd L |
639 | |
640 | movdqa 16(%eax, %edi), %xmm2 | |
641 | sub $32, %ecx | |
642 | movdqa 32(%eax, %edi), %xmm3 | |
643 | movdqa %xmm3, %xmm1 | |
644 | palignr $6, %xmm2, %xmm3 | |
645 | palignr $6, %xmm4, %xmm2 | |
646 | lea 32(%edi), %edi | |
647 | movdqa %xmm2, -32(%edx, %edi) | |
648 | movdqa %xmm3, -16(%edx, %edi) | |
649 | ||
650 | jae L(shl_6_loop) | |
651 | ||
652 | L(shl_6_end): | |
3093e0c7 | 653 | add $32, %ecx |
3af48cbd L |
654 | add %ecx, %edi |
655 | add %edi, %edx | |
656 | lea 6(%edi, %eax), %eax | |
657 | POP (%edi) | |
658 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
659 | ||
3093e0c7 L |
660 | cfi_restore_state |
661 | cfi_remember_state | |
3af48cbd L |
662 | ALIGN (4) |
663 | L(shl_7): | |
664 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 665 | sub $7, %eax |
3af48cbd L |
666 | movaps (%eax), %xmm1 |
667 | xor %edi, %edi | |
3093e0c7 | 668 | sub $32, %ecx |
3af48cbd L |
669 | movdqu %xmm0, (%esi) |
670 | POP (%esi) | |
671 | L(shl_7_loop): | |
672 | ||
673 | movdqa 16(%eax, %edi), %xmm2 | |
674 | sub $32, %ecx | |
675 | movdqa 32(%eax, %edi), %xmm3 | |
676 | movdqa %xmm3, %xmm4 | |
677 | palignr $7, %xmm2, %xmm3 | |
678 | palignr $7, %xmm1, %xmm2 | |
679 | lea 32(%edi), %edi | |
680 | movdqa %xmm2, -32(%edx, %edi) | |
681 | movdqa %xmm3, -16(%edx, %edi) | |
682 | ||
3093e0c7 | 683 | jb L(shl_7_end) |
3af48cbd L |
684 | |
685 | movdqa 16(%eax, %edi), %xmm2 | |
686 | sub $32, %ecx | |
687 | movdqa 32(%eax, %edi), %xmm3 | |
688 | movdqa %xmm3, %xmm1 | |
689 | palignr $7, %xmm2, %xmm3 | |
690 | palignr $7, %xmm4, %xmm2 | |
691 | lea 32(%edi), %edi | |
692 | movdqa %xmm2, -32(%edx, %edi) | |
693 | movdqa %xmm3, -16(%edx, %edi) | |
694 | ||
695 | jae L(shl_7_loop) | |
696 | ||
697 | L(shl_7_end): | |
3093e0c7 | 698 | add $32, %ecx |
3af48cbd L |
699 | add %ecx, %edi |
700 | add %edi, %edx | |
701 | lea 7(%edi, %eax), %eax | |
702 | POP (%edi) | |
703 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
704 | ||
3093e0c7 L |
705 | cfi_restore_state |
706 | cfi_remember_state | |
3af48cbd L |
707 | ALIGN (4) |
708 | L(shl_8): | |
709 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 710 | sub $8, %eax |
3af48cbd L |
711 | movaps (%eax), %xmm1 |
712 | xor %edi, %edi | |
3093e0c7 | 713 | sub $32, %ecx |
3af48cbd L |
714 | movdqu %xmm0, (%esi) |
715 | POP (%esi) | |
716 | L(shl_8_loop): | |
717 | ||
718 | movdqa 16(%eax, %edi), %xmm2 | |
719 | sub $32, %ecx | |
720 | movdqa 32(%eax, %edi), %xmm3 | |
721 | movdqa %xmm3, %xmm4 | |
722 | palignr $8, %xmm2, %xmm3 | |
723 | palignr $8, %xmm1, %xmm2 | |
724 | lea 32(%edi), %edi | |
725 | movdqa %xmm2, -32(%edx, %edi) | |
726 | movdqa %xmm3, -16(%edx, %edi) | |
727 | ||
3093e0c7 | 728 | jb L(shl_8_end) |
3af48cbd L |
729 | |
730 | movdqa 16(%eax, %edi), %xmm2 | |
731 | sub $32, %ecx | |
732 | movdqa 32(%eax, %edi), %xmm3 | |
733 | movdqa %xmm3, %xmm1 | |
734 | palignr $8, %xmm2, %xmm3 | |
735 | palignr $8, %xmm4, %xmm2 | |
736 | lea 32(%edi), %edi | |
737 | movdqa %xmm2, -32(%edx, %edi) | |
738 | movdqa %xmm3, -16(%edx, %edi) | |
739 | ||
740 | jae L(shl_8_loop) | |
741 | ||
742 | L(shl_8_end): | |
3093e0c7 | 743 | add $32, %ecx |
3af48cbd L |
744 | add %ecx, %edi |
745 | add %edi, %edx | |
746 | lea 8(%edi, %eax), %eax | |
747 | POP (%edi) | |
748 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
749 | ||
3093e0c7 L |
750 | cfi_restore_state |
751 | cfi_remember_state | |
3af48cbd L |
752 | ALIGN (4) |
753 | L(shl_9): | |
754 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 755 | sub $9, %eax |
3af48cbd L |
756 | movaps (%eax), %xmm1 |
757 | xor %edi, %edi | |
3093e0c7 | 758 | sub $32, %ecx |
3af48cbd L |
759 | movdqu %xmm0, (%esi) |
760 | POP (%esi) | |
761 | L(shl_9_loop): | |
762 | ||
763 | movdqa 16(%eax, %edi), %xmm2 | |
764 | sub $32, %ecx | |
765 | movdqa 32(%eax, %edi), %xmm3 | |
766 | movdqa %xmm3, %xmm4 | |
767 | palignr $9, %xmm2, %xmm3 | |
768 | palignr $9, %xmm1, %xmm2 | |
769 | lea 32(%edi), %edi | |
770 | movdqa %xmm2, -32(%edx, %edi) | |
771 | movdqa %xmm3, -16(%edx, %edi) | |
772 | ||
3093e0c7 | 773 | jb L(shl_9_end) |
3af48cbd L |
774 | |
775 | movdqa 16(%eax, %edi), %xmm2 | |
776 | sub $32, %ecx | |
777 | movdqa 32(%eax, %edi), %xmm3 | |
778 | movdqa %xmm3, %xmm1 | |
779 | palignr $9, %xmm2, %xmm3 | |
780 | palignr $9, %xmm4, %xmm2 | |
781 | lea 32(%edi), %edi | |
782 | movdqa %xmm2, -32(%edx, %edi) | |
783 | movdqa %xmm3, -16(%edx, %edi) | |
784 | ||
785 | jae L(shl_9_loop) | |
786 | ||
787 | L(shl_9_end): | |
3093e0c7 | 788 | add $32, %ecx |
3af48cbd L |
789 | add %ecx, %edi |
790 | add %edi, %edx | |
791 | lea 9(%edi, %eax), %eax | |
792 | POP (%edi) | |
793 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
794 | ||
3093e0c7 L |
795 | cfi_restore_state |
796 | cfi_remember_state | |
3af48cbd L |
797 | ALIGN (4) |
798 | L(shl_10): | |
799 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 800 | sub $10, %eax |
3af48cbd L |
801 | movaps (%eax), %xmm1 |
802 | xor %edi, %edi | |
3093e0c7 | 803 | sub $32, %ecx |
3af48cbd L |
804 | movdqu %xmm0, (%esi) |
805 | POP (%esi) | |
806 | L(shl_10_loop): | |
807 | ||
808 | movdqa 16(%eax, %edi), %xmm2 | |
809 | sub $32, %ecx | |
810 | movdqa 32(%eax, %edi), %xmm3 | |
811 | movdqa %xmm3, %xmm4 | |
812 | palignr $10, %xmm2, %xmm3 | |
813 | palignr $10, %xmm1, %xmm2 | |
814 | lea 32(%edi), %edi | |
815 | movdqa %xmm2, -32(%edx, %edi) | |
816 | movdqa %xmm3, -16(%edx, %edi) | |
817 | ||
3093e0c7 | 818 | jb L(shl_10_end) |
3af48cbd L |
819 | |
820 | movdqa 16(%eax, %edi), %xmm2 | |
821 | sub $32, %ecx | |
822 | movdqa 32(%eax, %edi), %xmm3 | |
823 | movdqa %xmm3, %xmm1 | |
824 | palignr $10, %xmm2, %xmm3 | |
825 | palignr $10, %xmm4, %xmm2 | |
826 | lea 32(%edi), %edi | |
827 | movdqa %xmm2, -32(%edx, %edi) | |
828 | movdqa %xmm3, -16(%edx, %edi) | |
829 | ||
830 | jae L(shl_10_loop) | |
831 | ||
832 | L(shl_10_end): | |
3093e0c7 | 833 | add $32, %ecx |
3af48cbd L |
834 | add %ecx, %edi |
835 | add %edi, %edx | |
836 | lea 10(%edi, %eax), %eax | |
837 | POP (%edi) | |
838 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
839 | ||
3093e0c7 L |
840 | cfi_restore_state |
841 | cfi_remember_state | |
3af48cbd L |
842 | ALIGN (4) |
843 | L(shl_11): | |
844 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 845 | sub $11, %eax |
3af48cbd L |
846 | movaps (%eax), %xmm1 |
847 | xor %edi, %edi | |
3093e0c7 | 848 | sub $32, %ecx |
3af48cbd L |
849 | movdqu %xmm0, (%esi) |
850 | POP (%esi) | |
851 | L(shl_11_loop): | |
852 | ||
853 | movdqa 16(%eax, %edi), %xmm2 | |
854 | sub $32, %ecx | |
855 | movdqa 32(%eax, %edi), %xmm3 | |
856 | movdqa %xmm3, %xmm4 | |
857 | palignr $11, %xmm2, %xmm3 | |
858 | palignr $11, %xmm1, %xmm2 | |
859 | lea 32(%edi), %edi | |
860 | movdqa %xmm2, -32(%edx, %edi) | |
861 | movdqa %xmm3, -16(%edx, %edi) | |
862 | ||
3093e0c7 | 863 | jb L(shl_11_end) |
3af48cbd L |
864 | |
865 | movdqa 16(%eax, %edi), %xmm2 | |
866 | sub $32, %ecx | |
867 | movdqa 32(%eax, %edi), %xmm3 | |
868 | movdqa %xmm3, %xmm1 | |
869 | palignr $11, %xmm2, %xmm3 | |
870 | palignr $11, %xmm4, %xmm2 | |
871 | lea 32(%edi), %edi | |
872 | movdqa %xmm2, -32(%edx, %edi) | |
873 | movdqa %xmm3, -16(%edx, %edi) | |
874 | ||
875 | jae L(shl_11_loop) | |
876 | ||
877 | L(shl_11_end): | |
3093e0c7 | 878 | add $32, %ecx |
3af48cbd L |
879 | add %ecx, %edi |
880 | add %edi, %edx | |
881 | lea 11(%edi, %eax), %eax | |
882 | POP (%edi) | |
883 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
884 | ||
3093e0c7 L |
885 | cfi_restore_state |
886 | cfi_remember_state | |
3af48cbd L |
887 | ALIGN (4) |
888 | L(shl_12): | |
889 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 890 | sub $12, %eax |
3af48cbd L |
891 | movaps (%eax), %xmm1 |
892 | xor %edi, %edi | |
3093e0c7 | 893 | sub $32, %ecx |
3af48cbd L |
894 | movdqu %xmm0, (%esi) |
895 | POP (%esi) | |
896 | L(shl_12_loop): | |
897 | ||
898 | movdqa 16(%eax, %edi), %xmm2 | |
899 | sub $32, %ecx | |
900 | movdqa 32(%eax, %edi), %xmm3 | |
901 | movdqa %xmm3, %xmm4 | |
902 | palignr $12, %xmm2, %xmm3 | |
903 | palignr $12, %xmm1, %xmm2 | |
904 | lea 32(%edi), %edi | |
905 | movdqa %xmm2, -32(%edx, %edi) | |
906 | movdqa %xmm3, -16(%edx, %edi) | |
907 | ||
3093e0c7 | 908 | jb L(shl_12_end) |
3af48cbd L |
909 | |
910 | movdqa 16(%eax, %edi), %xmm2 | |
911 | sub $32, %ecx | |
912 | movdqa 32(%eax, %edi), %xmm3 | |
913 | movdqa %xmm3, %xmm1 | |
914 | palignr $12, %xmm2, %xmm3 | |
915 | palignr $12, %xmm4, %xmm2 | |
916 | lea 32(%edi), %edi | |
917 | movdqa %xmm2, -32(%edx, %edi) | |
918 | movdqa %xmm3, -16(%edx, %edi) | |
919 | ||
920 | jae L(shl_12_loop) | |
921 | ||
922 | L(shl_12_end): | |
3093e0c7 | 923 | add $32, %ecx |
3af48cbd L |
924 | add %ecx, %edi |
925 | add %edi, %edx | |
926 | lea 12(%edi, %eax), %eax | |
927 | POP (%edi) | |
928 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
929 | ||
3093e0c7 L |
930 | cfi_restore_state |
931 | cfi_remember_state | |
3af48cbd L |
932 | ALIGN (4) |
933 | L(shl_13): | |
934 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
3093e0c7 | 935 | sub $13, %eax |
3af48cbd L |
936 | movaps (%eax), %xmm1 |
937 | xor %edi, %edi | |
3093e0c7 | 938 | sub $32, %ecx |
3af48cbd L |
939 | movdqu %xmm0, (%esi) |
940 | POP (%esi) | |
941 | L(shl_13_loop): | |
942 | ||
943 | movdqa 16(%eax, %edi), %xmm2 | |
944 | sub $32, %ecx | |
945 | movdqa 32(%eax, %edi), %xmm3 | |
946 | movdqa %xmm3, %xmm4 | |
947 | palignr $13, %xmm2, %xmm3 | |
948 | palignr $13, %xmm1, %xmm2 | |
949 | lea 32(%edi), %edi | |
950 | movdqa %xmm2, -32(%edx, %edi) | |
951 | movdqa %xmm3, -16(%edx, %edi) | |
952 | ||
3093e0c7 | 953 | jb L(shl_13_end) |
3af48cbd L |
954 | |
955 | movdqa 16(%eax, %edi), %xmm2 | |
956 | sub $32, %ecx | |
957 | movdqa 32(%eax, %edi), %xmm3 | |
958 | movdqa %xmm3, %xmm1 | |
959 | palignr $13, %xmm2, %xmm3 | |
960 | palignr $13, %xmm4, %xmm2 | |
961 | lea 32(%edi), %edi | |
962 | movdqa %xmm2, -32(%edx, %edi) | |
963 | movdqa %xmm3, -16(%edx, %edi) | |
964 | ||
965 | jae L(shl_13_loop) | |
966 | ||
967 | L(shl_13_end): | |
3093e0c7 | 968 | add $32, %ecx |
3af48cbd L |
969 | add %ecx, %edi |
970 | add %edi, %edx | |
971 | lea 13(%edi, %eax), %eax | |
972 | POP (%edi) | |
973 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
974 | ||
3093e0c7 L |
/* L(shl_14): forward copy that reads the source with aligned 16-byte
   loads and uses palignr $14 to shift the data so that every store to
   the destination is an aligned movdqa.  Register roles as used below:
   %eax = source pointer, %edx = destination pointer, %ecx = byte count,
   %edi = running offset.  %xmm0 and %esi are prepared by code that
   precedes this path (NOTE(review): not shown in this chunk -- confirm
   what they hold).  */
975 | cfi_restore_state |
976 | cfi_remember_state | |
3af48cbd L |
977 | ALIGN (4) |
978 | L(shl_14): | |
979 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
/* Step the source back 14 bytes; the movaps below needs the resulting
   address to be 16-byte aligned.  */
3093e0c7 | 980 | sub $14, %eax |
3af48cbd L |
981 | movaps (%eax), %xmm1 |
982 | xor %edi, %edi | |
/* Bias the count by -32; L(shl_14_end) adds the 32 back.  */
3093e0c7 | 983 | sub $32, %ecx |
3af48cbd L |
984 | movdqu %xmm0, (%esi) |
985 | POP (%esi) | |
986 | L(shl_14_loop): | |
987 | ||
/* First half: %xmm1 carries the previous source chunk, and %xmm4 keeps
   a copy of the newest chunk for the second half.  */
988 | movdqa 16(%eax, %edi), %xmm2 | |
989 | sub $32, %ecx | |
990 | movdqa 32(%eax, %edi), %xmm3 | |
991 | movdqa %xmm3, %xmm4 | |
992 | palignr $14, %xmm2, %xmm3 | |
993 | palignr $14, %xmm1, %xmm2 | |
994 | lea 32(%edi), %edi | |
995 | movdqa %xmm2, -32(%edx, %edi) | |
996 | movdqa %xmm3, -16(%edx, %edi) | |
997 | ||
/* The flags tested here are still those of the sub above: the moves,
   palignr and lea in between do not write EFLAGS.  */
3093e0c7 | 998 | jb L(shl_14_end) |
3af48cbd L |
999 | |
/* Second half: same step with the carry registers swapped (%xmm4 is the
   previous chunk, %xmm1 receives the newest one).  */
1000 | movdqa 16(%eax, %edi), %xmm2 | |
1001 | sub $32, %ecx | |
1002 | movdqa 32(%eax, %edi), %xmm3 | |
1003 | movdqa %xmm3, %xmm1 | |
1004 | palignr $14, %xmm2, %xmm3 | |
1005 | palignr $14, %xmm4, %xmm2 | |
1006 | lea 32(%edi), %edi | |
1007 | movdqa %xmm2, -32(%edx, %edi) | |
1008 | movdqa %xmm3, -16(%edx, %edi) | |
1009 | ||
1010 | jae L(shl_14_loop) | |
1011 | ||
1012 | L(shl_14_end): | |
/* Undo the -32 bias, then move %edx and %eax past the remaining %ecx
   bytes, because the L(fwd_write_*) tails address the data with
   negative offsets.  The +14 puts back the 14 bytes subtracted from
   the source pointer at entry.  */
3093e0c7 | 1013 | add $32, %ecx |
3af48cbd L |
1014 | add %ecx, %edi |
1015 | add %edi, %edx | |
1016 | lea 14(%edi, %eax), %eax | |
1017 | POP (%edi) | |
/* NOTE(review): BRANCH_TO_JMPTBL_ENTRY_TAIL is defined outside this
   chunk; presumably it dispatches on %ecx through
   L(table_48bytes_fwd) -- confirm.  */
1018 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
1019 | ||
3093e0c7 L |
/* L(shl_15): forward copy that reads the source with aligned 16-byte
   loads and uses palignr $15 to shift the data so that every store to
   the destination is an aligned movdqa.  Register roles as used below:
   %eax = source pointer, %edx = destination pointer, %ecx = byte count,
   %edi = running offset.  %xmm0 and %esi are prepared by code that
   precedes this path (NOTE(review): not shown in this chunk -- confirm
   what they hold).  */
1020 | cfi_restore_state |
1021 | cfi_remember_state | |
3af48cbd L |
1022 | ALIGN (4) |
1023 | L(shl_15): | |
1024 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) | |
/* Step the source back 15 bytes; the movaps below needs the resulting
   address to be 16-byte aligned.  */
3093e0c7 | 1025 | sub $15, %eax |
3af48cbd L |
1026 | movaps (%eax), %xmm1 |
1027 | xor %edi, %edi | |
/* Bias the count by -32; L(shl_15_end) adds the 32 back.  */
3093e0c7 | 1028 | sub $32, %ecx |
3af48cbd L |
1029 | movdqu %xmm0, (%esi) |
1030 | POP (%esi) | |
1031 | L(shl_15_loop): | |
1032 | ||
/* First half: %xmm1 carries the previous source chunk, and %xmm4 keeps
   a copy of the newest chunk for the second half.  */
1033 | movdqa 16(%eax, %edi), %xmm2 | |
1034 | sub $32, %ecx | |
1035 | movdqa 32(%eax, %edi), %xmm3 | |
1036 | movdqa %xmm3, %xmm4 | |
1037 | palignr $15, %xmm2, %xmm3 | |
1038 | palignr $15, %xmm1, %xmm2 | |
1039 | lea 32(%edi), %edi | |
1040 | movdqa %xmm2, -32(%edx, %edi) | |
1041 | movdqa %xmm3, -16(%edx, %edi) | |
1042 | ||
/* The flags tested here are still those of the sub above: the moves,
   palignr and lea in between do not write EFLAGS.  */
3093e0c7 | 1043 | jb L(shl_15_end) |
3af48cbd L |
1044 | |
/* Second half: same step with the carry registers swapped (%xmm4 is the
   previous chunk, %xmm1 receives the newest one).  */
1045 | movdqa 16(%eax, %edi), %xmm2 | |
1046 | sub $32, %ecx | |
1047 | movdqa 32(%eax, %edi), %xmm3 | |
1048 | movdqa %xmm3, %xmm1 | |
1049 | palignr $15, %xmm2, %xmm3 | |
1050 | palignr $15, %xmm4, %xmm2 | |
1051 | lea 32(%edi), %edi | |
1052 | movdqa %xmm2, -32(%edx, %edi) | |
1053 | movdqa %xmm3, -16(%edx, %edi) | |
1054 | ||
1055 | jae L(shl_15_loop) | |
1056 | ||
1057 | L(shl_15_end): | |
/* Undo the -32 bias, then move %edx and %eax past the remaining %ecx
   bytes, because the L(fwd_write_*) tails address the data with
   negative offsets.  The +15 puts back the 15 bytes subtracted from
   the source pointer at entry.  */
3093e0c7 | 1058 | add $32, %ecx |
3af48cbd L |
1059 | add %ecx, %edi |
1060 | add %edi, %edx | |
1061 | lea 15(%edi, %eax), %eax | |
1062 | POP (%edi) | |
/* NOTE(review): BRANCH_TO_JMPTBL_ENTRY_TAIL is defined outside this
   chunk; presumably it dispatches on %ecx through
   L(table_48bytes_fwd) -- confirm.  */
1063 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) | |
1064 | ||
1065 | ||
/* Forward tail for remainders of 44, 40, ..., 4 and 0 bytes.  %eax and
   %edx point just past the end of the source and destination data.
   Each L(fwd_write_Nbytes) label copies the dword at offset -N and
   falls through to the label for N - 4.  %ecx is used as scratch.  */
1066 | ALIGN (4) | |
1067 | L(fwd_write_44bytes): | |
1068 | movl -44(%eax), %ecx | |
1069 | movl %ecx, -44(%edx) | |
1070 | L(fwd_write_40bytes): | |
1071 | movl -40(%eax), %ecx | |
1072 | movl %ecx, -40(%edx) | |
1073 | L(fwd_write_36bytes): | |
1074 | movl -36(%eax), %ecx | |
1075 | movl %ecx, -36(%edx) | |
1076 | L(fwd_write_32bytes): | |
1077 | movl -32(%eax), %ecx | |
1078 | movl %ecx, -32(%edx) | |
1079 | L(fwd_write_28bytes): | |
1080 | movl -28(%eax), %ecx | |
1081 | movl %ecx, -28(%edx) | |
1082 | L(fwd_write_24bytes): | |
1083 | movl -24(%eax), %ecx | |
1084 | movl %ecx, -24(%edx) | |
1085 | L(fwd_write_20bytes): | |
1086 | movl -20(%eax), %ecx | |
1087 | movl %ecx, -20(%edx) | |
1088 | L(fwd_write_16bytes): | |
1089 | movl -16(%eax), %ecx | |
1090 | movl %ecx, -16(%edx) | |
1091 | L(fwd_write_12bytes): | |
1092 | movl -12(%eax), %ecx | |
1093 | movl %ecx, -12(%edx) | |
1094 | L(fwd_write_8bytes): | |
1095 | movl -8(%eax), %ecx | |
1096 | movl %ecx, -8(%edx) | |
1097 | L(fwd_write_4bytes): | |
1098 | movl -4(%eax), %ecx | |
1099 | movl %ecx, -4(%edx) | |
1100 | L(fwd_write_0bytes): | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined; with
   USE_AS_MEMPCPY %eax gets %edx (one past the last byte written);
   otherwise DEST is reloaded from the stack.  */
1101 | #ifndef USE_AS_BCOPY | |
1102 | # ifdef USE_AS_MEMPCPY | |
1103 | movl %edx, %eax | |
1104 | # else | |
1105 | movl DEST(%esp), %eax | |
1106 | # endif | |
1107 | #endif | |
1108 | RETURN | |
1109 | ||
/* Forward tail for a 5-byte remainder: two overlapping dword copies at
   offsets -5 and -4 cover the five bytes.  Both loads are done before
   either store, and %eax is reused as scratch for the second dword
   (it is reloaded or unused afterwards).  */
1110 | ALIGN (4) | |
1111 | L(fwd_write_5bytes): | |
1112 | movl -5(%eax), %ecx | |
1113 | movl -4(%eax), %eax | |
1114 | movl %ecx, -5(%edx) | |
1115 | movl %eax, -4(%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined; with
   USE_AS_MEMPCPY %eax gets %edx (one past the last byte written);
   otherwise DEST is reloaded from the stack.  */
1116 | #ifndef USE_AS_BCOPY | |
1117 | # ifdef USE_AS_MEMPCPY | |
1118 | movl %edx, %eax | |
1119 | # else | |
1120 | movl DEST(%esp), %eax | |
1121 | # endif | |
1122 | #endif | |
1123 | RETURN | |
1124 | ||
/* Forward tail for remainders of 45, 41, ..., 13, 9 and 1 bytes.  %eax
   and %edx point just past the end of the source and destination data.
   Each label copies the dword at offset -N and falls through to the
   label for N - 4.  L(fwd_write_9bytes) copies the dwords at -9 and -5
   itself and then falls into L(fwd_write_1bytes) for the final byte.
   %ecx is used as scratch.  */
1125 | ALIGN (4) | |
1126 | L(fwd_write_45bytes): | |
1127 | movl -45(%eax), %ecx | |
1128 | movl %ecx, -45(%edx) | |
1129 | L(fwd_write_41bytes): | |
1130 | movl -41(%eax), %ecx | |
1131 | movl %ecx, -41(%edx) | |
1132 | L(fwd_write_37bytes): | |
1133 | movl -37(%eax), %ecx | |
1134 | movl %ecx, -37(%edx) | |
1135 | L(fwd_write_33bytes): | |
1136 | movl -33(%eax), %ecx | |
1137 | movl %ecx, -33(%edx) | |
1138 | L(fwd_write_29bytes): | |
1139 | movl -29(%eax), %ecx | |
1140 | movl %ecx, -29(%edx) | |
1141 | L(fwd_write_25bytes): | |
1142 | movl -25(%eax), %ecx | |
1143 | movl %ecx, -25(%edx) | |
1144 | L(fwd_write_21bytes): | |
1145 | movl -21(%eax), %ecx | |
1146 | movl %ecx, -21(%edx) | |
1147 | L(fwd_write_17bytes): | |
1148 | movl -17(%eax), %ecx | |
1149 | movl %ecx, -17(%edx) | |
1150 | L(fwd_write_13bytes): | |
1151 | movl -13(%eax), %ecx | |
1152 | movl %ecx, -13(%edx) | |
1153 | L(fwd_write_9bytes): | |
1154 | movl -9(%eax), %ecx | |
1155 | movl %ecx, -9(%edx) | |
1156 | movl -5(%eax), %ecx | |
1157 | movl %ecx, -5(%edx) | |
1158 | L(fwd_write_1bytes): | |
1159 | movzbl -1(%eax), %ecx | |
1160 | movb %cl, -1(%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined; with
   USE_AS_MEMPCPY %eax gets %edx (one past the last byte written);
   otherwise DEST is reloaded from the stack.  */
1161 | #ifndef USE_AS_BCOPY | |
1162 | # ifdef USE_AS_MEMPCPY | |
1163 | movl %edx, %eax | |
1164 | # else | |
1165 | movl DEST(%esp), %eax | |
1166 | # endif | |
1167 | #endif | |
1168 | RETURN | |
1169 | ||
/* Forward tail for remainders of 46, 42, ..., 6 and 2 bytes.  %eax and
   %edx point just past the end of the source and destination data.
   Each label copies the dword at offset -N and falls through to the
   label for N - 4; L(fwd_write_2bytes) finishes with the 16-bit word
   at -2.  %ecx is used as scratch.  */
1170 | ALIGN (4) | |
1171 | L(fwd_write_46bytes): | |
1172 | movl -46(%eax), %ecx | |
1173 | movl %ecx, -46(%edx) | |
1174 | L(fwd_write_42bytes): | |
1175 | movl -42(%eax), %ecx | |
1176 | movl %ecx, -42(%edx) | |
1177 | L(fwd_write_38bytes): | |
1178 | movl -38(%eax), %ecx | |
1179 | movl %ecx, -38(%edx) | |
1180 | L(fwd_write_34bytes): | |
1181 | movl -34(%eax), %ecx | |
1182 | movl %ecx, -34(%edx) | |
1183 | L(fwd_write_30bytes): | |
1184 | movl -30(%eax), %ecx | |
1185 | movl %ecx, -30(%edx) | |
1186 | L(fwd_write_26bytes): | |
1187 | movl -26(%eax), %ecx | |
1188 | movl %ecx, -26(%edx) | |
1189 | L(fwd_write_22bytes): | |
1190 | movl -22(%eax), %ecx | |
1191 | movl %ecx, -22(%edx) | |
1192 | L(fwd_write_18bytes): | |
1193 | movl -18(%eax), %ecx | |
1194 | movl %ecx, -18(%edx) | |
1195 | L(fwd_write_14bytes): | |
1196 | movl -14(%eax), %ecx | |
1197 | movl %ecx, -14(%edx) | |
1198 | L(fwd_write_10bytes): | |
1199 | movl -10(%eax), %ecx | |
1200 | movl %ecx, -10(%edx) | |
1201 | L(fwd_write_6bytes): | |
1202 | movl -6(%eax), %ecx | |
1203 | movl %ecx, -6(%edx) | |
1204 | L(fwd_write_2bytes): | |
1205 | movzwl -2(%eax), %ecx | |
1206 | movw %cx, -2(%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined; with
   USE_AS_MEMPCPY %eax gets %edx (one past the last byte written);
   otherwise DEST is reloaded from the stack.  */
1207 | #ifndef USE_AS_BCOPY | |
1208 | # ifdef USE_AS_MEMPCPY | |
1209 | movl %edx, %eax | |
1210 | # else | |
1211 | movl DEST(%esp), %eax | |
1212 | # endif | |
1213 | #endif | |
1214 | RETURN | |
1215 | ||
/* Forward tail for remainders of 47, 43, ..., 7 and 3 bytes.  %eax and
   %edx point just past the end of the source and destination data.
   Each label copies the dword at offset -N and falls through to the
   label for N - 4; L(fwd_write_3bytes) finishes with the word at -3
   and the byte at -1.  %ecx is used as scratch.  */
1216 | ALIGN (4) | |
1217 | L(fwd_write_47bytes): | |
1218 | movl -47(%eax), %ecx | |
1219 | movl %ecx, -47(%edx) | |
1220 | L(fwd_write_43bytes): | |
1221 | movl -43(%eax), %ecx | |
1222 | movl %ecx, -43(%edx) | |
1223 | L(fwd_write_39bytes): | |
1224 | movl -39(%eax), %ecx | |
1225 | movl %ecx, -39(%edx) | |
1226 | L(fwd_write_35bytes): | |
1227 | movl -35(%eax), %ecx | |
1228 | movl %ecx, -35(%edx) | |
1229 | L(fwd_write_31bytes): | |
1230 | movl -31(%eax), %ecx | |
1231 | movl %ecx, -31(%edx) | |
1232 | L(fwd_write_27bytes): | |
1233 | movl -27(%eax), %ecx | |
1234 | movl %ecx, -27(%edx) | |
1235 | L(fwd_write_23bytes): | |
1236 | movl -23(%eax), %ecx | |
1237 | movl %ecx, -23(%edx) | |
1238 | L(fwd_write_19bytes): | |
1239 | movl -19(%eax), %ecx | |
1240 | movl %ecx, -19(%edx) | |
1241 | L(fwd_write_15bytes): | |
1242 | movl -15(%eax), %ecx | |
1243 | movl %ecx, -15(%edx) | |
1244 | L(fwd_write_11bytes): | |
1245 | movl -11(%eax), %ecx | |
1246 | movl %ecx, -11(%edx) | |
1247 | L(fwd_write_7bytes): | |
1248 | movl -7(%eax), %ecx | |
1249 | movl %ecx, -7(%edx) | |
1250 | L(fwd_write_3bytes): | |
/* Both loads are done before the stores; %eax is reused as scratch for
   the last byte because the source pointer is not needed afterwards.  */
1251 | movzwl -3(%eax), %ecx | |
1252 | movzbl -1(%eax), %eax | |
1253 | movw %cx, -3(%edx) | |
1254 | movb %al, -1(%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined; with
   USE_AS_MEMPCPY %eax gets %edx (one past the last byte written);
   otherwise DEST is reloaded from the stack.  */
1255 | #ifndef USE_AS_BCOPY | |
1256 | # ifdef USE_AS_MEMPCPY | |
1257 | movl %edx, %eax | |
1258 | # else | |
1259 | movl DEST(%esp), %eax | |
1260 | # endif | |
1261 | #endif | |
3093e0c7 | 1262 | RETURN_END |
3af48cbd | 1263 | |
3093e0c7 L |
/* L(large_page): forward copy that writes the destination with
   non-temporal stores (movntdq) and reads the source with unaligned
   loads.  %eax = source pointer, %edx = destination pointer,
   %ecx = byte count.  %xmm0 and %esi are prepared by code that
   precedes this path (NOTE(review): not shown in this chunk -- confirm
   what they hold).  movntdq needs a 16-byte aligned address, so %edx
   is presumably already aligned on entry.  */
1264 | cfi_restore_state |
1265 | cfi_remember_state | |
3af48cbd L |
1266 | ALIGN (4) |
1267 | L(large_page): | |
1268 | movdqu (%eax), %xmm1 | |
3af48cbd L |
1269 | movdqu %xmm0, (%esi) |
1270 | movntdq %xmm1, (%edx) | |
3093e0c7 L |
1271 | add $0x10, %eax |
1272 | add $0x10, %edx | |
1273 | sub $0x10, %ecx | |
/* If the low bytes of the source and destination addresses are equal,
   copy the rest with rep movsl instead.  */
3af48cbd L |
1274 | cmp %al, %dl |
1275 | je L(copy_page_by_rep) | |
1276 | L(large_page_loop_init): | |
/* %esi and %edi are not used by the loop, so restore them now.  The
   count is biased by -0x80; the add after the loop removes the bias.  */
1277 | POP (%esi) | |
3093e0c7 | 1278 | sub $0x80, %ecx |
3af48cbd L |
1279 | POP (%edi) |
1280 | L(large_page_loop): | |
/* Main loop: 128 bytes per iteration, eight unaligned loads followed
   by eight non-temporal stores.  */
1281 | prefetchnta 0x1c0(%eax) | |
1282 | prefetchnta 0x280(%eax) | |
1283 | movdqu (%eax), %xmm0 | |
1284 | movdqu 0x10(%eax), %xmm1 | |
1285 | movdqu 0x20(%eax), %xmm2 | |
1286 | movdqu 0x30(%eax), %xmm3 | |
1287 | movdqu 0x40(%eax), %xmm4 | |
1288 | movdqu 0x50(%eax), %xmm5 | |
1289 | movdqu 0x60(%eax), %xmm6 | |
1290 | movdqu 0x70(%eax), %xmm7 | |
1291 | lea 0x80(%eax), %eax | |
f9a97dda | 1292 | lfence |
3af48cbd L |
1293 | sub $0x80, %ecx |
1294 | movntdq %xmm0, (%edx) | |
1295 | movntdq %xmm1, 0x10(%edx) | |
1296 | movntdq %xmm2, 0x20(%edx) | |
1297 | movntdq %xmm3, 0x30(%edx) | |
1298 | movntdq %xmm4, 0x40(%edx) | |
1299 | movntdq %xmm5, 0x50(%edx) | |
1300 | movntdq %xmm6, 0x60(%edx) | |
1301 | movntdq %xmm7, 0x70(%edx) | |
1302 | lea 0x80(%edx), %edx | |
/* Tests the flags of the sub $0x80 above; movntdq and lea do not write
   EFLAGS.  */
1303 | jae L(large_page_loop) | |
3093e0c7 L |
1304 | add $0x80, %ecx |
/* Fewer than 128 bytes remain: copy one 64-byte group if possible.  */
1305 | cmp $0x40, %ecx | |
1306 | jb L(large_page_less_64bytes) | |
3af48cbd L |
1307 | |
1308 | movdqu (%eax), %xmm0 | |
1309 | movdqu 0x10(%eax), %xmm1 | |
1310 | movdqu 0x20(%eax), %xmm2 | |
1311 | movdqu 0x30(%eax), %xmm3 | |
1312 | lea 0x40(%eax), %eax | |
1313 | ||
1314 | movntdq %xmm0, (%edx) | |
1315 | movntdq %xmm1, 0x10(%edx) | |
1316 | movntdq %xmm2, 0x20(%edx) | |
1317 | movntdq %xmm3, 0x30(%edx) | |
1318 | lea 0x40(%edx), %edx | |
1319 | sub $0x40, %ecx | |
1320 | L(large_page_less_64bytes): | |
/* Fewer than 64 bytes remain: copy one 32-byte group if possible.  */
1321 | cmp $32, %ecx | |
3093e0c7 | 1322 | jb L(large_page_less_32bytes) |
3af48cbd L |
1323 | movdqu (%eax), %xmm0 |
1324 | movdqu 0x10(%eax), %xmm1 | |
1325 | lea 0x20(%eax), %eax | |
1326 | movntdq %xmm0, (%edx) | |
1327 | movntdq %xmm1, 0x10(%edx) | |
1328 | lea 0x20(%edx), %edx | |
1329 | sub $0x20, %ecx | |
1330 | L(large_page_less_32bytes): | |
/* Move both pointers past the remaining %ecx bytes, because the
   L(fwd_write_*) tails use negative offsets.  The sfence is issued
   after the last non-temporal store and before the tail's ordinary
   stores.  NOTE(review): BRANCH_TO_JMPTBL_ENTRY is defined outside
   this chunk; presumably it dispatches on %ecx through
   L(table_48bytes_fwd) -- confirm.  */
1331 | add %ecx, %edx | |
1332 | add %ecx, %eax | |
1333 | sfence | |
1334 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) | |
1335 | ||
3093e0c7 L |
/* L(copy_page_by_rep): copy %ecx bytes from %eax to %edx with
   rep movsl, then finish the 0-3 leftover bytes by hand.  Reached from
   L(large_page) with %esi and %edi still saved on the stack; they are
   popped at L(copy_page_by_rep_exit).  */
1336 | cfi_restore_state |
1337 | cfi_remember_state | |
3af48cbd L |
1338 | ALIGN (4) |
1339 | L(copy_page_by_rep): | |
1340 | mov %eax, %esi | |
1341 | mov %edx, %edi | |
/* %ecx = number of dwords, %edx = leftover byte count (0..3).  */
1342 | mov %ecx, %edx | |
1343 | shr $2, %ecx | |
1344 | and $3, %edx | |
1345 | rep movsl | |
/* rep movsl does not write EFLAGS, so this jz still tests the result
   of the and $3 above (no leftover bytes).  */
1346 | jz L(copy_page_by_rep_exit) | |
1347 | cmp $2, %edx | |
1348 | jb L(copy_page_by_rep_left_1) | |
1349 | movzwl (%esi), %eax | |
1350 | movw %ax, (%edi) | |
f9a97dda UD |
1351 | add $2, %esi |
1352 | add $2, %edi | |
3af48cbd L |
1353 | sub $2, %edx |
1354 | jz L(copy_page_by_rep_exit) | |
1355 | L(copy_page_by_rep_left_1): | |
1356 | movzbl (%esi), %eax | |
1357 | movb %al, (%edi) | |
1358 | L(copy_page_by_rep_exit): | |
1359 | POP (%esi) | |
1360 | POP (%edi) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined;
   otherwise DEST is reloaded from the stack, plus LEN when
   USE_AS_MEMPCPY is defined.  */
1361 | #ifndef USE_AS_BCOPY | |
1362 | movl DEST(%esp), %eax | |
1363 | # ifdef USE_AS_MEMPCPY | |
1364 | movl LEN(%esp), %ecx | |
1365 | add %ecx, %eax | |
1366 | # endif | |
1367 | #endif | |
1368 | RETURN | |
1369 | ||
/* Backward tail for remainders of 44, 40, ..., 4 and 0 bytes.  %eax and
   %edx point at the first byte of the remaining source and destination
   data.  Each L(bk_write_Nbytes) label copies the dword at offset N - 4
   (highest address first) and falls through to the label for N - 4.
   %ecx is used as scratch.  */
1370 | ALIGN (4) | |
1371 | L(bk_write_44bytes): | |
1372 | movl 40(%eax), %ecx | |
1373 | movl %ecx, 40(%edx) | |
1374 | L(bk_write_40bytes): | |
1375 | movl 36(%eax), %ecx | |
1376 | movl %ecx, 36(%edx) | |
1377 | L(bk_write_36bytes): | |
1378 | movl 32(%eax), %ecx | |
1379 | movl %ecx, 32(%edx) | |
1380 | L(bk_write_32bytes): | |
1381 | movl 28(%eax), %ecx | |
1382 | movl %ecx, 28(%edx) | |
1383 | L(bk_write_28bytes): | |
1384 | movl 24(%eax), %ecx | |
1385 | movl %ecx, 24(%edx) | |
1386 | L(bk_write_24bytes): | |
1387 | movl 20(%eax), %ecx | |
1388 | movl %ecx, 20(%edx) | |
1389 | L(bk_write_20bytes): | |
1390 | movl 16(%eax), %ecx | |
1391 | movl %ecx, 16(%edx) | |
1392 | L(bk_write_16bytes): | |
1393 | movl 12(%eax), %ecx | |
1394 | movl %ecx, 12(%edx) | |
1395 | L(bk_write_12bytes): | |
1396 | movl 8(%eax), %ecx | |
1397 | movl %ecx, 8(%edx) | |
1398 | L(bk_write_8bytes): | |
1399 | movl 4(%eax), %ecx | |
1400 | movl %ecx, 4(%edx) | |
1401 | L(bk_write_4bytes): | |
1402 | movl (%eax), %ecx | |
1403 | movl %ecx, (%edx) | |
1404 | L(bk_write_0bytes): | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined;
   otherwise DEST is reloaded from the stack, plus LEN when
   USE_AS_MEMPCPY is defined.  */
1405 | #ifndef USE_AS_BCOPY | |
1406 | movl DEST(%esp), %eax | |
1407 | # ifdef USE_AS_MEMPCPY | |
1408 | movl LEN(%esp), %ecx | |
1409 | add %ecx, %eax | |
1410 | # endif | |
1411 | #endif | |
1412 | RETURN | |
1413 | ||
/* Backward tail for remainders of 45, 41, ..., 5 and 1 bytes.  %eax and
   %edx point at the first byte of the remaining source and destination
   data.  Each label copies the dword at offset N - 4 (highest address
   first) and falls through to the label for N - 4;
   L(bk_write_1bytes) finishes with the byte at offset 0.  %ecx is used
   as scratch.  */
1414 | ALIGN (4) | |
1415 | L(bk_write_45bytes): | |
1416 | movl 41(%eax), %ecx | |
1417 | movl %ecx, 41(%edx) | |
1418 | L(bk_write_41bytes): | |
1419 | movl 37(%eax), %ecx | |
1420 | movl %ecx, 37(%edx) | |
1421 | L(bk_write_37bytes): | |
1422 | movl 33(%eax), %ecx | |
1423 | movl %ecx, 33(%edx) | |
1424 | L(bk_write_33bytes): | |
1425 | movl 29(%eax), %ecx | |
1426 | movl %ecx, 29(%edx) | |
1427 | L(bk_write_29bytes): | |
1428 | movl 25(%eax), %ecx | |
1429 | movl %ecx, 25(%edx) | |
1430 | L(bk_write_25bytes): | |
1431 | movl 21(%eax), %ecx | |
1432 | movl %ecx, 21(%edx) | |
1433 | L(bk_write_21bytes): | |
1434 | movl 17(%eax), %ecx | |
1435 | movl %ecx, 17(%edx) | |
1436 | L(bk_write_17bytes): | |
1437 | movl 13(%eax), %ecx | |
1438 | movl %ecx, 13(%edx) | |
1439 | L(bk_write_13bytes): | |
1440 | movl 9(%eax), %ecx | |
1441 | movl %ecx, 9(%edx) | |
1442 | L(bk_write_9bytes): | |
1443 | movl 5(%eax), %ecx | |
1444 | movl %ecx, 5(%edx) | |
1445 | L(bk_write_5bytes): | |
1446 | movl 1(%eax), %ecx | |
1447 | movl %ecx, 1(%edx) | |
1448 | L(bk_write_1bytes): | |
1449 | movzbl (%eax), %ecx | |
1450 | movb %cl, (%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined;
   otherwise DEST is reloaded from the stack, plus LEN when
   USE_AS_MEMPCPY is defined.  */
1451 | #ifndef USE_AS_BCOPY | |
1452 | movl DEST(%esp), %eax | |
1453 | # ifdef USE_AS_MEMPCPY | |
1454 | movl LEN(%esp), %ecx | |
1455 | add %ecx, %eax | |
1456 | # endif | |
1457 | #endif | |
1458 | RETURN | |
1459 | ||
/* Backward tail for remainders of 46, 42, ..., 6 and 2 bytes.  %eax and
   %edx point at the first byte of the remaining source and destination
   data.  Each label copies the dword at offset N - 4 (highest address
   first) and falls through to the label for N - 4;
   L(bk_write_2bytes) finishes with the 16-bit word at offset 0.  %ecx
   is used as scratch.  */
1460 | ALIGN (4) | |
1461 | L(bk_write_46bytes): | |
1462 | movl 42(%eax), %ecx | |
1463 | movl %ecx, 42(%edx) | |
1464 | L(bk_write_42bytes): | |
1465 | movl 38(%eax), %ecx | |
1466 | movl %ecx, 38(%edx) | |
1467 | L(bk_write_38bytes): | |
1468 | movl 34(%eax), %ecx | |
1469 | movl %ecx, 34(%edx) | |
1470 | L(bk_write_34bytes): | |
1471 | movl 30(%eax), %ecx | |
1472 | movl %ecx, 30(%edx) | |
1473 | L(bk_write_30bytes): | |
1474 | movl 26(%eax), %ecx | |
1475 | movl %ecx, 26(%edx) | |
1476 | L(bk_write_26bytes): | |
1477 | movl 22(%eax), %ecx | |
1478 | movl %ecx, 22(%edx) | |
1479 | L(bk_write_22bytes): | |
1480 | movl 18(%eax), %ecx | |
1481 | movl %ecx, 18(%edx) | |
1482 | L(bk_write_18bytes): | |
1483 | movl 14(%eax), %ecx | |
1484 | movl %ecx, 14(%edx) | |
1485 | L(bk_write_14bytes): | |
1486 | movl 10(%eax), %ecx | |
1487 | movl %ecx, 10(%edx) | |
1488 | L(bk_write_10bytes): | |
1489 | movl 6(%eax), %ecx | |
1490 | movl %ecx, 6(%edx) | |
1491 | L(bk_write_6bytes): | |
1492 | movl 2(%eax), %ecx | |
1493 | movl %ecx, 2(%edx) | |
1494 | L(bk_write_2bytes): | |
1495 | movzwl (%eax), %ecx | |
1496 | movw %cx, (%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined;
   otherwise DEST is reloaded from the stack, plus LEN when
   USE_AS_MEMPCPY is defined.  */
1497 | #ifndef USE_AS_BCOPY | |
1498 | movl DEST(%esp), %eax | |
1499 | # ifdef USE_AS_MEMPCPY | |
1500 | movl LEN(%esp), %ecx | |
1501 | add %ecx, %eax | |
1502 | # endif | |
1503 | #endif | |
1504 | RETURN | |
1505 | ||
/* Backward tail for remainders of 47, 43, ..., 7 and 3 bytes.  %eax and
   %edx point at the first byte of the remaining source and destination
   data.  Each label copies the dword at offset N - 4 (highest address
   first) and falls through to the label for N - 4;
   L(bk_write_3bytes) finishes with the word at offset 1 and the byte
   at offset 0.  %ecx is used as scratch.  */
1506 | ALIGN (4) | |
1507 | L(bk_write_47bytes): | |
1508 | movl 43(%eax), %ecx | |
1509 | movl %ecx, 43(%edx) | |
1510 | L(bk_write_43bytes): | |
1511 | movl 39(%eax), %ecx | |
1512 | movl %ecx, 39(%edx) | |
1513 | L(bk_write_39bytes): | |
1514 | movl 35(%eax), %ecx | |
1515 | movl %ecx, 35(%edx) | |
1516 | L(bk_write_35bytes): | |
1517 | movl 31(%eax), %ecx | |
1518 | movl %ecx, 31(%edx) | |
1519 | L(bk_write_31bytes): | |
1520 | movl 27(%eax), %ecx | |
1521 | movl %ecx, 27(%edx) | |
1522 | L(bk_write_27bytes): | |
1523 | movl 23(%eax), %ecx | |
1524 | movl %ecx, 23(%edx) | |
1525 | L(bk_write_23bytes): | |
1526 | movl 19(%eax), %ecx | |
1527 | movl %ecx, 19(%edx) | |
1528 | L(bk_write_19bytes): | |
1529 | movl 15(%eax), %ecx | |
1530 | movl %ecx, 15(%edx) | |
1531 | L(bk_write_15bytes): | |
1532 | movl 11(%eax), %ecx | |
1533 | movl %ecx, 11(%edx) | |
1534 | L(bk_write_11bytes): | |
1535 | movl 7(%eax), %ecx | |
1536 | movl %ecx, 7(%edx) | |
1537 | L(bk_write_7bytes): | |
1538 | movl 3(%eax), %ecx | |
1539 | movl %ecx, 3(%edx) | |
1540 | L(bk_write_3bytes): | |
/* The word at offset 1 is copied first; %eax is then reused as scratch
   for the byte at offset 0, its last use as the source pointer.  */
1541 | movzwl 1(%eax), %ecx | |
1542 | movw %cx, 1(%edx) | |
1543 | movzbl (%eax), %eax | |
1544 | movb %al, (%edx) | |
/* Return value: nothing is loaded when USE_AS_BCOPY is defined;
   otherwise DEST is reloaded from the stack, plus LEN when
   USE_AS_MEMPCPY is defined.  */
1545 | #ifndef USE_AS_BCOPY | |
1546 | movl DEST(%esp), %eax | |
1547 | # ifdef USE_AS_MEMPCPY | |
1548 | movl LEN(%esp), %ecx | |
1549 | add %ecx, %eax | |
1550 | # endif | |
1551 | #endif | |
1552 | RETURN_END | |
1553 | ||
1554 | ||
1555 | .pushsection .rodata.ssse3,"a",@progbits | |
/* Dispatch table for the forward tails: entry N refers to
   L(fwd_write_Nbytes) for N = 0..47, so the table is indexed by the
   number of bytes still to copy.  NOTE(review): JMPTBL is defined
   outside this chunk; presumably each entry is stored relative to the
   table label given as its second argument -- confirm.  */
1556 | ALIGN (2) | |
1557 | L(table_48bytes_fwd): | |
1558 | .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) | |
1559 | .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) | |
1560 | .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) | |
1561 | .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) | |
1562 | .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) | |
1563 | .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) | |
1564 | .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) | |
1565 | .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) | |
1566 | .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) | |
1567 | .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) | |
1568 | .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) | |
1569 | .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) | |
1570 | .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) | |
1571 | .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) | |
1572 | .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) | |
1573 | .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) | |
1574 | .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) | |
1575 | .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) | |
1576 | .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) | |
1577 | .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) | |
1578 | .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) | |
1579 | .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) | |
1580 | .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) | |
1581 | .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) | |
1582 | .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) | |
1583 | .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) | |
1584 | .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) | |
1585 | .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) | |
1586 | .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) | |
1587 | .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) | |
1588 | .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) | |
1589 | .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) | |
1590 | .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) | |
1591 | .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) | |
1592 | .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) | |
1593 | .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) | |
1594 | .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) | |
1595 | .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) | |
1596 | .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) | |
1597 | .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) | |
1598 | .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) | |
1599 | .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) | |
1600 | .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) | |
1601 | .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) | |
1602 | .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) | |
1603 | .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) | |
1604 | .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) | |
1605 | .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) | |
1606 | ||
/* Dispatch table for the L(shl_N) copy paths: entry N refers to
   L(shl_N) for N = 0..15.  NOTE(review): the code that indexes this
   table is outside this chunk; presumably N is the source
   misalignment relative to a 16-byte boundary -- confirm.  */
1607 | ALIGN (2) | |
1608 | L(shl_table): | |
1609 | .int JMPTBL (L(shl_0), L(shl_table)) | |
1610 | .int JMPTBL (L(shl_1), L(shl_table)) | |
1611 | .int JMPTBL (L(shl_2), L(shl_table)) | |
1612 | .int JMPTBL (L(shl_3), L(shl_table)) | |
1613 | .int JMPTBL (L(shl_4), L(shl_table)) | |
1614 | .int JMPTBL (L(shl_5), L(shl_table)) | |
1615 | .int JMPTBL (L(shl_6), L(shl_table)) | |
1616 | .int JMPTBL (L(shl_7), L(shl_table)) | |
1617 | .int JMPTBL (L(shl_8), L(shl_table)) | |
1618 | .int JMPTBL (L(shl_9), L(shl_table)) | |
1619 | .int JMPTBL (L(shl_10), L(shl_table)) | |
1620 | .int JMPTBL (L(shl_11), L(shl_table)) | |
1621 | .int JMPTBL (L(shl_12), L(shl_table)) | |
1622 | .int JMPTBL (L(shl_13), L(shl_table)) | |
1623 | .int JMPTBL (L(shl_14), L(shl_table)) | |
1624 | .int JMPTBL (L(shl_15), L(shl_table)) | |
1625 | ||
/* Dispatch table for the backward tails: entry N refers to
   L(bk_write_Nbytes) for N = 0..47, so the table is indexed by the
   number of bytes still to copy.  NOTE(review): JMPTBL is defined
   outside this chunk; presumably each entry is stored relative to the
   table label given as its second argument -- confirm.  */
1626 | ALIGN (2) | |
1627 | L(table_48_bytes_bwd): | |
1628 | .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) | |
1629 | .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) | |
1630 | .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) | |
1631 | .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) | |
1632 | .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) | |
1633 | .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) | |
1634 | .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) | |
1635 | .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) | |
1636 | .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) | |
1637 | .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) | |
1638 | .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) | |
1639 | .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) | |
1640 | .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) | |
1641 | .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) | |
1642 | .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) | |
1643 | .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) | |
1644 | .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) | |
1645 | .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) | |
1646 | .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) | |
1647 | .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) | |
1648 | .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) | |
1649 | .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) | |
1650 | .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) | |
1651 | .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) | |
1652 | .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) | |
1653 | .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) | |
1654 | .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) | |
1655 | .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) | |
1656 | .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) | |
1657 | .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) | |
1658 | .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) | |
1659 | .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) | |
1660 | .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) | |
1661 | .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) | |
1662 | .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) | |
1663 | .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) | |
1664 | .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) | |
1665 | .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) | |
1666 | .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) | |
1667 | .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) | |
1668 | .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) | |
1669 | .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) | |
1670 | .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) | |
1671 | .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) | |
1672 | .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) | |
1673 | .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) | |
1674 | .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) | |
1675 | .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) | |
1676 | ||
1677 | .popsection | |
1678 | ||
1679 | #ifdef USE_AS_MEMMOVE | |
/* L(copy_backward): copy from the highest address downwards (only built
   when USE_AS_MEMMOVE is defined).  As used below, %eax = source
   pointer, %edx = destination pointer and %ecx = byte count on entry
   (NOTE(review): set up before this chunk -- confirm).  %esi is saved
   and becomes the source cursor; %edx and %esi are first moved to one
   past the end of their buffers.  */
1680 | ALIGN (4) | |
1681 | L(copy_backward): | |
1682 | PUSH (%esi) | |
1683 | movl %eax, %esi | |
3093e0c7 L |
1684 | add %ecx, %edx |
1685 | add %ecx, %esi | |
/* If the destination end is not 4-byte aligned, L(bk_align) copies 1-3
   bytes first and comes back to L(bk_aligned_4).  */
3af48cbd L |
1686 | testl $0x3, %edx |
1687 | jnz L(bk_align) | |
1688 | ||
1689 | L(bk_aligned_4): | |
1690 | cmp $64, %ecx | |
3093e0c7 | 1691 | jae L(bk_write_more64bytes) |
3af48cbd L |
1692 | |
1693 | L(bk_write_64bytesless): | |
1694 | cmp $32, %ecx | |
3093e0c7 | 1695 | jb L(bk_write_less32bytes) |
3af48cbd L |
1696 | |
1697 | L(bk_write_more32bytes): | |
1698 | /* Copy 32 bytes at a time. */ | |
/* This is a single 32-byte group (eight dwords, highest address first),
   not a loop; %eax is scratch.  */
1699 | sub $32, %ecx | |
1700 | movl -4(%esi), %eax | |
1701 | movl %eax, -4(%edx) | |
1702 | movl -8(%esi), %eax | |
1703 | movl %eax, -8(%edx) | |
1704 | movl -12(%esi), %eax | |
1705 | movl %eax, -12(%edx) | |
1706 | movl -16(%esi), %eax | |
1707 | movl %eax, -16(%edx) | |
1708 | movl -20(%esi), %eax | |
1709 | movl %eax, -20(%edx) | |
1710 | movl -24(%esi), %eax | |
1711 | movl %eax, -24(%edx) | |
1712 | movl -28(%esi), %eax | |
1713 | movl %eax, -28(%edx) | |
1714 | movl -32(%esi), %eax | |
1715 | movl %eax, -32(%edx) | |
1716 | sub $32, %edx | |
1717 | sub $32, %esi | |
1718 | ||
1719 | L(bk_write_less32bytes): | |
/* Turn the end pointers into start pointers for the remaining %ecx
   bytes (%eax = source, %edx = destination), which is what the
   L(bk_write_*) tails expect, then restore %esi.  */
1720 | movl %esi, %eax | |
1721 | sub %ecx, %edx | |
1722 | sub %ecx, %eax | |
1723 | POP (%esi) | |
3093e0c7 | 1724 | L(bk_write_less48bytes): |
3af48cbd L |
/* NOTE(review): BRANCH_TO_JMPTBL_ENTRY is defined outside this chunk;
   presumably it dispatches on %ecx through L(table_48_bytes_bwd) --
   confirm.  */
1725 | BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) |
1726 | ||
/* L(bk_align): reached from L(copy_backward) when the destination end
   (%edx) is not 4-byte aligned.  Copies one byte and/or one 16-bit
   word so that %edx becomes 4-byte aligned, then continues at
   L(bk_aligned_4).  %esi is still saved on the stack here, which is
   what the CFI_PUSH annotation records after the preceding POP.  */
3093e0c7 | 1727 | CFI_PUSH (%esi) |
3af48cbd L |
1728 | ALIGN (4) |
1729 | L(bk_align): | |
/* With 8 bytes or fewer left, skip the alignment step and go straight
   to the tail dispatch.  */
1730 | cmp $8, %ecx | |
3093e0c7 | 1731 | jbe L(bk_write_less32bytes) |
3af48cbd L |
1732 | testl $1, %edx |
1733 | /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, | |
1734 | then (EDX & 2) must be != 0. */ | |
1735 | jz L(bk_got2) | |
/* Odd destination end: copy one byte.  */
1736 | sub $1, %esi | |
1737 | sub $1, %ecx | |
1738 | sub $1, %edx | |
1739 | movzbl (%esi), %eax | |
1740 | movb %al, (%edx) | |
1741 | ||
1742 | testl $2, %edx | |
1743 | jz L(bk_aligned_4) | |
1744 | ||
1745 | L(bk_got2): | |
/* Destination end is 2 mod 4: copy one 16-bit word.  */
1746 | sub $2, %esi | |
1747 | sub $2, %ecx | |
1748 | sub $2, %edx | |
1749 | movzwl (%esi), %eax | |
1750 | movw %ax, (%edx) | |
1751 | jmp L(bk_aligned_4) | |
1752 | ||
/* L(bk_write_more64bytes): backward copy of 64 bytes or more.  Reached
   from L(bk_aligned_4) with %ecx >= 64; %esi and %edx are one past the
   end of the source and destination data.  Up to three dwords are
   copied until %edx is 16-byte aligned, then 64-byte groups are copied
   with unaligned loads and aligned stores.  %eax is scratch.  */
1753 | ALIGN (4) | |
1754 | L(bk_write_more64bytes): | |
1755 | /* Check alignment of last byte. */ | |
1756 | testl $15, %edx | |
1757 | jz L(bk_ssse3_cpy_pre) | |
1758 | ||
1759 | /* EDX is aligned 4 bytes, but not 16 bytes. */ | |
1760 | L(bk_ssse3_align): | |
1761 | sub $4, %esi | |
1762 | sub $4, %ecx | |
1763 | sub $4, %edx | |
1764 | movl (%esi), %eax | |
1765 | movl %eax, (%edx) | |
1766 | ||
1767 | testl $15, %edx | |
1768 | jz L(bk_ssse3_cpy_pre) | |
1769 | ||
1770 | sub $4, %esi | |
1771 | sub $4, %ecx | |
1772 | sub $4, %edx | |
1773 | movl (%esi), %eax | |
1774 | movl %eax, (%edx) | |
1775 | ||
1776 | testl $15, %edx | |
1777 | jz L(bk_ssse3_cpy_pre) | |
1778 | ||
/* Third dword: no test is needed afterwards, since a 4-byte aligned
   address is at most three dwords away from a 16-byte boundary.  */
1779 | sub $4, %esi | |
1780 | sub $4, %ecx | |
1781 | sub $4, %edx | |
1782 | movl (%esi), %eax | |
1783 | movl %eax, (%edx) | |
1784 | ||
1785 | L(bk_ssse3_cpy_pre): | |
/* At most 12 bytes were consumed above, so at least 52 remain; the
   32-byte path is therefore safe when fewer than 64 are left.  */
1786 | cmp $64, %ecx | |
3093e0c7 | 1787 | jb L(bk_write_more32bytes) |
3af48cbd L |
1788 | |
1789 | L(bk_ssse3_cpy): | |
/* Copy one 64-byte group, highest 16 bytes first: movdqu loads from the
   source, movdqa stores to the 16-byte aligned destination.  */
1790 | sub $64, %esi | |
1791 | sub $64, %ecx | |
1792 | sub $64, %edx | |
1793 | movdqu 0x30(%esi), %xmm3 | |
1794 | movdqa %xmm3, 0x30(%edx) | |
1795 | movdqu 0x20(%esi), %xmm2 | |
1796 | movdqa %xmm2, 0x20(%edx) | |
1797 | movdqu 0x10(%esi), %xmm1 | |
1798 | movdqa %xmm1, 0x10(%edx) | |
1799 | movdqu (%esi), %xmm0 | |
1800 | movdqa %xmm0, (%edx) | |
1801 | cmp $64, %ecx | |
3093e0c7 | 1802 | jae L(bk_ssse3_cpy) |
3af48cbd L |
1803 | jmp L(bk_write_64bytesless) |
1804 | ||
1805 | #endif | |
1806 | ||
1807 | END (MEMCPY) | |
1808 | ||
1809 | #endif |