1 /* memcpy optimized with SSE2 unaligned memory access instructions.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 || defined USE_AS_MEMMOVE \
22 || !defined USE_MULTIARCH)
/* NOTE(review): the two lines above are the tail of a multi-line #if
   whose opening line is outside this extract.  The leading numbers on
   every line are original-file line numbers from the dump this chunk
   was taken from; many intermediate lines are elided throughout.  */
/* i386 SSE2-unaligned memcpy building block; the same body is also
   assembled as memmove/bcopy/mempcpy variants.  Default names: */
25 # include "asm-syntax.h"
28 # define MEMCPY __memcpy_sse2_unaligned
29 # define MEMCPY_CHK __memcpy_chk_sse2_unaligned
/* CFI bookkeeping paired with the 4-byte pushl/popl wrappers below so
   the unwind info tracks the saved register and CFA adjustment.
   NOTE(review): CFI_POP's cfi_restore continuation line is elided in
   this extract.  */
42 # define CFI_PUSH(REG) \
43 cfi_adjust_cfa_offset (4); \
44 cfi_rel_offset (REG, 0)
46 # define CFI_POP(REG) \
47 cfi_adjust_cfa_offset (-4); \
50 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
51 # define POP(REG) popl REG; CFI_POP (REG)
53 # define PARMS 8 /* Preserve EBX. */
54 # define ENTRANCE PUSH (%ebx);
55 # define RETURN_END POP (%ebx); ret
/* RETURN is used at mid-function exits: restore EBX and return, then
   re-assert the CFI push state for the code that follows it.  */
56 # define RETURN RETURN_END; CFI_PUSH (%ebx)
58 .section .text.sse2,"ax",@progbits
/* __memcpy_chk entry fragment: branch to __chk_fail when the
   destination buffer is smaller than the copy length (the cmp that
   feeds this jb is elided in this extract).  */
59 # if !defined USE_AS_BCOPY && defined SHARED
63 jb HIDDEN_JUMPTARGET (__chk_fail)
/* ===== memmove-only code: BACKWARD copy path (overlapping regions
   with dst above src).  Visible register roles: %eax = src,
   %edx = dst, %ecx = len.  NOTE(review): this extract drops many
   original lines — notably the cmp/sub instructions that set flags
   for the conditional jumps — so the size thresholds below are taken
   from the label names and surviving comments; confirm against the
   complete file.  */
74 # ifdef USE_AS_MEMMOVE
77 L(mm_len_0_or_more_backward):
78 /* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
81 jbe L(mm_len_0_16_bytes_backward)
/* NOTE(review): the backward checks use signed jg while the forward
   path below uses unsigned ja — equivalent only while len < 2 GiB.  */
84 jg L(mm_len_32_or_more_backward)
86 /* Copy [0..32] and return. */
/* Tail-16 copy; the matching head-16 load/store lines are elided.
   Head and tail stores may overlap in the middle — harmless.  */
88 movdqu -16(%eax, %ecx), %xmm1
90 movdqu %xmm1, -16(%edx, %ecx)
93 L(mm_len_32_or_more_backward):
95 jg L(mm_len_64_or_more_backward)
97 /* Copy [0..64] and return. */
/* All loads complete before any store, so overlap cannot clobber
   not-yet-read source bytes.  */
99 movdqu 16(%eax), %xmm1
100 movdqu -16(%eax, %ecx), %xmm2
101 movdqu -32(%eax, %ecx), %xmm3
103 movdqu %xmm1, 16(%edx)
104 movdqu %xmm2, -16(%edx, %ecx)
105 movdqu %xmm3, -32(%edx, %ecx)
108 L(mm_len_64_or_more_backward):
110 jg L(mm_len_128_or_more_backward)
112 /* Copy [0..128] and return. */
/* Same loads-then-stores pattern, 4 head + 4 tail 16-byte chunks.  */
114 movdqu 16(%eax), %xmm1
115 movdqu 32(%eax), %xmm2
116 movdqu 48(%eax), %xmm3
117 movdqu -64(%eax, %ecx), %xmm4
118 movdqu -48(%eax, %ecx), %xmm5
119 movdqu -32(%eax, %ecx), %xmm6
120 movdqu -16(%eax, %ecx), %xmm7
122 movdqu %xmm1, 16(%edx)
123 movdqu %xmm2, 32(%edx)
124 movdqu %xmm3, 48(%edx)
125 movdqu %xmm4, -64(%edx, %ecx)
126 movdqu %xmm5, -48(%edx, %ecx)
127 movdqu %xmm6, -32(%edx, %ecx)
128 movdqu %xmm7, -16(%edx, %ecx)
131 L(mm_len_128_or_more_backward):
/* Bulk path: stash the unaligned head chunks (xmm5-7) and the final
   tail chunk (xmm0) up front, then loop over the aligned middle.
   Much of the setup (register pushes, alignment arithmetic) is
   elided in this extract.  */
140 /* Aligning the address of destination. */
142 movdqu 16(%eax), %xmm5
143 movdqu 32(%eax), %xmm6
144 movdqu 48(%eax), %xmm7
145 leal (%edx, %ecx), %esi
146 movdqu -16(%eax, %ecx), %xmm0
154 leal (%eax, %ebx), %eax
/* Choose cached (movaps) vs non-temporal (movntdq) loop by comparing
   against half the shared cache size: a compile-time constant when
   SHARED_CACHE_SIZE_HALF is defined, otherwise the runtime value
   exported as __x86_shared_cache_size_half (via GOT when PIC).  */
157 # ifdef SHARED_CACHE_SIZE_HALF
158 cmp $SHARED_CACHE_SIZE_HALF, %edi
163 add $_GLOBAL_OFFSET_TABLE_, %ebx
164 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
167 cmp __x86_shared_cache_size_half, %edi
170 jae L(mm_large_page_loop_backward)
173 L(mm_main_loop_backward):
/* 64 bytes per iteration walking downward; unaligned loads from
   %eax, 16-byte-aligned movaps stores through %ecx (repurposed here
   as the destination cursor — loop-control lines elided).  */
175 prefetcht0 -128(%eax)
177 movdqu -64(%eax), %xmm0
178 movdqu -48(%eax), %xmm1
179 movdqu -32(%eax), %xmm2
180 movdqu -16(%eax), %xmm3
181 movaps %xmm0, -64(%ecx)
183 movaps %xmm1, -48(%ecx)
184 movaps %xmm2, -32(%ecx)
185 movaps %xmm3, -16(%ecx)
188 jnz L(mm_main_loop_backward)
/* Write back the stashed tail (xmm0) and head (xmm5-7) chunks.  */
191 movdqu %xmm0, -16(%esi)
193 movdqu %xmm5, 16(%edx)
194 movdqu %xmm6, 32(%edx)
195 movdqu %xmm7, 48(%edx)
197 jmp L(mm_return_pop_all)
199 /* Copy [0..16] and return. */
200 L(mm_len_0_16_bytes_backward):
/* Dispatch on small lengths (the test instructions feeding these
   jumps are elided).  Integer loads complete before stores, keeping
   overlap safety.  */
202 jnz L(mm_len_9_16_bytes_backward)
205 jnz L(mm_len_5_8_bytes_backward)
211 jne L(mm_len_3_4_bytes_backward)
212 movzbl -1(%eax,%ecx), %ebx
214 movb %bl, -1(%edx,%ecx)
218 L(mm_len_3_4_bytes_backward):
219 movzwl -2(%eax,%ecx), %ebx
221 movw %bx, -2(%edx,%ecx)
225 L(mm_len_9_16_bytes_backward):
227 movl -4(%eax,%ecx), %ebx
228 movl -8(%eax,%ecx), %esi
229 movl %ebx, -4(%edx,%ecx)
230 movl %esi, -8(%edx,%ecx)
/* Re-dispatch after copying the outer 8 bytes; the length adjustment
   (sub) preceding this jump is elided in this extract — confirm.  */
233 jmp L(mm_len_0_16_bytes_backward)
235 L(mm_len_5_8_bytes_backward):
237 movl -4(%eax,%ecx), %eax
239 movl %eax, -4(%edx,%ecx)
242 /* Big length copy backward part. */
244 L(mm_large_page_loop_backward):
/* Same 64-byte backward loop but with non-temporal stores to avoid
   cache pollution on very large copies; presumably followed by an
   sfence before return (elided here) — confirm in the full file.  */
245 movdqu -64(%eax), %xmm0
246 movdqu -48(%eax), %xmm1
247 movdqu -32(%eax), %xmm2
248 movdqu -16(%eax), %xmm3
249 movntdq %xmm0, -64(%ecx)
251 movntdq %xmm1, -48(%ecx)
252 movntdq %xmm2, -32(%ecx)
253 movntdq %xmm3, -16(%ecx)
256 jnz L(mm_large_page_loop_backward)
260 movdqu %xmm0, -16(%esi)
262 movdqu %xmm5, 16(%edx)
263 movdqu %xmm6, 32(%edx)
264 movdqu %xmm7, 48(%edx)
266 jmp L(mm_return_pop_all)
/* ===== memmove: FORWARD copy path (non-overlapping, or dst below
   src).  Same register roles as above: %eax = src, %edx = dst,
   %ecx = len.  NOTE(review): the cmp/test lines feeding the branches
   are elided in this extract.  */
274 /* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
277 jbe L(mm_len_0_16_bytes_forward)
280 ja L(mm_len_32_or_more_forward)
282 /* Copy [0..32] and return. */
/* Tail-16 copy; the matching head-16 load/store lines are elided.  */
284 movdqu -16(%eax, %ecx), %xmm1
286 movdqu %xmm1, -16(%edx, %ecx)
289 L(mm_len_32_or_more_forward):
291 ja L(mm_len_64_or_more_forward)
293 /* Copy [0..64] and return. */
/* Loads complete before stores (overlap-safe), head + tail chunks.  */
295 movdqu 16(%eax), %xmm1
296 movdqu -16(%eax, %ecx), %xmm2
297 movdqu -32(%eax, %ecx), %xmm3
299 movdqu %xmm1, 16(%edx)
300 movdqu %xmm2, -16(%edx, %ecx)
301 movdqu %xmm3, -32(%edx, %ecx)
304 L(mm_len_64_or_more_forward):
306 ja L(mm_len_128_or_more_forward)
308 /* Copy [0..128] and return. */
310 movdqu 16(%eax), %xmm1
311 movdqu 32(%eax), %xmm2
312 movdqu 48(%eax), %xmm3
313 movdqu -64(%eax, %ecx), %xmm4
314 movdqu -48(%eax, %ecx), %xmm5
315 movdqu -32(%eax, %ecx), %xmm6
316 movdqu -16(%eax, %ecx), %xmm7
318 movdqu %xmm1, 16(%edx)
319 movdqu %xmm2, 32(%edx)
320 movdqu %xmm3, 48(%edx)
321 movdqu %xmm4, -64(%edx, %ecx)
322 movdqu %xmm5, -48(%edx, %ecx)
323 movdqu %xmm6, -32(%edx, %ecx)
324 movdqu %xmm7, -16(%edx, %ecx)
327 L(mm_len_128_or_more_forward):
/* Bulk path: stash the unaligned last 64 bytes in xmm4-7 up front,
   then loop forward over the aligned middle.  Setup arithmetic and
   register pushes are largely elided in this extract.  */
332 /* Aligning the address of destination. */
333 movdqu -16(%eax, %ecx), %xmm4
334 movdqu -32(%eax, %ecx), %xmm5
335 movdqu -48(%eax, %ecx), %xmm6
336 movdqu -64(%eax, %ecx), %xmm7
337 leal (%edx, %ecx), %esi
/* Cached vs non-temporal loop selection, as in the backward path.  */
351 # ifdef SHARED_CACHE_SIZE_HALF
352 cmp $SHARED_CACHE_SIZE_HALF, %edi
357 add $_GLOBAL_OFFSET_TABLE_, %ebx
358 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
361 cmp __x86_shared_cache_size_half, %edi
364 jae L(mm_large_page_loop_forward)
367 L(mm_main_loop_forward):
/* 64 bytes per iteration; unaligned loads from %eax, aligned movaps
   stores through %ecx.  The first 16-byte load/store pair of each
   iteration (xmm0) is elided in this extract.  */
372 movdqu 16(%eax), %xmm1
373 movdqu 32(%eax), %xmm2
374 movdqu 48(%eax), %xmm3
377 movaps %xmm1, 16(%ecx)
378 movaps %xmm2, 32(%ecx)
379 movaps %xmm3, 48(%ecx)
382 jnz L(mm_main_loop_forward)
/* Write back the stashed unaligned tail via %esi = dst + len.  */
386 movdqu %xmm4, -16(%esi)
387 movdqu %xmm5, -32(%esi)
388 movdqu %xmm6, -48(%esi)
389 movdqu %xmm7, -64(%esi)
391 jmp L(mm_return_pop_all)
393 L(mm_len_0_16_bytes_forward):
/* Small-length dispatch (length tests elided).  */
395 jne L(mm_len_9_16_bytes_forward)
398 jne L(mm_len_5_8_bytes_forward)
404 jne L(mm_len_2_4_bytes_forward)
405 movzbl -1(%eax,%ecx), %ebx
407 movb %bl, -1(%edx,%ecx)
411 L(mm_len_2_4_bytes_forward):
412 movzwl -2(%eax,%ecx), %ebx
414 movw %bx, -2(%edx,%ecx)
418 L(mm_len_5_8_bytes_forward):
420 movl -4(%eax,%ecx), %eax
422 movl %eax, -4(%edx,%ecx)
425 L(mm_len_9_16_bytes_forward):
427 movq -8(%eax, %ecx), %xmm1
429 movq %xmm1, -8(%edx, %ecx)
/* Shared bulk-path exit: restores the pushed registers and returns
   (the pop/ret sequence is elided in this extract).  */
432 L(mm_return_pop_all):
438 /* Big length copy forward part. */
440 L(mm_large_page_loop_forward):
/* Non-temporal variant for very large copies.  The first 16-byte
   load of each iteration (into xmm0) is elided, but its movntdq
   store is visible below.  Presumably followed by an sfence before
   return (elided) — confirm in the full file.  */
442 movdqu 16(%eax), %xmm1
443 movdqu 32(%eax), %xmm2
444 movdqu 48(%eax), %xmm3
445 movntdq %xmm0, (%ecx)
447 movntdq %xmm1, 16(%ecx)
448 movntdq %xmm2, 32(%ecx)
449 movntdq %xmm3, 48(%ecx)
452 jnz L(mm_large_page_loop_forward)
457 movdqu %xmm4, -16(%esi)
458 movdqu %xmm5, -32(%esi)
459 movdqu %xmm6, -48(%esi)
460 movdqu %xmm7, -64(%esi)
462 jmp L(mm_return_pop_all)
/* ===== Plain memcpy forward path (no overlap handling; shared by
   the mempcpy/bcopy builds).  From the visible code: %eax = src,
   %edx = dst, %ecx = len.  NOTE(review): the cmp/test lines feeding
   most branches, and several labels, are elided in this extract.  */
467 jbe L(len_0_16_bytes)
/* Copies of at least half the shared cache size take the
   non-temporal path; constant vs runtime tunable as in the memmove
   paths above.  */
469 # ifdef SHARED_CACHE_SIZE_HALF
470 cmp $SHARED_CACHE_SIZE_HALF, %ecx
474 add $_GLOBAL_OFFSET_TABLE_, %ebx
475 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
477 cmp __x86_shared_cache_size_half, %ecx
/* 17..128-byte copies: head chunks at fixed offsets plus tail chunks
   at len-relative offsets; the intervening size-dispatch labels and
   branches are elided.  */
483 movdqu -16(%eax, %ecx), %xmm1
486 movdqu %xmm1, -16(%edx, %ecx)
489 movdqu 16(%eax), %xmm0
490 movdqu -32(%eax, %ecx), %xmm1
492 movdqu %xmm0, 16(%edx)
493 movdqu %xmm1, -32(%edx, %ecx)
496 movdqu 32(%eax), %xmm0
497 movdqu 48(%eax), %xmm1
498 movdqu -48(%eax, %ecx), %xmm2
499 movdqu -64(%eax, %ecx), %xmm3
501 movdqu %xmm0, 32(%edx)
502 movdqu %xmm1, 48(%edx)
503 movdqu %xmm2, -48(%edx, %ecx)
504 movdqu %xmm3, -64(%edx, %ecx)
507 /* Now the main loop: we align the address of the destination. */
/* Loop-control setup is elided; from the visible addressing, %ebx is
   the 16-byte-aligned destination cursor and %eax was rebased to the
   src-dst displacement, so loads use (%ebx,%eax) and aligned stores
   use (%ebx) — confirm against the full file.  */
516 /* We should stop two iterations before the termination
517 (in order not to misprefetch). */
520 je L(main_loop_just_one_iteration)
524 je L(main_loop_last_two_iterations)
/* Cached main loop (its label line is elided): 64 bytes per
   iteration, prefetching two iterations (128 B) ahead.  The xmm0
   movaps store of each iteration is elided in this extract.  */
529 prefetcht0 128(%ebx, %eax)
531 movdqu (%ebx, %eax), %xmm0
532 movdqu 16(%ebx, %eax), %xmm1
533 movdqu 32(%ebx, %eax), %xmm2
534 movdqu 48(%ebx, %eax), %xmm3
536 movaps %xmm1, 16(%ebx)
537 movaps %xmm2, 32(%ebx)
538 movaps %xmm3, 48(%ebx)
541 jne L(main_loop_cache)
543 L(main_loop_last_two_iterations):
/* Final 128 bytes copied without prefetch so we never prefetch past
   the end of the source buffer.  */
544 movdqu (%ebx, %eax), %xmm0
545 movdqu 16(%ebx, %eax), %xmm1
546 movdqu 32(%ebx, %eax), %xmm2
547 movdqu 48(%ebx, %eax), %xmm3
548 movdqu 64(%ebx, %eax), %xmm4
549 movdqu 80(%ebx, %eax), %xmm5
550 movdqu 96(%ebx, %eax), %xmm6
551 movdqu 112(%ebx, %eax), %xmm7
553 movaps %xmm1, 16(%ebx)
554 movaps %xmm2, 32(%ebx)
555 movaps %xmm3, 48(%ebx)
556 movaps %xmm4, 64(%ebx)
557 movaps %xmm5, 80(%ebx)
558 movaps %xmm6, 96(%ebx)
559 movaps %xmm7, 112(%ebx)
562 L(main_loop_just_one_iteration):
/* Exactly one 64-byte iteration remains; xmm0 store elided here
   too.  */
563 movdqu (%ebx, %eax), %xmm0
564 movdqu 16(%ebx, %eax), %xmm1
565 movdqu 32(%ebx, %eax), %xmm2
566 movdqu 48(%ebx, %eax), %xmm3
568 movaps %xmm1, 16(%ebx)
569 movaps %xmm2, 32(%ebx)
570 movaps %xmm3, 48(%ebx)
/* Large-copy path preamble (its label is elided): copy the possibly
   unaligned first and last 128 bytes with plain stores before the
   aligned non-temporal loop below.  */
575 movdqu 16(%eax), %xmm1
576 movdqu 32(%eax), %xmm2
577 movdqu 48(%eax), %xmm3
578 movdqu -64(%eax, %ecx), %xmm4
579 movdqu -48(%eax, %ecx), %xmm5
580 movdqu -32(%eax, %ecx), %xmm6
581 movdqu -16(%eax, %ecx), %xmm7
583 movdqu %xmm1, 16(%edx)
584 movdqu %xmm2, 32(%edx)
585 movdqu %xmm3, 48(%edx)
586 movdqu %xmm4, -64(%edx, %ecx)
587 movdqu %xmm5, -48(%edx, %ecx)
588 movdqu %xmm6, -32(%edx, %ecx)
589 movdqu %xmm7, -16(%edx, %ecx)
591 movdqu 64(%eax), %xmm0
592 movdqu 80(%eax), %xmm1
593 movdqu 96(%eax), %xmm2
594 movdqu 112(%eax), %xmm3
595 movdqu -128(%eax, %ecx), %xmm4
596 movdqu -112(%eax, %ecx), %xmm5
597 movdqu -96(%eax, %ecx), %xmm6
598 movdqu -80(%eax, %ecx), %xmm7
599 movdqu %xmm0, 64(%edx)
600 movdqu %xmm1, 80(%edx)
601 movdqu %xmm2, 96(%edx)
602 movdqu %xmm3, 112(%edx)
603 movdqu %xmm4, -128(%edx, %ecx)
604 movdqu %xmm5, -112(%edx, %ecx)
605 movdqu %xmm6, -96(%edx, %ecx)
606 movdqu %xmm7, -80(%edx, %ecx)
608 /* Now the main loop with non temporal stores. We align
609 the address of the destination. */
619 L(main_loop_large_page):
/* 128 bytes per iteration, movntdq stores bypass the cache;
   presumably followed by an sfence before return (elided here) —
   confirm in the full file.  */
620 movdqu (%ebx, %eax), %xmm0
621 movdqu 16(%ebx, %eax), %xmm1
622 movdqu 32(%ebx, %eax), %xmm2
623 movdqu 48(%ebx, %eax), %xmm3
624 movdqu 64(%ebx, %eax), %xmm4
625 movdqu 80(%ebx, %eax), %xmm5
626 movdqu 96(%ebx, %eax), %xmm6
627 movdqu 112(%ebx, %eax), %xmm7
628 movntdq %xmm0, (%ebx)
629 movntdq %xmm1, 16(%ebx)
630 movntdq %xmm2, 32(%ebx)
631 movntdq %xmm3, 48(%ebx)
632 movntdq %xmm4, 64(%ebx)
633 movntdq %xmm5, 80(%ebx)
634 movntdq %xmm6, 96(%ebx)
635 movntdq %xmm7, 112(%ebx)
638 jne L(main_loop_large_page)
/* <= 16-byte tails; the dispatch labels and length tests are largely
   elided in this extract.  */
644 jne L(len_9_16_bytes)
655 movzwl -2(%eax,%ecx), %ebx
656 movw %bx, -2(%edx,%ecx)
661 movq -8(%eax, %ecx), %xmm1
663 movq %xmm1, -8(%edx, %ecx)
669 movl -4(%eax,%ecx), %ebx
670 movl %ebx, -4(%edx,%ecx)
/* Start of the mempcpy-only epilogue — presumably adjusts the return
   value to dst + len (body elided in this extract; confirm).  */
674 # if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY