unaligned sse2 memcmp

author Ondřej Bílka <neleai@seznam.cz>

Thu, 18 Jun 2015 07:02:22 +0000 (09:02 +0200)

committer Ondřej Bílka <neleai@seznam.cz>

Wed, 24 Jun 2015 10:48:22 +0000 (12:48 +0200)
author Ondřej Bílka <neleai@seznam.cz>
Thu, 18 Jun 2015 07:02:22 +0000 (09:02 +0200)
committer Ondřej Bílka <neleai@seznam.cz>
Wed, 24 Jun 2015 10:48:22 +0000 (12:48 +0200)
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S

index f636716b643481ecc8c8d299c6cab9ff246e57e3..88c0c4a0cacbfe54369e43620eab2fc25b414f89 100644 (file)
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -19,340 +19,204 @@
  
  #include <sysdep.h>
  
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
         .text
-ENTRY (memcmp)
-       test    %rdx, %rdx
-       jz      L(finz)
-       cmpq    $1, %rdx
-       jle     L(finr1b)
-       subq    %rdi, %rsi
-       movq    %rdx, %r10
-       cmpq    $32, %r10
-       jge     L(gt32)
-       /* Handle small chunks and last block of less than 32 bytes.  */
-L(small):
-       testq   $1, %r10
-       jz      L(s2b)
-       movzbl  (%rdi), %eax
-       movzbl  (%rdi, %rsi), %edx
-       subq    $1, %r10
-       je      L(finz1)
-       addq    $1, %rdi
-       subl    %edx, %eax
-       jnz     L(exit)
-L(s2b):
-       testq   $2, %r10
-       jz      L(s4b)
-       movzwl  (%rdi), %eax
-       movzwl  (%rdi, %rsi), %edx
-       subq    $2, %r10
-       je      L(fin2_7)
-       addq    $2, %rdi
-       cmpl    %edx, %eax
-       jnz     L(fin2_7)
-L(s4b):
-       testq   $4, %r10
-       jz      L(s8b)
-       movl    (%rdi), %eax
-       movl    (%rdi, %rsi), %edx
-       subq    $4, %r10
-       je      L(fin2_7)
-       addq    $4, %rdi
-       cmpl    %edx, %eax
-       jnz     L(fin2_7)
-L(s8b):
-       testq   $8, %r10
-       jz      L(s16b)
-       movq    (%rdi), %rax
-       movq    (%rdi, %rsi), %rdx
-       subq    $8, %r10
-       je      L(fin2_7)
-       addq    $8, %rdi
-       cmpq    %rdx, %rax
-       jnz     L(fin2_7)
-L(s16b):
-       movdqu    (%rdi), %xmm1
-       movdqu    (%rdi, %rsi), %xmm0
-       pcmpeqb   %xmm0, %xmm1
-       pmovmskb  %xmm1, %edx
-       xorl      %eax, %eax
-       subl      $0xffff, %edx
-       jz        L(finz)
-       bsfl      %edx, %ecx
-       leaq     (%rdi, %rcx), %rcx
-       movzbl   (%rcx), %eax
-       movzbl   (%rsi, %rcx), %edx
-       jmp      L(finz1)
+ENTRY (MEMCMP)                         /* Compare buffers at %rdi and %rsi; %rdx = length; result in %eax.  */
+       testq   %rdx, %rdx
+       je      L(return_zero)          /* Zero length: return 0.  */
+#ifdef AS_WMEMCMP
+       shl     $2, %rdx                /* Length given in 4-byte elements: scale to bytes.  */
+#endif
+       pxor    %xmm4, %xmm4            /* xmm4 = 0; comparing against it inverts a pcmpeqb result.  */
+       movl    %edi, %eax
+       andl    $4095, %eax             /* Offset of %rdi within a 4096-byte page (assumes 4 KiB pages).  */
+       cmpl    $4032, %eax
+       ja      L(cross_page_start)     /* Offset > 4096 - 64: a 64-byte read from %rdi would cross the page end.  */
+L(handle_end):                         /* Also the re-entry point for tails of 1..64 bytes; %rdi is 64-byte aligned on those paths.  */
+       movl    %esi, %eax
+       andl    $4095, %eax
+       cmpl    $4032, %eax
+       ja      L(cross_page_start)     /* Same page-end check for %rsi.  */
+L(back_header):
+       xor     %ecx, %ecx
+       bts     %rdx, %rcx              /* Set bit (%rdx mod 64) of %rcx.  */
+       sub     $1, %rcx                /* %rcx = mask of the low (%rdx mod 64) bits; 0 when %rdx is a multiple of 64.  */
+       movdqu  (%rdi), %xmm0
+       movdqu  (%rsi), %xmm1
+       pcmpeqb %xmm1, %xmm0            /* 0xff in every byte that matches.  */
+       pcmpeqb %xmm4, %xmm0            /* Invert: 0xff in every byte that differs.  */
+       pmovmskb %xmm0, %eax            /* Bit i set when byte i differs (bytes 0..15).  */
+       and     %ecx, %eax              /* Keep only positions selected by the length mask.  */
+       jne     L(different)
+       cmpq    $16, %rdx
+       ja      L(next)
+       ret                             /* Length <= 16 and no difference: %eax is 0 from the and above.  */
+L(next):                               /* Build a 64-bit difference mask for bytes 0..63 in %rax.  */
+       pmovmskb %xmm0, %r8d            /* Unmasked bits 0..15 from the first chunk.  */
+       movdqu  16(%rdi), %xmm2
+       movdqu  16(%rsi), %xmm6
+       movdqu  32(%rdi), %xmm1
+       pcmpeqb %xmm6, %xmm2
+       movdqu  32(%rsi), %xmm5
+       pcmpeqb %xmm4, %xmm2            /* Difference bytes for chunk 16..31.  */
+       pcmpeqb %xmm5, %xmm1
+       movdqu  48(%rdi), %xmm7
+       pmovmskb %xmm2, %eax
+       movdqu  48(%rsi), %xmm3
+       pcmpeqb %xmm4, %xmm1            /* Difference bytes for chunk 32..47.  */
+       pmovmskb %xmm1, %r9d
+       sal     $16, %eax               /* Chunk 16..31 -> mask bits 16..31.  */
+       pcmpeqb %xmm3, %xmm7
+       salq    $32, %r9                /* Chunk 32..47 -> mask bits 32..47.  */
+       pcmpeqb %xmm4, %xmm7            /* Difference bytes for chunk 48..63.  */
+       orq     %r9, %rax
+       orq     %r8, %rax
+       pmovmskb %xmm7, %r8d
+       salq    $48, %r8                /* Chunk 48..63 -> mask bits 48..63.  */
+       orq     %r8, %rax
+       movq    %rax, %r8               /* Save the unmasked 64-byte difference mask.  */
+       andq    %rcx, %rax              /* Apply the length mask.  */
+       jne     L(different)
+       cmpq    $64, %rdx
+       jb      L(return_zero)          /* Fewer than 64 bytes and none of them differ.  */
+       movq    %r8, %rax               /* Length >= 64: every one of the 64 bytes counts.  */
+       testq   %rax, %rax
+       jne     L(different)
+L(align_loop):                         /* First 64 bytes are equal; advance so that %rdi is 64-byte aligned.  */
+       leaq    64(%rdi), %rax
+       andq    $-64, %rax              /* Next 64-byte boundary strictly above %rdi.  */
+       subq    %rdi, %rax              /* %rax = bytes to skip (1..64), all already compared.  */
+       subq    %rax, %rdx
+       addq    %rax, %rdi
+       addq    %rax, %rsi
+       cmpq    $64, %rdx
+       ja      L(loop_start)           /* More than 64 bytes left: enter the main loop.  */
+       testq   %rdx, %rdx
+       jne     L(handle_end)           /* 1..64 bytes left: finish through the header path.  */
+       xorl    %eax, %eax              /* Nothing left: buffers are equal.  */
+       ret
  
-       .p2align 4,, 4
-L(finr1b):
-       movzbl  (%rdi), %eax
-       movzbl  (%rsi), %edx
-L(finz1):
+       .p2align 4
+L(different):                          /* In: %rax = non-zero difference mask, bit i = byte i relative to %rdi/%rsi.  */
+       bsfq    %rax, %rdx              /* Index of the first differing byte.  */
+#ifdef AS_WMEMCMP
+       and     $-4, %rdx               /* Round down to the start of the containing 4-byte element.  */
+       mov     (%rdi,%rdx), %eax
+       mov     (%rsi,%rdx), %edx
         subl    %edx, %eax
-L(exit):
+       jg      L(ret1)                 /* Signed: element at %rdi is greater.  */
+       jl      L(ret_neg_1)            /* Signed: element at %rdi is smaller.  */
         ret
-
-       .p2align 4,, 4
-L(fin2_7):
-       cmpq    %rdx, %rax
-       jz      L(finz)
-       movq    %rax, %r11
-       subq    %rdx, %r11
-       bsfq    %r11, %rcx
-       sarq    $3, %rcx
-       salq    $3, %rcx
-       sarq    %cl, %rax
-       movzbl  %al, %eax
-       sarq    %cl, %rdx
-       movzbl  %dl, %edx
+L(ret1):
+       mov $1, %eax                    /* Return 1.  */
+       ret
+L(ret_neg_1):
+       mov $-1, %eax                   /* Return -1.  */
+       ret
+#else
+       movzbl  (%rdi,%rdx), %eax       /* Zero-extend both bytes so the subtraction compares them as unsigned.  */
+       movzbl  (%rsi,%rdx), %edx
         subl    %edx, %eax
         ret
-
-       .p2align 4,, 4
-L(finz):
+#endif
+L(return_zero):                        /* Buffers compare equal: return 0.  */
+       xor     %eax, %eax
+       ret
+       .p2align 4
+L(loop):                               /* Last 64 bytes matched: advance both pointers.  */
+       subq    $64, %rdx
+       addq    $64, %rdi
+       addq    $64, %rsi
+       cmpq    $64, %rdx
+       jbe     L(less_64_bytes)        /* At most 64 bytes left: leave the loop.  */
+L(loop_start):                         /* Reached with more than 64 bytes left and %rdi 64-byte aligned by L(align_loop).  */
+       movdqu  (%rsi), %xmm0
+       movdqu  16(%rsi), %xmm1
+       pcmpeqb (%rdi), %xmm0           /* Memory-operand pcmpeqb needs a 16-byte-aligned address; only %rdi is used that way.  */
+       movdqu  32(%rsi), %xmm2
+       pcmpeqb 16(%rdi), %xmm1
+       movdqu  48(%rsi), %xmm3
+       pcmpeqb 32(%rdi), %xmm2
+       pcmpeqb 48(%rdi), %xmm3
+       pminub  %xmm0, %xmm3            /* Fold the four equality vectors: a byte stays 0xff only if  */
+       pminub  %xmm1, %xmm3            /* that byte position matched in all four 16-byte chunks.  */
+       pminub  %xmm2, %xmm3
+       pcmpeqb %xmm4, %xmm3            /* Invert: 0xff where some chunk differs at that position.  */
+       pmovmskb %xmm3, %eax
+       testl   %eax, %eax
+       je      L(loop)                 /* All 64 bytes equal: next iteration.  */
+       shl     $48, %rax               /* Combined mask stands in for chunk 3 (bits 48..63); bits owed to chunks 0..2 recur lower down.  */
+       pcmpeqb %xmm4, %xmm0            /* Turn the equality vectors of chunks 0..2 into difference vectors.  */
+       pcmpeqb %xmm4, %xmm1
+       pcmpeqb %xmm4, %xmm2
+       pmovmskb %xmm0, %r8             /* Chunk 0 -> bits 0..15.  */
+       pmovmskb %xmm1, %rcx
+       pmovmskb %xmm2, %r9
+       shl     $16, %ecx               /* Chunk 1 -> bits 16..31.  */
+       shl     $32, %r9                /* Chunk 2 -> bits 32..47.  */
+       or      %r8, %rax
+       or      %r9, %rax
+       or      %rcx, %rax
+       jmp     L(different)            /* %rax = 64-bit difference mask for the current 64-byte block.  */
+
+       .p2align 4
+L(less_64_bytes):                      /* Loop exit: at most 64 bytes remain.  */
+       testq   %rdx, %rdx
+       jne     L(handle_end)           /* Non-empty tail: finish through the header path.  */
         xorl    %eax, %eax
         ret
  
-       /* For blocks bigger than 32 bytes
-          1. Advance one of the addr pointer to be 16B aligned.
-          2. Treat the case of both addr pointers aligned to 16B
-             separately to avoid movdqu.
-          3. Handle any blocks of greater than 64 consecutive bytes with
-             unrolling to reduce branches.
-          4. At least one addr pointer is 16B aligned, use memory version
-             of pcmbeqb.
-       */
-       .p2align 4,, 4
-L(gt32):
-       movq    %rdx, %r11
-       addq    %rdi, %r11
-       movq    %rdi, %r8
-
-       andq    $15, %r8
-       jz      L(16am)
-       /* Both pointers may be misaligned.  */
-       movdqu  (%rdi), %xmm1
-       movdqu  (%rdi, %rsi), %xmm0
-       pcmpeqb   %xmm0, %xmm1
-       pmovmskb  %xmm1, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       neg      %r8
-       leaq    16(%rdi, %r8), %rdi
-L(16am):
-       /* Handle two 16B aligned pointers separately.  */
-       testq   $15, %rsi
-       jz      L(ATR)
-       testq   $16, %rdi
-       jz      L(A32)
-       movdqu  (%rdi, %rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq    $16, %rdi
-L(A32):
-       movq    %r11, %r10
-       andq    $-32, %r10
-       cmpq    %r10, %rdi
-        jge    L(mt16)
-       /* Pre-unroll to be ready for unrolled 64B loop.  */
-       testq   $32, %rdi
-       jz      L(A64)
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb  (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-L(A64):
-       movq    %r11, %r10
-       andq    $-64, %r10
-       cmpq    %r10, %rdi
-        jge    L(mt32)
-
-L(A64main):
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb  (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       cmpq       %rdi, %r10
-       jne       L(A64main)
-
-L(mt32):
-       movq    %r11, %r10
-       andq    $-32, %r10
-       cmpq    %r10, %rdi
-        jge    L(mt16)
  
-L(A32main):
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqu    (%rdi,%rsi), %xmm0
-       pcmpeqb  (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       cmpq       %rdi, %r10
-       jne       L(A32main)
-L(mt16):
-       subq       %rdi, %r11
-       je        L(finz)
-       movq      %r11, %r10
-       jmp       L(small)
-
-       .p2align 4,, 4
-L(neq):
-       bsfl      %edx, %ecx
-       movzbl   (%rdi, %rcx), %eax
-       addq     %rdi, %rsi
-       movzbl   (%rsi,%rcx), %edx
-       jmp      L(finz1)
-
-       .p2align 4,, 4
-L(ATR):
-       movq    %r11, %r10
-       andq    $-32, %r10
-       cmpq    %r10, %rdi
-        jge    L(mt16)
-       testq   $16, %rdi
-       jz      L(ATR32)
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-       cmpq       %rdi, %r10
-       je       L(mt16)
-
-L(ATR32):
-       movq    %r11, %r10
-       andq    $-64, %r10
-       testq   $32, %rdi
-       jz      L(ATR64)
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-L(ATR64):
-       cmpq       %rdi, %r10
-       je         L(mt32)
-
-L(ATR64main):
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-       cmpq       %rdi, %r10
-       jne       L(ATR64main)
-
-       movq    %r11, %r10
-       andq    $-32, %r10
-       cmpq    %r10, %rdi
-        jge    L(mt16)
-
-L(ATR32res):
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       movdqa    (%rdi,%rsi), %xmm0
-       pcmpeqb   (%rdi), %xmm0
-       pmovmskb  %xmm0, %edx
-       subl      $0xffff, %edx
-       jnz       L(neq)
-       addq       $16, %rdi
-
-       cmpq      %r10, %rdi
-       jne       L(ATR32res)
-
-       subq       %rdi, %r11
-       je        L(finz)
-       movq      %r11, %r10
-       jmp       L(small)
-       /* Align to 16byte to improve instruction fetch.  */
-       .p2align 4,, 4
-END(memcmp)
+       .p2align 4
+L(cross_page_start):                   /* A 64-byte read from %rdi or %rsi could cross a page end.  */
+       cmp     $64, %rdx
+       ja      L(back_header)          /* Length > 64: the header's 64-byte reads lie inside the compared range.  */
+
+       .p2align 4
+L(cross_page):                         /* Slow path for lengths <= 64: up to 4 bytes per iteration, %edx = bytes left.  */
+       test    %edx, %edx
+       je      L(return_zero)          /* Nothing left: equal.  */
+#ifdef AS_WMEMCMP
+       mov     (%rdi), %eax            /* Compare one 4-byte element.  */
+       mov     (%rsi), %ecx
+       subl    %ecx, %eax
+       jg      L(ret1)                 /* Signed greater: return 1.  */
+       jl      L(ret_neg_1)            /* Signed less: return -1.  */
+#else
+       movzbl  (%rdi), %eax
+       movzbl  (%rsi), %ecx
+       subl    %ecx, %eax
+       jne     L(return)               /* Bytes differ: return their difference.  */
+       cmp     $1, %edx
+       je      L(return)               /* That was the last byte; %eax is 0.  */
+       movzbl  1(%rdi), %eax
+       movzbl  1(%rsi), %ecx
+       subl    %ecx, %eax
+       jne     L(return)
+       cmp     $2, %edx
+       je      L(return)               /* Two bytes total, both equal.  */
+       movzbl  2(%rdi), %eax
+       movzbl  2(%rsi), %ecx
+       subl    %ecx, %eax
+       jne     L(return)
+       cmp     $3, %edx
+       je      L(return)               /* Three bytes total, all equal.  */
+       movzbl  3(%rdi), %eax
+       movzbl  3(%rsi), %ecx
+       subl    %ecx, %eax
+       jne     L(return)
+#endif
+       sub     $4, %edx                /* NOTE(review): under AS_WMEMCMP this only reaches 0 if %edx is a multiple of 4 here - confirm for tails after L(align_loop) with pointers not 4-byte aligned.  */
+       add     $4, %rdi
+       add     $4, %rsi
+       jmp     L(cross_page)
+L(return):                             /* Return with the value already in %eax.  */
+       ret
+END(MEMCMP)
  
-#undef bcmp
+#undef bcmp
  weak_alias (memcmp, bcmp)
  libc_hidden_builtin_def (memcmp)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile

index c57374454e30d1c2aabe3dee657e8fadbb88ca99..679db2a2969e9891bfed7f5fd9fe0c8acbe5cc8d 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
  
  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
                    strcmp-sse2-unaligned strncmp-ssse3 \
-                  memcmp-sse4 memcpy-ssse3 \
+                  memcpy-ssse3 \
                    memcpy-sse2-unaligned mempcpy-ssse3 \
                    memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
                    memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
@@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4
  endif
  
  ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
  endif
  endif
  
  ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
  endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c

index d398e43d29f63e2a21437b94c1e1bd37c431354c..b3dbe65052f00a24597ece9f3b28c010a242af7e 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
  
    /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
    IFUNC_IMPL (i, name, memcmp,
-             IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
-                             __memcmp_sse4_1)
+             IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2)
               IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
-             IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+             IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned))
  
    /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
    IFUNC_IMPL (i, name, __memmove_chk,
@@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
  
    /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
    IFUNC_IMPL (i, name, wmemcmp,
-             IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
-                             __wmemcmp_sse4_1)
+             IFUNC_IMPL_ADD (array, i, wmemcmp, 1,
+                             __wmemcmp_sse2_unaligned)
               IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
                               __wmemcmp_ssse3)
               IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S

new file mode 100644 (file)

index 0000000..60483bf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AVX2                       /* NOTE(review): no USE_AVX2 test is visible in the memcmp.S hunk of this patch - confirm it selects different code.  */
+#define MEMCMP __memcmp_avx2           /* Entry-point name used by ENTRY (MEMCMP) / END (MEMCMP) in the included file.  */
+#include "../memcmp.S"                 /* NOTE(review): the included file still ends with weak_alias (memcmp, bcmp) and libc_hidden_builtin_def (memcmp) - verify they are neutralised for this variant.  */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S

deleted file mode 100644 (file)

index 533fece..0000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,1776 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP       __memcmp_sse4_1
-# endif
-
-# define JMPTBL(I, B)  (I - B)
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)           \
-  lea          TABLE(%rip), %r11;                              \
-  movslq       (%r11, INDEX, SCALE), %rcx;                     \
-  add          %r11, %rcx;                                     \
-  jmp          *%rcx;                                          \
-  ud2
-
-/* Warning!
-           wmemcmp has to use SIGNED comparison for elements.
-           memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
-       .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-       shl     $2, %rdx
-# endif
-       pxor    %xmm0, %xmm0
-       cmp     $79, %rdx
-       ja      L(79bytesormore)
-# ifndef USE_AS_WMEMCMP
-       cmp     $1, %rdx
-       je      L(firstbyte)
-# endif
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-# ifndef USE_AS_WMEMCMP
-       .p2align 4
-L(firstbyte):
-       movzbl  (%rdi), %eax
-       movzbl  (%rsi), %ecx
-       sub     %ecx, %eax
-       ret
-# endif
-
-       .p2align 4
-L(79bytesormore):
-       movdqu  (%rsi), %xmm1
-       movdqu  (%rdi), %xmm2
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-       mov     %rsi, %rcx
-       and     $-16, %rsi
-       add     $16, %rsi
-       sub     %rsi, %rcx
-
-       sub     %rcx, %rdi
-       add     %rcx, %rdx
-       test    $0xf, %rdi
-       jz      L(2aligned)
-
-       cmp     $128, %rdx
-       ja      L(128bytesormore)
-L(less128bytes):
-       sub     $64, %rdx
-
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqu  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-
-       movdqu  32(%rdi), %xmm2
-       pxor    32(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(48bytesin256)
-
-       movdqu  48(%rdi), %xmm2
-       pxor    48(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(64bytesin256)
-       cmp     $32, %rdx
-       jb      L(less32bytesin64)
-
-       movdqu  64(%rdi), %xmm2
-       pxor    64(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(80bytesin256)
-
-       movdqu  80(%rdi), %xmm2
-       pxor    80(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(96bytesin256)
-       sub     $32, %rdx
-       add     $32, %rdi
-       add     $32, %rsi
-L(less32bytesin64):
-       add     $64, %rdi
-       add     $64, %rsi
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(128bytesormore):
-       cmp     $512, %rdx
-       ja      L(512bytesormore)
-       cmp     $256, %rdx
-       ja      L(less512bytes)
-L(less256bytes):
-       sub     $128, %rdx
-
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqu  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-
-       movdqu  32(%rdi), %xmm2
-       pxor    32(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(48bytesin256)
-
-       movdqu  48(%rdi), %xmm2
-       pxor    48(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(64bytesin256)
-
-       movdqu  64(%rdi), %xmm2
-       pxor    64(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(80bytesin256)
-
-       movdqu  80(%rdi), %xmm2
-       pxor    80(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(96bytesin256)
-
-       movdqu  96(%rdi), %xmm2
-       pxor    96(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(112bytesin256)
-
-       movdqu  112(%rdi), %xmm2
-       pxor    112(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(128bytesin256)
-
-       add     $128, %rsi
-       add     $128, %rdi
-
-       cmp     $64, %rdx
-       jae     L(less128bytes)
-
-       cmp     $32, %rdx
-       jb      L(less32bytesin128)
-
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqu  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-       sub     $32, %rdx
-       add     $32, %rdi
-       add     $32, %rsi
-L(less32bytesin128):
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(less512bytes):
-       sub     $256, %rdx
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqu  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-
-       movdqu  32(%rdi), %xmm2
-       pxor    32(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(48bytesin256)
-
-       movdqu  48(%rdi), %xmm2
-       pxor    48(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(64bytesin256)
-
-       movdqu  64(%rdi), %xmm2
-       pxor    64(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(80bytesin256)
-
-       movdqu  80(%rdi), %xmm2
-       pxor    80(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(96bytesin256)
-
-       movdqu  96(%rdi), %xmm2
-       pxor    96(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(112bytesin256)
-
-       movdqu  112(%rdi), %xmm2
-       pxor    112(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(128bytesin256)
-
-       movdqu  128(%rdi), %xmm2
-       pxor    128(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(144bytesin256)
-
-       movdqu  144(%rdi), %xmm2
-       pxor    144(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(160bytesin256)
-
-       movdqu  160(%rdi), %xmm2
-       pxor    160(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(176bytesin256)
-
-       movdqu  176(%rdi), %xmm2
-       pxor    176(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(192bytesin256)
-
-       movdqu  192(%rdi), %xmm2
-       pxor    192(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(208bytesin256)
-
-       movdqu  208(%rdi), %xmm2
-       pxor    208(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(224bytesin256)
-
-       movdqu  224(%rdi), %xmm2
-       pxor    224(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(240bytesin256)
-
-       movdqu  240(%rdi), %xmm2
-       pxor    240(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(256bytesin256)
-
-       add     $256, %rsi
-       add     $256, %rdi
-
-       cmp     $128, %rdx
-       jae     L(less256bytes)
-
-       cmp     $64, %rdx
-       jae     L(less128bytes)
-
-       cmp     $32, %rdx
-       jb      L(less32bytesin256)
-
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqu  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-       sub     $32, %rdx
-       add     $32, %rdi
-       add     $32, %rsi
-L(less32bytesin256):
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-       .p2align 4
-L(512bytesormore):
-# ifdef DATA_CACHE_SIZE_HALF
-       mov     $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-       mov     __x86_data_cache_size_half(%rip), %R8_LP
-# endif
-       mov     %r8, %r9
-       shr     $1, %r8
-       add     %r9, %r8
-       cmp     %r8, %rdx
-       ja      L(L2_L3_cache_unaglined)
-       sub     $64, %rdx
-       .p2align 4
-L(64bytesormore_loop):
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       movdqa  %xmm2, %xmm1
-
-       movdqu  16(%rdi), %xmm3
-       pxor    16(%rsi), %xmm3
-       por     %xmm3, %xmm1
-
-       movdqu  32(%rdi), %xmm4
-       pxor    32(%rsi), %xmm4
-       por     %xmm4, %xmm1
-
-       movdqu  48(%rdi), %xmm5
-       pxor    48(%rsi), %xmm5
-       por     %xmm5, %xmm1
-
-       ptest   %xmm1, %xmm0
-       jnc     L(64bytesormore_loop_end)
-       add     $64, %rsi
-       add     $64, %rdi
-       sub     $64, %rdx
-       jae     L(64bytesormore_loop)
-
-       add     $64, %rdx
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(L2_L3_cache_unaglined):
-       sub     $64, %rdx
-       .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
-       prefetchnta 0x1c0(%rdi)
-       prefetchnta 0x1c0(%rsi)
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       movdqa  %xmm2, %xmm1
-
-       movdqu  16(%rdi), %xmm3
-       pxor    16(%rsi), %xmm3
-       por     %xmm3, %xmm1
-
-       movdqu  32(%rdi), %xmm4
-       pxor    32(%rsi), %xmm4
-       por     %xmm4, %xmm1
-
-       movdqu  48(%rdi), %xmm5
-       pxor    48(%rsi), %xmm5
-       por     %xmm5, %xmm1
-
-       ptest   %xmm1, %xmm0
-       jnc     L(64bytesormore_loop_end)
-       add     $64, %rsi
-       add     $64, %rdi
-       sub     $64, %rdx
-       jae     L(L2_L3_unaligned_128bytes_loop)
-
-       add     $64, %rdx
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-/*
- * This case is for machines which are sensitive for unaligned instructions.
- */
-       .p2align 4
-L(2aligned):
-       cmp     $128, %rdx
-       ja      L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
-       sub     $64, %rdx
-
-       movdqa  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqa  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-
-       movdqa  32(%rdi), %xmm2
-       pxor    32(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(48bytesin256)
-
-       movdqa  48(%rdi), %xmm2
-       pxor    48(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(64bytesin256)
-       cmp     $32, %rdx
-       jb      L(less32bytesin64in2alinged)
-
-       movdqa  64(%rdi), %xmm2
-       pxor    64(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(80bytesin256)
-
-       movdqa  80(%rdi), %xmm2
-       pxor    80(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(96bytesin256)
-       sub     $32, %rdx
-       add     $32, %rdi
-       add     $32, %rsi
-L(less32bytesin64in2alinged):
-       add     $64, %rdi
-       add     $64, %rsi
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-       .p2align 4
-L(128bytesormorein2aligned):
-       cmp     $512, %rdx
-       ja      L(512bytesormorein2aligned)
-       cmp     $256, %rdx
-       ja      L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
-       sub     $128, %rdx
-
-       movdqa  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqa  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-
-       movdqa  32(%rdi), %xmm2
-       pxor    32(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(48bytesin256)
-
-       movdqa  48(%rdi), %xmm2
-       pxor    48(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(64bytesin256)
-
-       movdqa  64(%rdi), %xmm2
-       pxor    64(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(80bytesin256)
-
-       movdqa  80(%rdi), %xmm2
-       pxor    80(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(96bytesin256)
-
-       movdqa  96(%rdi), %xmm2
-       pxor    96(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(112bytesin256)
-
-       movdqa  112(%rdi), %xmm2
-       pxor    112(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(128bytesin256)
-
-       add     $128, %rsi
-       add     $128, %rdi
-
-       cmp     $64, %rdx
-       jae     L(less128bytesin2aligned)
-
-       cmp     $32, %rdx
-       jb      L(less32bytesin128in2aligned)
-
-       movdqu  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqu  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-       sub     $32, %rdx
-       add     $32, %rdi
-       add     $32, %rsi
-L(less32bytesin128in2aligned):
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-       .p2align 4
-L(256bytesormorein2aligned):
-
-       sub     $256, %rdx
-       movdqa  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqa  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-
-       movdqa  32(%rdi), %xmm2
-       pxor    32(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(48bytesin256)
-
-       movdqa  48(%rdi), %xmm2
-       pxor    48(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(64bytesin256)
-
-       movdqa  64(%rdi), %xmm2
-       pxor    64(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(80bytesin256)
-
-       movdqa  80(%rdi), %xmm2
-       pxor    80(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(96bytesin256)
-
-       movdqa  96(%rdi), %xmm2
-       pxor    96(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(112bytesin256)
-
-       movdqa  112(%rdi), %xmm2
-       pxor    112(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(128bytesin256)
-
-       movdqa  128(%rdi), %xmm2
-       pxor    128(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(144bytesin256)
-
-       movdqa  144(%rdi), %xmm2
-       pxor    144(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(160bytesin256)
-
-       movdqa  160(%rdi), %xmm2
-       pxor    160(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(176bytesin256)
-
-       movdqa  176(%rdi), %xmm2
-       pxor    176(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(192bytesin256)
-
-       movdqa  192(%rdi), %xmm2
-       pxor    192(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(208bytesin256)
-
-       movdqa  208(%rdi), %xmm2
-       pxor    208(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(224bytesin256)
-
-       movdqa  224(%rdi), %xmm2
-       pxor    224(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(240bytesin256)
-
-       movdqa  240(%rdi), %xmm2
-       pxor    240(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(256bytesin256)
-
-       add     $256, %rsi
-       add     $256, %rdi
-
-       cmp     $128, %rdx
-       jae     L(less256bytesin2alinged)
-
-       cmp     $64, %rdx
-       jae     L(less128bytesin2aligned)
-
-       cmp     $32, %rdx
-       jb      L(less32bytesin256in2alinged)
-
-       movdqa  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytesin256)
-
-       movdqa  16(%rdi), %xmm2
-       pxor    16(%rsi), %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(32bytesin256)
-       sub     $32, %rdx
-       add     $32, %rdi
-       add     $32, %rsi
-L(less32bytesin256in2alinged):
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-       .p2align 4
-L(512bytesormorein2aligned):
-# ifdef DATA_CACHE_SIZE_HALF
-       mov     $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-       mov     __x86_data_cache_size_half(%rip), %R8_LP
-# endif
-       mov     %r8, %r9
-       shr     $1, %r8
-       add     %r9, %r8
-       cmp     %r8, %rdx
-       ja      L(L2_L3_cache_aglined)
-
-       sub     $64, %rdx
-       .p2align 4
-L(64bytesormore_loopin2aligned):
-       movdqa  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       movdqa  %xmm2, %xmm1
-
-       movdqa  16(%rdi), %xmm3
-       pxor    16(%rsi), %xmm3
-       por     %xmm3, %xmm1
-
-       movdqa  32(%rdi), %xmm4
-       pxor    32(%rsi), %xmm4
-       por     %xmm4, %xmm1
-
-       movdqa  48(%rdi), %xmm5
-       pxor    48(%rsi), %xmm5
-       por     %xmm5, %xmm1
-
-       ptest   %xmm1, %xmm0
-       jnc     L(64bytesormore_loop_end)
-       add     $64, %rsi
-       add     $64, %rdi
-       sub     $64, %rdx
-       jae     L(64bytesormore_loopin2aligned)
-
-       add     $64, %rdx
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-L(L2_L3_cache_aglined):
-       sub     $64, %rdx
-
-       .p2align 4
-L(L2_L3_aligned_128bytes_loop):
-       prefetchnta 0x1c0(%rdi)
-       prefetchnta 0x1c0(%rsi)
-       movdqa  (%rdi), %xmm2
-       pxor    (%rsi), %xmm2
-       movdqa  %xmm2, %xmm1
-
-       movdqa  16(%rdi), %xmm3
-       pxor    16(%rsi), %xmm3
-       por     %xmm3, %xmm1
-
-       movdqa  32(%rdi), %xmm4
-       pxor    32(%rsi), %xmm4
-       por     %xmm4, %xmm1
-
-       movdqa  48(%rdi), %xmm5
-       pxor    48(%rsi), %xmm5
-       por     %xmm5, %xmm1
-
-       ptest   %xmm1, %xmm0
-       jnc     L(64bytesormore_loop_end)
-       add     $64, %rsi
-       add     $64, %rdi
-       sub     $64, %rdx
-       jae     L(L2_L3_aligned_128bytes_loop)
-
-       add     $64, %rdx
-       add     %rdx, %rsi
-       add     %rdx, %rdi
-       BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-
-       .p2align 4
-L(64bytesormore_loop_end):
-       add     $16, %rdi
-       add     $16, %rsi
-       ptest   %xmm2, %xmm0
-       jnc     L(16bytes)
-
-       add     $16, %rdi
-       add     $16, %rsi
-       ptest   %xmm3, %xmm0
-       jnc     L(16bytes)
-
-       add     $16, %rdi
-       add     $16, %rsi
-       ptest   %xmm4, %xmm0
-       jnc     L(16bytes)
-
-       add     $16, %rdi
-       add     $16, %rsi
-       jmp     L(16bytes)
-
-L(256bytesin256):
-       add     $256, %rdi
-       add     $256, %rsi
-       jmp     L(16bytes)
-L(240bytesin256):
-       add     $240, %rdi
-       add     $240, %rsi
-       jmp     L(16bytes)
-L(224bytesin256):
-       add     $224, %rdi
-       add     $224, %rsi
-       jmp     L(16bytes)
-L(208bytesin256):
-       add     $208, %rdi
-       add     $208, %rsi
-       jmp     L(16bytes)
-L(192bytesin256):
-       add     $192, %rdi
-       add     $192, %rsi
-       jmp     L(16bytes)
-L(176bytesin256):
-       add     $176, %rdi
-       add     $176, %rsi
-       jmp     L(16bytes)
-L(160bytesin256):
-       add     $160, %rdi
-       add     $160, %rsi
-       jmp     L(16bytes)
-L(144bytesin256):
-       add     $144, %rdi
-       add     $144, %rsi
-       jmp     L(16bytes)
-L(128bytesin256):
-       add     $128, %rdi
-       add     $128, %rsi
-       jmp     L(16bytes)
-L(112bytesin256):
-       add     $112, %rdi
-       add     $112, %rsi
-       jmp     L(16bytes)
-L(96bytesin256):
-       add     $96, %rdi
-       add     $96, %rsi
-       jmp     L(16bytes)
-L(80bytesin256):
-       add     $80, %rdi
-       add     $80, %rsi
-       jmp     L(16bytes)
-L(64bytesin256):
-       add     $64, %rdi
-       add     $64, %rsi
-       jmp     L(16bytes)
-L(48bytesin256):
-       add     $16, %rdi
-       add     $16, %rsi
-L(32bytesin256):
-       add     $16, %rdi
-       add     $16, %rsi
-L(16bytesin256):
-       add     $16, %rdi
-       add     $16, %rsi
-L(16bytes):
-       mov     -16(%rdi), %rax
-       mov     -16(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-L(8bytes):
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(12bytes):
-       mov     -12(%rdi), %rax
-       mov     -12(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-L(4bytes):
-       mov     -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
-       mov     -4(%rdi), %eax
-       cmp     %eax, %ecx
-# else
-       cmp     -4(%rdi), %ecx
-# endif
-       jne     L(diffin4bytes)
-L(0bytes):
-       xor     %eax, %eax
-       ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal case for wmemcmp */
-       .p2align 4
-L(65bytes):
-       movdqu  -65(%rdi), %xmm1
-       movdqu  -65(%rsi), %xmm2
-       mov     $-65, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(49bytes):
-       movdqu  -49(%rdi), %xmm1
-       movdqu  -49(%rsi), %xmm2
-       mov     $-49, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(33bytes):
-       movdqu  -33(%rdi), %xmm1
-       movdqu  -33(%rsi), %xmm2
-       mov     $-33, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(17bytes):
-       mov     -17(%rdi), %rax
-       mov     -17(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-L(9bytes):
-       mov     -9(%rdi), %rax
-       mov     -9(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       movzbl  -1(%rdi), %eax
-       movzbl  -1(%rsi), %edx
-       sub     %edx, %eax
-       ret
-
-       .p2align 4
-L(13bytes):
-       mov     -13(%rdi), %rax
-       mov     -13(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(5bytes):
-       mov     -5(%rdi), %eax
-       mov     -5(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       movzbl  -1(%rdi), %eax
-       movzbl  -1(%rsi), %edx
-       sub     %edx, %eax
-       ret
-
-       .p2align 4
-L(66bytes):
-       movdqu  -66(%rdi), %xmm1
-       movdqu  -66(%rsi), %xmm2
-       mov     $-66, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(50bytes):
-       movdqu  -50(%rdi), %xmm1
-       movdqu  -50(%rsi), %xmm2
-       mov     $-50, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(34bytes):
-       movdqu  -34(%rdi), %xmm1
-       movdqu  -34(%rsi), %xmm2
-       mov     $-34, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(18bytes):
-       mov     -18(%rdi), %rax
-       mov     -18(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-L(10bytes):
-       mov     -10(%rdi), %rax
-       mov     -10(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       movzwl  -2(%rdi), %eax
-       movzwl  -2(%rsi), %ecx
-       cmp     %cl, %al
-       jne     L(end)
-       and     $0xffff, %eax
-       and     $0xffff, %ecx
-       sub     %ecx, %eax
-       ret
-
-       .p2align 4
-L(14bytes):
-       mov     -14(%rdi), %rax
-       mov     -14(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(6bytes):
-       mov     -6(%rdi), %eax
-       mov     -6(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-L(2bytes):
-       movzwl  -2(%rsi), %ecx
-       movzwl  -2(%rdi), %eax
-       cmp     %cl, %al
-       jne     L(end)
-       and     $0xffff, %eax
-       and     $0xffff, %ecx
-       sub     %ecx, %eax
-       ret
-
-       .p2align 4
-L(67bytes):
-       movdqu  -67(%rdi), %xmm2
-       movdqu  -67(%rsi), %xmm1
-       mov     $-67, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(51bytes):
-       movdqu  -51(%rdi), %xmm2
-       movdqu  -51(%rsi), %xmm1
-       mov     $-51, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(35bytes):
-       movdqu  -35(%rsi), %xmm1
-       movdqu  -35(%rdi), %xmm2
-       mov     $-35, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(19bytes):
-       mov     -19(%rdi), %rax
-       mov     -19(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-L(11bytes):
-       mov     -11(%rdi), %rax
-       mov     -11(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -4(%rdi), %eax
-       mov     -4(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(15bytes):
-       mov     -15(%rdi), %rax
-       mov     -15(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(7bytes):
-       mov     -7(%rdi), %eax
-       mov     -7(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       mov     -4(%rdi), %eax
-       mov     -4(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(3bytes):
-       movzwl  -3(%rdi), %eax
-       movzwl  -3(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin2bytes)
-L(1bytes):
-       movzbl  -1(%rdi), %eax
-       movzbl  -1(%rsi), %ecx
-       sub     %ecx, %eax
-       ret
-# endif
-
-       .p2align 4
-L(68bytes):
-       movdqu  -68(%rdi), %xmm2
-       movdqu  -68(%rsi), %xmm1
-       mov     $-68, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(52bytes):
-       movdqu  -52(%rdi), %xmm2
-       movdqu  -52(%rsi), %xmm1
-       mov     $-52, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(36bytes):
-       movdqu  -36(%rdi), %xmm2
-       movdqu  -36(%rsi), %xmm1
-       mov     $-36, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(20bytes):
-       movdqu  -20(%rdi), %xmm2
-       movdqu  -20(%rsi), %xmm1
-       mov     $-20, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -4(%rsi), %ecx
-
-# ifndef USE_AS_WMEMCMP
-       mov     -4(%rdi), %eax
-       cmp     %eax, %ecx
-# else
-       cmp     -4(%rdi), %ecx
-# endif
-       jne     L(diffin4bytes)
-       xor     %eax, %eax
-       ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-       .p2align 4
-L(69bytes):
-       movdqu  -69(%rsi), %xmm1
-       movdqu  -69(%rdi), %xmm2
-       mov     $-69, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(53bytes):
-       movdqu  -53(%rsi), %xmm1
-       movdqu  -53(%rdi), %xmm2
-       mov     $-53, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(37bytes):
-       movdqu  -37(%rsi), %xmm1
-       movdqu  -37(%rdi), %xmm2
-       mov     $-37, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(21bytes):
-       movdqu  -21(%rsi), %xmm1
-       movdqu  -21(%rdi), %xmm2
-       mov     $-21, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(70bytes):
-       movdqu  -70(%rsi), %xmm1
-       movdqu  -70(%rdi), %xmm2
-       mov     $-70, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(54bytes):
-       movdqu  -54(%rsi), %xmm1
-       movdqu  -54(%rdi), %xmm2
-       mov     $-54, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(38bytes):
-       movdqu  -38(%rsi), %xmm1
-       movdqu  -38(%rdi), %xmm2
-       mov     $-38, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(22bytes):
-       movdqu  -22(%rsi), %xmm1
-       movdqu  -22(%rdi), %xmm2
-       mov     $-22, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(71bytes):
-       movdqu  -71(%rsi), %xmm1
-       movdqu  -71(%rdi), %xmm2
-       mov     $-71, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(55bytes):
-       movdqu  -55(%rdi), %xmm2
-       movdqu  -55(%rsi), %xmm1
-       mov     $-55, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(39bytes):
-       movdqu  -39(%rdi), %xmm2
-       movdqu  -39(%rsi), %xmm1
-       mov     $-39, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(23bytes):
-       movdqu  -23(%rdi), %xmm2
-       movdqu  -23(%rsi), %xmm1
-       mov     $-23, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-# endif
-
-       .p2align 4
-L(72bytes):
-       movdqu  -72(%rsi), %xmm1
-       movdqu  -72(%rdi), %xmm2
-       mov     $-72, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(56bytes):
-       movdqu  -56(%rdi), %xmm2
-       movdqu  -56(%rsi), %xmm1
-       mov     $-56, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(40bytes):
-       movdqu  -40(%rdi), %xmm2
-       movdqu  -40(%rsi), %xmm1
-       mov     $-40, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(24bytes):
-       movdqu  -24(%rdi), %xmm2
-       movdqu  -24(%rsi), %xmm1
-       mov     $-24, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-
-       mov     -8(%rsi), %rcx
-       mov     -8(%rdi), %rax
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-       .p2align 4
-L(73bytes):
-       movdqu  -73(%rsi), %xmm1
-       movdqu  -73(%rdi), %xmm2
-       mov     $-73, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(57bytes):
-       movdqu  -57(%rdi), %xmm2
-       movdqu  -57(%rsi), %xmm1
-       mov     $-57, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(41bytes):
-       movdqu  -41(%rdi), %xmm2
-       movdqu  -41(%rsi), %xmm1
-       mov     $-41, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(25bytes):
-       movdqu  -25(%rdi), %xmm2
-       movdqu  -25(%rsi), %xmm1
-       mov     $-25, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -9(%rdi), %rax
-       mov     -9(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       movzbl  -1(%rdi), %eax
-       movzbl  -1(%rsi), %ecx
-       sub     %ecx, %eax
-       ret
-
-       .p2align 4
-L(74bytes):
-       movdqu  -74(%rsi), %xmm1
-       movdqu  -74(%rdi), %xmm2
-       mov     $-74, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(58bytes):
-       movdqu  -58(%rdi), %xmm2
-       movdqu  -58(%rsi), %xmm1
-       mov     $-58, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(42bytes):
-       movdqu  -42(%rdi), %xmm2
-       movdqu  -42(%rsi), %xmm1
-       mov     $-42, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(26bytes):
-       movdqu  -26(%rdi), %xmm2
-       movdqu  -26(%rsi), %xmm1
-       mov     $-26, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -10(%rdi), %rax
-       mov     -10(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       movzwl  -2(%rdi), %eax
-       movzwl  -2(%rsi), %ecx
-       jmp     L(diffin2bytes)
-
-       .p2align 4
-L(75bytes):
-       movdqu  -75(%rsi), %xmm1
-       movdqu  -75(%rdi), %xmm2
-       mov     $-75, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(59bytes):
-       movdqu  -59(%rdi), %xmm2
-       movdqu  -59(%rsi), %xmm1
-       mov     $-59, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(43bytes):
-       movdqu  -43(%rdi), %xmm2
-       movdqu  -43(%rsi), %xmm1
-       mov     $-43, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(27bytes):
-       movdqu  -27(%rdi), %xmm2
-       movdqu  -27(%rsi), %xmm1
-       mov     $-27, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -11(%rdi), %rax
-       mov     -11(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -4(%rdi), %eax
-       mov     -4(%rsi), %ecx
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       xor     %eax, %eax
-       ret
-# endif
-       .p2align 4
-L(76bytes):
-       movdqu  -76(%rsi), %xmm1
-       movdqu  -76(%rdi), %xmm2
-       mov     $-76, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(60bytes):
-       movdqu  -60(%rdi), %xmm2
-       movdqu  -60(%rsi), %xmm1
-       mov     $-60, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(44bytes):
-       movdqu  -44(%rdi), %xmm2
-       movdqu  -44(%rsi), %xmm1
-       mov     $-44, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(28bytes):
-       movdqu  -28(%rdi), %xmm2
-       movdqu  -28(%rsi), %xmm1
-       mov     $-28, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -12(%rdi), %rax
-       mov     -12(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
-       mov     -4(%rdi), %eax
-       cmp     %eax, %ecx
-# else
-       cmp     -4(%rdi), %ecx
-# endif
-       jne     L(diffin4bytes)
-       xor     %eax, %eax
-       ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-       .p2align 4
-L(77bytes):
-       movdqu  -77(%rsi), %xmm1
-       movdqu  -77(%rdi), %xmm2
-       mov     $-77, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(61bytes):
-       movdqu  -61(%rdi), %xmm2
-       movdqu  -61(%rsi), %xmm1
-       mov     $-61, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(45bytes):
-       movdqu  -45(%rdi), %xmm2
-       movdqu  -45(%rsi), %xmm1
-       mov     $-45, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(29bytes):
-       movdqu  -29(%rdi), %xmm2
-       movdqu  -29(%rsi), %xmm1
-       mov     $-29, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-
-       mov     -13(%rdi), %rax
-       mov     -13(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(78bytes):
-       movdqu  -78(%rsi), %xmm1
-       movdqu  -78(%rdi), %xmm2
-       mov     $-78, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(62bytes):
-       movdqu  -62(%rdi), %xmm2
-       movdqu  -62(%rsi), %xmm1
-       mov     $-62, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(46bytes):
-       movdqu  -46(%rdi), %xmm2
-       movdqu  -46(%rsi), %xmm1
-       mov     $-46, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(30bytes):
-       movdqu  -30(%rdi), %xmm2
-       movdqu  -30(%rsi), %xmm1
-       mov     $-30, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -14(%rdi), %rax
-       mov     -14(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-       .p2align 4
-L(79bytes):
-       movdqu  -79(%rsi), %xmm1
-       movdqu  -79(%rdi), %xmm2
-       mov     $-79, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(63bytes):
-       movdqu  -63(%rdi), %xmm2
-       movdqu  -63(%rsi), %xmm1
-       mov     $-63, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(47bytes):
-       movdqu  -47(%rdi), %xmm2
-       movdqu  -47(%rsi), %xmm1
-       mov     $-47, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(31bytes):
-       movdqu  -31(%rdi), %xmm2
-       movdqu  -31(%rsi), %xmm1
-       mov     $-31, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-       mov     -15(%rdi), %rax
-       mov     -15(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-# endif
-       .p2align 4
-L(64bytes):
-       movdqu  -64(%rdi), %xmm2
-       movdqu  -64(%rsi), %xmm1
-       mov     $-64, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(48bytes):
-       movdqu  -48(%rdi), %xmm2
-       movdqu  -48(%rsi), %xmm1
-       mov     $-48, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-L(32bytes):
-       movdqu  -32(%rdi), %xmm2
-       movdqu  -32(%rsi), %xmm1
-       mov     $-32, %dl
-       pxor    %xmm1, %xmm2
-       ptest   %xmm2, %xmm0
-       jnc     L(less16bytes)
-
-       mov     -16(%rdi), %rax
-       mov     -16(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-
-       mov     -8(%rdi), %rax
-       mov     -8(%rsi), %rcx
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       xor     %eax, %eax
-       ret
-
-/*
- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
- */
-       .p2align 3
-L(less16bytes):
-       movsbq  %dl, %rdx
-       mov     (%rsi, %rdx), %rcx
-       mov     (%rdi, %rdx), %rax
-       cmp     %rax, %rcx
-       jne     L(diffin8bytes)
-       mov     8(%rsi, %rdx), %rcx
-       mov     8(%rdi, %rdx), %rax
-L(diffin8bytes):
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       shr     $32, %rcx
-       shr     $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
-       cmp     %eax, %ecx
-       jne     L(diffin4bytes)
-       xor     %eax, %eax
-       ret
-# endif
-
-L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
-       cmp     %cx, %ax
-       jne     L(diffin2bytes)
-       shr     $16, %ecx
-       shr     $16, %eax
-L(diffin2bytes):
-       cmp     %cl, %al
-       jne     L(end)
-       and     $0xffff, %eax
-       and     $0xffff, %ecx
-       sub     %ecx, %eax
-       ret
-
-       .p2align 4
-L(end):
-       and     $0xff, %eax
-       and     $0xff, %ecx
-       sub     %ecx, %eax
-       ret
-# else
-
-/* for wmemcmp */
-       mov     $1, %eax
-       jl      L(nequal_bigger)
-       neg     %eax
-       ret
-
-       .p2align 4
-L(nequal_bigger):
-       ret
-
-L(unreal_case):
-       xor     %eax, %eax
-       ret
-# endif
-
-END (MEMCMP)
-
-       .section .rodata.sse4.1,"a",@progbits
-       .p2align 3
-# ifndef USE_AS_WMEMCMP
-L(table_64bytes):
-       .int    JMPTBL (L(0bytes), L(table_64bytes))
-       .int    JMPTBL (L(1bytes), L(table_64bytes))
-       .int    JMPTBL (L(2bytes), L(table_64bytes))
-       .int    JMPTBL (L(3bytes), L(table_64bytes))
-       .int    JMPTBL (L(4bytes), L(table_64bytes))
-       .int    JMPTBL (L(5bytes), L(table_64bytes))
-       .int    JMPTBL (L(6bytes), L(table_64bytes))
-       .int    JMPTBL (L(7bytes), L(table_64bytes))
-       .int    JMPTBL (L(8bytes), L(table_64bytes))
-       .int    JMPTBL (L(9bytes), L(table_64bytes))
-       .int    JMPTBL (L(10bytes), L(table_64bytes))
-       .int    JMPTBL (L(11bytes), L(table_64bytes))
-       .int    JMPTBL (L(12bytes), L(table_64bytes))
-       .int    JMPTBL (L(13bytes), L(table_64bytes))
-       .int    JMPTBL (L(14bytes), L(table_64bytes))
-       .int    JMPTBL (L(15bytes), L(table_64bytes))
-       .int    JMPTBL (L(16bytes), L(table_64bytes))
-       .int    JMPTBL (L(17bytes), L(table_64bytes))
-       .int    JMPTBL (L(18bytes), L(table_64bytes))
-       .int    JMPTBL (L(19bytes), L(table_64bytes))
-       .int    JMPTBL (L(20bytes), L(table_64bytes))
-       .int    JMPTBL (L(21bytes), L(table_64bytes))
-       .int    JMPTBL (L(22bytes), L(table_64bytes))
-       .int    JMPTBL (L(23bytes), L(table_64bytes))
-       .int    JMPTBL (L(24bytes), L(table_64bytes))
-       .int    JMPTBL (L(25bytes), L(table_64bytes))
-       .int    JMPTBL (L(26bytes), L(table_64bytes))
-       .int    JMPTBL (L(27bytes), L(table_64bytes))
-       .int    JMPTBL (L(28bytes), L(table_64bytes))
-       .int    JMPTBL (L(29bytes), L(table_64bytes))
-       .int    JMPTBL (L(30bytes), L(table_64bytes))
-       .int    JMPTBL (L(31bytes), L(table_64bytes))
-       .int    JMPTBL (L(32bytes), L(table_64bytes))
-       .int    JMPTBL (L(33bytes), L(table_64bytes))
-       .int    JMPTBL (L(34bytes), L(table_64bytes))
-       .int    JMPTBL (L(35bytes), L(table_64bytes))
-       .int    JMPTBL (L(36bytes), L(table_64bytes))
-       .int    JMPTBL (L(37bytes), L(table_64bytes))
-       .int    JMPTBL (L(38bytes), L(table_64bytes))
-       .int    JMPTBL (L(39bytes), L(table_64bytes))
-       .int    JMPTBL (L(40bytes), L(table_64bytes))
-       .int    JMPTBL (L(41bytes), L(table_64bytes))
-       .int    JMPTBL (L(42bytes), L(table_64bytes))
-       .int    JMPTBL (L(43bytes), L(table_64bytes))
-       .int    JMPTBL (L(44bytes), L(table_64bytes))
-       .int    JMPTBL (L(45bytes), L(table_64bytes))
-       .int    JMPTBL (L(46bytes), L(table_64bytes))
-       .int    JMPTBL (L(47bytes), L(table_64bytes))
-       .int    JMPTBL (L(48bytes), L(table_64bytes))
-       .int    JMPTBL (L(49bytes), L(table_64bytes))
-       .int    JMPTBL (L(50bytes), L(table_64bytes))
-       .int    JMPTBL (L(51bytes), L(table_64bytes))
-       .int    JMPTBL (L(52bytes), L(table_64bytes))
-       .int    JMPTBL (L(53bytes), L(table_64bytes))
-       .int    JMPTBL (L(54bytes), L(table_64bytes))
-       .int    JMPTBL (L(55bytes), L(table_64bytes))
-       .int    JMPTBL (L(56bytes), L(table_64bytes))
-       .int    JMPTBL (L(57bytes), L(table_64bytes))
-       .int    JMPTBL (L(58bytes), L(table_64bytes))
-       .int    JMPTBL (L(59bytes), L(table_64bytes))
-       .int    JMPTBL (L(60bytes), L(table_64bytes))
-       .int    JMPTBL (L(61bytes), L(table_64bytes))
-       .int    JMPTBL (L(62bytes), L(table_64bytes))
-       .int    JMPTBL (L(63bytes), L(table_64bytes))
-       .int    JMPTBL (L(64bytes), L(table_64bytes))
-       .int    JMPTBL (L(65bytes), L(table_64bytes))
-       .int    JMPTBL (L(66bytes), L(table_64bytes))
-       .int    JMPTBL (L(67bytes), L(table_64bytes))
-       .int    JMPTBL (L(68bytes), L(table_64bytes))
-       .int    JMPTBL (L(69bytes), L(table_64bytes))
-       .int    JMPTBL (L(70bytes), L(table_64bytes))
-       .int    JMPTBL (L(71bytes), L(table_64bytes))
-       .int    JMPTBL (L(72bytes), L(table_64bytes))
-       .int    JMPTBL (L(73bytes), L(table_64bytes))
-       .int    JMPTBL (L(74bytes), L(table_64bytes))
-       .int    JMPTBL (L(75bytes), L(table_64bytes))
-       .int    JMPTBL (L(76bytes), L(table_64bytes))
-       .int    JMPTBL (L(77bytes), L(table_64bytes))
-       .int    JMPTBL (L(78bytes), L(table_64bytes))
-       .int    JMPTBL (L(79bytes), L(table_64bytes))
-# else
-L(table_64bytes):
-       .int    JMPTBL (L(0bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(4bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(8bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(12bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(16bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(20bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(24bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(28bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(32bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(36bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(40bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(44bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(48bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(52bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(56bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(60bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(64bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(68bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(72bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(76bytes), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-       .int    JMPTBL (L(unreal_case), L(table_64bytes))
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S

index f8b46363d08e28e3664affd9d4ff82897fc78b15..5d87a17e0093d04c034093c4ceeb9259a360940b 100644 (file)
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -29,33 +29,28 @@ ENTRY(memcmp)
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
         jne     1f
         call    __init_cpu_features
-
-1:     testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+        testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
         jnz     2f
-       leaq    __memcmp_sse2(%rip), %rax
-       ret
-
-2:     testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-       jz      3f
-       leaq    __memcmp_sse4_1(%rip), %rax
+1:     testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+       jnz     3f
+2:     leaq    __memcmp_sse2_unaligned(%rip), %rax
         ret
  
  3:     leaq    __memcmp_ssse3(%rip), %rax
         ret
-
  END(memcmp)
  
  # undef ENTRY
  # define ENTRY(name) \
-       .type __memcmp_sse2, @function; \
+       .type __memcmp_sse2_unaligned, @function; \
         .p2align 4; \
-       .globl __memcmp_sse2; \
-       .hidden __memcmp_sse2; \
-       __memcmp_sse2: cfi_startproc; \
+       .globl __memcmp_sse2_unaligned; \
+       .hidden __memcmp_sse2_unaligned; \
+       __memcmp_sse2_unaligned: cfi_startproc; \
         CALL_MCOUNT
  # undef END
  # define END(name) \
-       cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+       cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned
  
  # ifdef SHARED
  #  undef libc_hidden_builtin_def
@@ -63,7 +58,7 @@ END(memcmp)
     they will be called without setting up EBX needed for PLT which is
     used by IFUNC.  */
  #  define libc_hidden_builtin_def(name) \
-       .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+       .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned
  # endif
  #endif
  
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S

index 695a23688bbeea82469f0ebd6f9af34a4b4bf7c8..5dd8d4463408025e6b9007f14b0ad051726c7a73 100644 (file)
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -201,6 +201,10 @@ L(prepare_loop):
         movdqu  %xmm2, 96(%rdi)
         movdqu  %xmm3, 112(%rdi)
  
+#ifdef USE_AVX2
+       vpxor   %xmm5, %xmm5, %xmm5
+#endif
+
         subq    %rsi, %rdi
         add     $64, %rsi
         andq    $-64, %rsi
@@ -348,10 +352,13 @@ L(cross_loop):
         sub     $1, %rcx
         ja      L(cross_loop)
  
+#ifdef USE_AVX2
+       vpxor   %xmm5, %xmm5, %xmm5
+#else
         pxor    %xmm5, %xmm5
         pxor    %xmm6, %xmm6
         pxor    %xmm7, %xmm7
-
+#endif
         lea     -64(%rsi), %rdx
         andq    $-64, %rdx
         addq    %rdx, %rdi
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2-unaligned.S

new file mode 100644 (file)

index 0000000..575f92e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define MEMCMP __wmemcmp_sse2_unaligned
+#define AS_WMEMCMP
+#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S

deleted file mode 100644 (file)

index b07973a..0000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_sse4_1
-
-#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S

index 109e2457fea079ace51908011cb11b22d10eac0a..dabd3edc75b7b7db323e38957088556b1603da24 100644 (file)
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -30,18 +30,16 @@ ENTRY(wmemcmp)
         jne     1f
         call    __init_cpu_features
  
-1:     testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+        testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
         jnz     2f
-       leaq    __wmemcmp_sse2(%rip), %rax
-       ret
-
-2:     testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-       jz      3f
-       leaq    __wmemcmp_sse4_1(%rip), %rax
+1:     testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+       jnz     3f
+2:     leaq    __wmemcmp_sse2_unaligned(%rip), %rax
         ret
  
  3:     leaq    __wmemcmp_ssse3(%rip), %rax
         ret
  
+
  END(wmemcmp)
  #endif
author	Ondřej Bílka <neleai@seznam.cz>
	Thu, 18 Jun 2015 07:02:22 +0000 (09:02 +0200)
committer	Ondřej Bílka <neleai@seznam.cz>
	Wed, 24 Jun 2015 10:48:22 +0000 (12:48 +0200)
sysdeps/x86_64/memcmp.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/Makefile		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/ifunc-impl-list.c		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memcmp-avx2.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/memcmp-sse4.S	[deleted file]	patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/memcmp.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/wmemcmp-sse2-unaligned.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/wmemcmp-sse4.S	[deleted file]	patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/wmemcmp.S		patch \| blob \| blame \| history