+2010-02-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: Use unsigned
+	conditional jumps.
+	(shl_0_gobble_cache_loop_tail): Removed.
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Likewise.
+
+	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Use unsigned
+	conditional jumps.
+	* sysdeps/i386/i686/multiarch/memset-sse2.S: Likewise.
+
2009-10-27  Aurelien Jarno  <aurelien@aurel32.net>
[BZ #10855]
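In these routines the byte count in %ecx is an unsigned size_t, but the original code branched on the signed conditions (jl, jle, jge), which treat any value with the high bit set as negative. For counts of 2 GB and above those branches go the wrong way; the unsigned conditions (jb, jbe, jae) test the carry flag instead and are correct over the full 32-bit range. A minimal sketch of the difference (illustrative only, not part of the patch; both jumps test the flags left by the same cmp, and the target label is the one used by the code below):

	movl	$0x80000030, %ecx	/* a count of 2 GB + 48 bytes */
	cmp	$48, %ecx		/* %ecx - 48: CF=0, SF=1, OF=0 */
	jge	L(48bytesormore)	/* old, signed: SF != OF, branch not taken */
	jae	L(48bytesormore)	/* new, unsigned: CF=0, branch taken as intended */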
jb L(copy_forward)
je L(fwd_write_0bytes)
cmp $32, %ecx
- jge L(memmove_bwd)
+ jae L(memmove_bwd)
jmp L(bk_write_less32bytes_2)
L(memmove_bwd):
add %ecx, %eax
L(copy_forward):
#endif
cmp $48, %ecx
- jge L(48bytesormore)
+ jae L(48bytesormore)
L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
cmp %dl, %al
- jl L(bk_write)
+ jb L(bk_write)
#endif
add %ecx, %edx
add %ecx, %eax
#endif
mov %eax, %edi
- jge L(large_page)
+ jae L(large_page)
and $0xf, %edi
jz L(shl_0)
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
- jl L(shl_0_end)
+ jb L(shl_0_end)
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
- jl L(shl_0_end)
+ jb L(shl_0_end)
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
- jl L(shl_0_end)
+ jb L(shl_0_end)
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
shr $3, %esi
sub %esi, %edi
cmp %edi, %ecx
- jge L(shl_0_gobble_mem_start)
+ jae L(shl_0_gobble_mem_start)
lea -128(%ecx), %ecx
ALIGN (4)
L(shl_0_gobble_cache_loop):
movaps %xmm7, 0x70(%edx)
lea 0x80(%edx), %edx
- jge L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
+ jae L(shl_0_gobble_cache_loop)
cmp $-0x40, %ecx
lea 0x80(%ecx), %ecx
jl L(shl_0_cache_less_64bytes)
add $0x40, %edx
L(shl_0_cache_less_64bytes):
cmp $0x20, %ecx
- jl L(shl_0_cache_less_32bytes)
+ jb L(shl_0_cache_less_32bytes)
movdqa (%eax), %xmm0
sub $0x20, %ecx
movdqa 0x10(%eax), %xmm1
add $0x20, %edx
L(shl_0_cache_less_32bytes):
cmp $0x10, %ecx
- jl L(shl_0_cache_less_16bytes)
+ jb L(shl_0_cache_less_16bytes)
sub $0x10, %ecx
movdqa (%eax), %xmm0
add $0x10, %eax
movaps %xmm7, 0x70(%edx)
lea 0x80(%edx), %edx
- jge L(shl_0_gobble_mem_loop)
+ jae L(shl_0_gobble_mem_loop)
cmp $-0x40, %ecx
lea 0x80(%ecx), %ecx
jl L(shl_0_mem_less_64bytes)
add $0x40, %edx
L(shl_0_mem_less_64bytes):
cmp $0x20, %ecx
- jl L(shl_0_mem_less_32bytes)
+ jb L(shl_0_mem_less_32bytes)
movdqa (%eax), %xmm0
sub $0x20, %ecx
movdqa 0x10(%eax), %xmm1
add $0x20, %edx
L(shl_0_mem_less_32bytes):
cmp $0x10, %ecx
- jl L(shl_0_mem_less_16bytes)
+ jb L(shl_0_mem_less_16bytes)
sub $0x10, %ecx
movdqa (%eax), %xmm0
add $0x10, %eax
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_1_end)
+ jb L(shl_1_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_2_end)
+ jb L(shl_2_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_3_end)
+ jb L(shl_3_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_4_end)
+ jb L(shl_4_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_5_end)
+ jb L(shl_5_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_6_end)
+ jb L(shl_6_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_7_end)
+ jb L(shl_7_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_8_end)
+ jb L(shl_8_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_9_end)
+ jb L(shl_9_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_10_end)
+ jb L(shl_10_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_11_end)
+ jb L(shl_11_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_12_end)
+ jb L(shl_12_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_13_end)
+ jb L(shl_13_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_14_end)
+ jb L(shl_14_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_15_end)
+ jb L(shl_15_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
sub $0x40, %ecx
L(large_page_less_64bytes):
cmp $32, %ecx
- jl L(large_page_less_32bytes)
+ jb L(large_page_less_32bytes)
movdqu (%eax), %xmm0
movdqu 0x10(%eax), %xmm1
lea 0x20(%eax), %eax
L(bk_aligned_4):
cmp $64, %ecx
- jge L(bk_write_more64bytes)
+ jae L(bk_write_more64bytes)
L(bk_write_64bytesless):
cmp $32, %ecx
- jl L(bk_write_less32bytes)
+ jb L(bk_write_less32bytes)
L(bk_write_more32bytes):
/* Copy 32 bytes at a time. */
ALIGN (4)
L(bk_align):
cmp $8, %ecx
- jle L(bk_write_less32bytes)
+ jbe L(bk_write_less32bytes)
testl $1, %edx
/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
   then (EDX & 2) must be != 0.  */
L(bk_ssse3_cpy_pre):
cmp $64, %ecx
- jl L(bk_write_more32bytes)
+ jb L(bk_write_more32bytes)
L(bk_ssse3_cpy):
sub $64, %esi
movdqu (%esi), %xmm0
movdqa %xmm0, (%edx)
cmp $64, %ecx
- jge L(bk_ssse3_cpy)
+ jae L(bk_ssse3_cpy)
jmp L(bk_write_64bytesless)
#endif
jb L(copy_forward)
je L(fwd_write_0bytes)
cmp $32, %ecx
- jge L(memmove_bwd)
+ jae L(memmove_bwd)
jmp L(bk_write_less32bytes_2)
L(memmove_bwd):
add %ecx, %eax
L(copy_forward):
#endif
cmp $48, %ecx
- jge L(48bytesormore)
+ jae L(48bytesormore)
L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
cmp %dl, %al
- jl L(bk_write)
+ jb L(bk_write)
#endif
add %ecx, %edx
add %ecx, %eax
#endif
mov %eax, %edi
- jge L(large_page)
+ jae L(large_page)
and $0xf, %edi
jz L(shl_0)
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
- jl L(shl_0_end)
+ jb L(shl_0_end)
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
- jl L(shl_0_end)
+ jb L(shl_0_end)
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
- jl L(shl_0_end)
+ jb L(shl_0_end)
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
POP (%edi)
lea -128(%ecx), %ecx
- jge L(shl_0_gobble_mem_loop)
+ jae L(shl_0_gobble_mem_loop)
L(shl_0_gobble_cache_loop):
movdqa (%eax), %xmm0
movdqa 0x10(%eax), %xmm1
movdqa %xmm7, 0x70(%edx)
lea 0x80(%edx), %edx
- jge L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
+ jae L(shl_0_gobble_cache_loop)
cmp $-0x40, %ecx
lea 0x80(%ecx), %ecx
jl L(shl_0_cache_less_64bytes)
add $0x40, %edx
L(shl_0_cache_less_64bytes):
cmp $0x20, %ecx
- jl L(shl_0_cache_less_32bytes)
+ jb L(shl_0_cache_less_32bytes)
movdqa (%eax), %xmm0
sub $0x20, %ecx
movdqa 0x10(%eax), %xmm1
add $0x20, %edx
L(shl_0_cache_less_32bytes):
cmp $0x10, %ecx
- jl L(shl_0_cache_less_16bytes)
+ jb L(shl_0_cache_less_16bytes)
sub $0x10, %ecx
movdqa (%eax), %xmm0
add $0x10, %eax
movdqa %xmm7, 0x70(%edx)
lea 0x80(%edx), %edx
- jge L(shl_0_gobble_mem_loop)
+ jae L(shl_0_gobble_mem_loop)
cmp $-0x40, %ecx
lea 0x80(%ecx), %ecx
jl L(shl_0_mem_less_64bytes)
add $0x40, %edx
L(shl_0_mem_less_64bytes):
cmp $0x20, %ecx
- jl L(shl_0_mem_less_32bytes)
+ jb L(shl_0_mem_less_32bytes)
movdqa (%eax), %xmm0
sub $0x20, %ecx
movdqa 0x10(%eax), %xmm1
add $0x20, %edx
L(shl_0_mem_less_32bytes):
cmp $0x10, %ecx
- jl L(shl_0_mem_less_16bytes)
+ jb L(shl_0_mem_less_16bytes)
sub $0x10, %ecx
movdqa (%eax), %xmm0
add $0x10, %eax
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_1_end)
+ jb L(shl_1_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_2_end)
+ jb L(shl_2_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_3_end)
+ jb L(shl_3_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_4_end)
+ jb L(shl_4_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_5_end)
+ jb L(shl_5_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_6_end)
+ jb L(shl_6_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_7_end)
+ jb L(shl_7_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_8_end)
+ jb L(shl_8_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_9_end)
+ jb L(shl_9_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_10_end)
+ jb L(shl_10_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_11_end)
+ jb L(shl_11_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_12_end)
+ jb L(shl_12_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_13_end)
+ jb L(shl_13_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_14_end)
+ jb L(shl_14_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jl L(shl_15_end)
+ jb L(shl_15_end)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
sub $0x40, %ecx
L(large_page_less_64bytes):
cmp $32, %ecx
- jl L(large_page_less_32bytes)
+ jb L(large_page_less_32bytes)
movdqu (%eax), %xmm0
movdqu 0x10(%eax), %xmm1
lea 0x20(%eax), %eax
L(bk_aligned_4):
cmp $64, %ecx
- jge L(bk_write_more64bytes)
+ jae L(bk_write_more64bytes)
L(bk_write_64bytesless):
cmp $32, %ecx
- jl L(bk_write_less32bytes)
+ jb L(bk_write_less32bytes)
L(bk_write_more32bytes):
/* Copy 32 bytes at a time. */
ALIGN (4)
L(bk_align):
cmp $8, %ecx
- jle L(bk_write_less32bytes)
+ jbe L(bk_write_less32bytes)
testl $1, %edx
/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
   then (EDX & 2) must be != 0.  */
L(bk_ssse3_cpy_pre):
cmp $64, %ecx
- jl L(bk_write_more32bytes)
+ jb L(bk_write_more32bytes)
L(bk_ssse3_cpy):
sub $64, %esi
movdqu (%esi), %xmm0
movdqa %xmm0, (%edx)
cmp $64, %ecx
- jge L(bk_ssse3_cpy)
+ jae L(bk_ssse3_cpy)
jmp L(bk_write_64bytesless)
#endif
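The memset changes below follow the same reasoning. There the loops subtract the chunk size from the remaining count in %ecx and branch on the flags left by the sub, so the signed jge/jl tests likewise misjudge counts with the high bit set. A minimal sketch (illustrative only, not part of the patch; the label is the branch target used by the loop below):

	movl	$0x90000000, %ecx	/* bytes still to set, high bit set */
	sub	$128, %ecx		/* CF=0: at least 128 bytes remained */
	jge	L(128bytesormore_normal)	/* old, signed: result looks negative, loop exits too early */
	jae	L(128bytesormore_normal)	/* new, unsigned: CF=0, loop continues as intended */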
ALIGN (4)
L(aligned_16):
cmp $128, %ecx
- jge L(128bytesormore)
+ jae L(128bytesormore)
L(aligned_16_less128bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
movdqa %xmm0, 0x60(%edx)
movdqa %xmm0, 0x70(%edx)
lea 128(%edx), %edx
- jl L(128bytesless_normal)
+ jb L(128bytesless_normal)
sub $128, %ecx
movdqa %xmm0, 0x60(%edx)
movdqa %xmm0, 0x70(%edx)
lea 128(%edx), %edx
- jge L(128bytesormore_normal)
+ jae L(128bytesormore_normal)
L(128bytesless_normal):
POP (%edi)
ALIGN (4)
L(aligned_16):
cmp $128, %ecx
- jge L(128bytesormore)
+ jae L(128bytesormore)
L(aligned_16_less128bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
movdqa %xmm0, 0x60(%edx)
movdqa %xmm0, 0x70(%edx)
lea 128(%edx), %edx
- jl L(128bytesless_normal)
+ jb L(128bytesless_normal)
sub $128, %ecx
movdqa %xmm0, 0x60(%edx)
movdqa %xmm0, 0x70(%edx)
lea 128(%edx), %edx
- jge L(128bytesormore_normal)
+ jae L(128bytesormore_normal)
L(128bytesless_normal):
lea 128(%ecx), %ecx
movaps %xmm0, 0x70(%edx)
add $128, %edx
cmp $128, %ecx
- jge L(128bytes_L2_normal)
+ jae L(128bytes_L2_normal)
L(128bytesless_L2_normal):
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
movdqa %xmm0, 0x70(%edx)
add $0x80, %edx
cmp $0x80, %ebx
- jge L(128bytesormore_shared_cache_loop)
+ jae L(128bytesormore_shared_cache_loop)
cmp $0x80, %ecx
jb L(shared_cache_loop_end)
ALIGN (4)
movntdq %xmm0, 0x70(%edx)
add $0x80, %edx
cmp $0x80, %ecx
- jge L(128bytesormore_nt)
+ jae L(128bytesormore_nt)
sfence
L(shared_cache_loop_end):
#if defined DATA_CACHE_SIZE || !defined SHARED