Improved loop logic, and unrolled

author Niels Möller <nisse@lysator.liu.se>

Wed, 17 Sep 2008 21:03:01 +0000 (23:03 +0200)

committer Niels Möller <nisse@lysator.liu.se>

Wed, 17 Sep 2008 21:03:01 +0000 (23:03 +0200)
author Niels Möller <nisse@lysator.liu.se>
Wed, 17 Sep 2008 21:03:01 +0000 (23:03 +0200)
committer Niels Möller <nisse@lysator.liu.se>
Wed, 17 Sep 2008 21:03:01 +0000 (23:03 +0200)
diff --git a/x86/arcfour-crypt.asm b/x86/arcfour-crypt.asm

index 8bf0453950284e2d92f3ea3672d6bb4863a2a268..bdeb98f03a9d5a986e0af933009ec2f0b4086b93 100644 (file)
--- a/x86/arcfour-crypt.asm
+++ b/x86/arcfour-crypt.asm
@@ -38,48 +38,69 @@ C Input arguments:
         C src = 32(%esp)
  C Register usage:
         C %ebp = ctx
-       C %esi = src (updated through out loop)
-       C %edi = dst (updated through out loop)
-       C %edx = src + length (end of source area)
+       C %esi = src
+       C %edi = dst
+       C %edx = loop counter
         C %eax = i
         C %ebx = j
         C %cl  = si
         C %ch  = sj
  
         movl    24(%esp), %edx          C  length
-       testl   %edx,%edx
-       jz      .Lend
-
         movl    20(%esp), %ebp          C  ctx
-       movl    28(%esp), %edi
-       movl    32(%esp), %esi
-       addl    %esi, %edx              C  Keep src + length
+       movl    28(%esp), %edi          C  dst
+       movl    32(%esp), %esi          C  src
+
+       lea     (%edx, %edi), %edi      C  edi = dst + length (end of dst area)
+       lea     (%edx, %esi), %esi      C  esi = src + length (end of src area)
+       negl    %edx                    C  edx = -length; carry set iff length != 0
+       jnc     .Lend                   C  length == 0: nothing to do
         
         movzbl  ARCFOUR_I (%ebp), %eax  C  i
         movzbl  ARCFOUR_J (%ebp), %ebx  C  j
+
+       incb    %al                     C  Loop keeps i pre-incremented (mod 256)
+       sarl    $1, %edx                C  edx = floor(-length/2); carry = low bit of length
+       jc      .Lloop_odd              C  Odd length: enter at the second half
+       
+       ALIGN(4)
  .Lloop:
-C      incb    %al
+       movb    (%ebp, %eax), %cl       C  si.
+       addb    %cl, %bl                C  j += si (mod 256)
+       movb    (%ebp, %ebx), %ch       C  sj
+       movb    %ch, (%ebp, %eax)       C  S[i] = sj
         incl    %eax
-       andl    $0xff, %eax
-       movzbl  (%ebp, %eax), %ecx      C  si. Clears high bytes
+       movzbl  %al, %eax               C  Wrap the incremented i to 0..255
+       movb    %cl, (%ebp, %ebx)       C  S[j] = si
+       addb    %ch, %cl                C  si + sj (mod 256)
+       movzbl  %cl, %ecx               C  Clear, so it can be used
+                                       C  for indexing.
+       movb    (%ebp, %ecx), %cl       C  cl = S[si + sj]
+       xorb    (%esi, %edx, 2), %cl    C  XOR with byte at (src end) + 2*edx
+       movb    %cl, (%edi, %edx, 2)    C  Store at (dst end) + 2*edx
+
+       C FIXME: Could exchange cl and ch in the second half
+       C and try to interleave instructions better.
+.Lloop_odd:
+       movb    (%ebp, %eax), %cl       C  si.
         addb    %cl, %bl
-C The addl andl is preferable on PPro and PII, but slows thing down on AMD Duron.
-C      addl    %ecx, %ebx
-C      andl    $0xff, %ebx
         movb    (%ebp, %ebx), %ch       C  sj
         movb    %ch, (%ebp, %eax)       C  S[i] = sj
+       incl    %eax                    C  Advance i for the next byte,
+       movzbl  %al, %eax               C  wrapped to 0..255
         movb    %cl, (%ebp, %ebx)       C  S[j] = si
         addb    %ch, %cl
         movzbl  %cl, %ecx               C  Clear, so it can be used
                                         C  for indexing.
         movb    (%ebp, %ecx), %cl
-       xorb    (%esi), %cl
-       incl    %esi
-       movb    %cl, (%edi)
-       incl    %edi
-       cmpl    %esi, %edx
-       jne     .Lloop
+       xorb    1(%esi, %edx, 2), %cl   C  XOR with byte at (src end) + 2*edx + 1
+       incl    %edx                    C  Count up towards 0; sets ZF when done
+       movb    %cl, -1(%edi, %edx, 2)  C  -1 compensates for the incl; movb keeps ZF
+
+       jnz     .Lloop                  C  Tests ZF from the incl above
  
+C .Lloop_done:
+       decb    %al                     C  Undo the pre-increment of i
         movb    %al, ARCFOUR_I (%ebp)           C  Store the new i and j.
         movb    %bl, ARCFOUR_J (%ebp)
  .Lend:
author	Niels Möller <nisse@lysator.liu.se>
	Wed, 17 Sep 2008 21:03:01 +0000 (23:03 +0200)
committer	Niels Möller <nisse@lysator.liu.se>
	Wed, 17 Sep 2008 21:03:01 +0000 (23:03 +0200)