+2013-05-16 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/v6/aes-encrypt-internal.asm: Use ALIGN macro. Use 16-byte
+ alignment for loops.
+ * arm/v6/aes-decrypt-internal.asm: Likewise. Also added a nop,
+ which mysteriously improves benchmark performance on Cortex-A9.
+
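For context (an assumed mapping, not part of the patch): the ALIGN macro takes its argument in bytes, whereas the bare .align directive it replaces is interpreted by GNU as on ARM as a power of two, which is why the old ".align 2" lines become ALIGN(4) in the hunks below. A rough sketch of the assumed correspondence, in the file's own style:

	C Assumed expansion of ALIGN(bytes) into a log2-style .align on this target:
	ALIGN(4)	C expands to ".align 2" (2^2 = 4-byte boundary, the old ".align 2")
	ALIGN(16)	C expands to ".align 4" (2^4 = 16-byte boundary, used for the loop heads)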
2013-05-15 Niels Möller <nisse@lysator.liu.se>

 * configure.ac (asm_path): Handle armv6 and armv7 differently from
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
- .align 2
+ ALIGN(4)
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r4,r5,r6,r7,r8,r10,r11,lr}
+ nop C For some mysterious reason, taking out this nop
+ C slows this function down on Cortex-A9.
+ ALIGN(16)
.Lblock_loop:
mov KEY, CTX
AES_LOAD(SRC,KEY,W0)
add TABLE, TABLE, #AES_TABLE0
b .Lentry
- .align 2
+ ALIGN(16)
.Lround_loop:
C Transform X -> W
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
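The nop and the 16-byte loop alignment above are justified purely by measurement. To reproduce the Cortex-A9 observation, one would rebuild and rerun Nettle's benchmark program from the examples directory, for instance (assuming the tool accepts an algorithm name as a filter, which may differ between versions):

	./examples/nettle-benchmark aes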
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
- .align 2
+ ALIGN(4)
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r4,r5,r6,r7,r8,r10,r11,lr}
+ ALIGN(16)
.Lblock_loop:
mov KEY, CTX
AES_LOAD(SRC,KEY,W0)
add TABLE, TABLE, #AES_TABLE0
b .Lentry
- .align 2
+ ALIGN(16)
.Lround_loop:
C Transform X -> W
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)