From: Niels Möller <nisse@lysator.liu.se>
Date: Wed, 22 May 2013 09:27:58 +0000 (+0200)
Subject: arm: Adapted AES assembly to new interface.
X-Git-Tag: nettle_3.0_release_20140607~207^2~7
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d6fadad818952700af8460399feb0cd5bad899cc;p=thirdparty%2Fnettle.git

arm: Adapted AES assembly to new interface.
---

diff --git a/ChangeLog b/ChangeLog
index 7d3a3454..399eeee9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2013-05-22  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/v6/aes-encrypt-internal.asm: Adapted to new interface.
+	Unfortunately, 4% slowdown on Cortex-A9, for unknown reason.
+	* arm/v6/aes-decrypt-internal.asm: Likewise.
+	* arm/aes-encrypt-internal.asm: Adapted to new interface.
+	* arm/aes-decrypt-internal.asm: Likewise.
+
 2013-05-21  Niels Möller  <nisse@lysator.liu.se>
 
 	* sparc32/aes-encrypt-internal.asm: Adapted to new interface.
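The interface change this patch adapts to can be read directly off the prototype comments that the diff rewrites in each file. Expressed as C declarations (the typedef names below are only illustrative, not anything defined by Nettle; the parameter lists are copied from those comments), the before/after is roughly:

#include <stddef.h>
#include <stdint.h>

struct aes_table;    /* the lookup tables, defined elsewhere in Nettle */
struct aes_context;  /* old-style context holding round count and subkeys */

/* Old interface, per the removed comments: the whole context was passed,
 * so the assembly had to know its layout (the AES_NROUNDS offset etc.). */
typedef void old_aes_crypt_func(struct aes_context *ctx,
                                const struct aes_table *T,
                                size_t length, uint8_t *dst,
                                uint8_t *src);

/* New interface, per the added comments: the round count arrives in r0,
 * the subkey pointer in r1, the table in r2, the length in r3, and dst
 * and src on the stack. */
typedef void new_aes_crypt_func(unsigned rounds, const uint32_t *keys,
                                const struct aes_table *T,
                                size_t length, uint8_t *dst,
                                uint8_t *src);

With the context no longer passed in, the assembly below saves the register arguments it cannot keep live (rounds, keys, and in the armv5 version also length) in its own stack frame instead.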
diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm
index 37abf1ec..94717872 100644
--- a/arm/aes-decrypt-internal.asm
+++ b/arm/aes-decrypt-internal.asm
@@ -19,26 +19,32 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-C	define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
-
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
+
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>)	C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
 define(<X1>, <r3>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
-define(<MASK>, <r0>)	C Overlaps CTX input
-define(<CTX>, <[sp]>)
+
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
 
 
 define(<AES_DECRYPT_ROUND>, <
@@ -103,29 +109,30 @@ define(<AES_DECRYPT_ROUND>, <
 	.file "aes-decrypt-internal.asm"
 
-	C _aes_decrypt(struct aes_context *ctx,
+	C _aes_decrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_decrypt)
-	teq	LENGTH, #0
+	teq	PARAM_LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
-	push	{r0, r4,r5,r6,r7,r8,r10,r11,lr}
+	push	{r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 	mov	MASK, #0x3fc
 	ALIGN(16)
 .Lblock_loop:
-	ldr	KEY, CTX
-	ldr	ROUND, [KEY, #+AES_NROUNDS]
-	AES_LOAD(SRC,KEY,W0)
-	AES_LOAD(SRC,KEY,W1)
-	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
-
-	push	{LENGTH, DST, SRC}
+	ldr	X0, FRAME_SRC	C Use X0 as SRC pointer
+	ldm	sp, {COUNT, KEY}
+
+	AES_LOAD(X0,KEY,W0)
+	AES_LOAD(X0,KEY,W1)
+	AES_LOAD(X0,KEY,W2)
+	AES_LOAD(X0,KEY,W3)
+
+	str	X0, FRAME_SRC
+
 	add	TABLE, TABLE, #AES_TABLE0
 
 	b	.Lentry
 
@@ -135,31 +142,35 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C	Transform W -> X
 	AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
-	lsr	ROUND, MASK, #2	C Put the needed mask in the unused ROUND register
+	lsr	COUNT, MASK, #2	C Put the needed mask in the unused COUNT register
 	sub	TABLE, TABLE, #AES_TABLE0
 
 	C	Final round
-	AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, ROUND)
-	AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, ROUND)
-	AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, ROUND)
-	AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, ROUND)
+	AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, COUNT)
+	AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, COUNT)
+	AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, COUNT)
+	AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, COUNT)
 
-	pop	{LENGTH, DST, SRC}
-
-	AES_STORE(DST,W0)
-	AES_STORE(DST,W1)
-	AES_STORE(DST,W2)
-	AES_STORE(DST,W3)
+	ldr	X0, FRAME_DST
+	ldr	X1, FRAME_LENGTH
+
+	AES_STORE(X0,W0)
+	AES_STORE(X0,W1)
+	AES_STORE(X0,W2)
+	AES_STORE(X0,W3)
+
+	subs	X1, X1, #16
+	str	X0, FRAME_DST
+	str	X1, FRAME_LENGTH
 
-	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
-	add	sp, sp, #4	C Drop saved r0
+	add	sp, sp, #12	C Drop saved r0, r1, r3
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
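In both armv5 files the FRAME_* offsets follow from the new prologue, which pushes eleven 32-bit registers ({r0,r1,r3, r4-r8, r10, r11, lr}); the dst and src arguments that the caller passed on the stack therefore sit 44 and 48 bytes above the new stack pointer. A minimal C model of that layout (struct and field names are mine; only the offsets come from the patch):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Stack as seen immediately after
 *     push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 * in the armv5 versions.  The two words above the pushed registers are the
 * 5th and 6th function arguments (dst, src), already on the stack per the
 * ARM calling convention. */
struct aes_v5_frame {
	uint32_t rounds;          /* [sp]       saved r0, reloaded into COUNT */
	uint32_t keys;            /* [sp, #+4]  saved r1, reloaded into KEY */
	uint32_t length;          /* [sp, #+8]  saved r3, the FRAME_LENGTH slot */
	uint32_t callee_saved[7]; /* r4-r8, r10, r11 */
	uint32_t lr;              /* return address */
	uint32_t dst;             /* [sp, #+44] the FRAME_DST slot */
	uint32_t src;             /* [sp, #+48] the FRAME_SRC slot */
};

static_assert(offsetof(struct aes_v5_frame, dst) == 44, "matches FRAME_DST");
static_assert(offsetof(struct aes_v5_frame, src) == 48, "matches FRAME_SRC");

Because all four argument registers except TABLE are reused inside the block loop, src, dst and the remaining length are kept in these slots and reloaded or updated once per block.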
diff --git a/arm/aes-encrypt-internal.asm b/arm/aes-encrypt-internal.asm
index eb2f1489..0d396185 100644
--- a/arm/aes-encrypt-internal.asm
+++ b/arm/aes-encrypt-internal.asm
@@ -19,32 +19,38 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-C	Benchmarked at at 725, 930, 990 cycles/block on cortex A9,
+C	Benchmarked at at 725, 815, 990 cycles/block on cortex A9,
 C	for 128, 192 and 256 bit key sizes.
 
 C	Possible improvements: More efficient load and store with
 C	aligned accesses. Better scheduling.
 
-C	define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
-
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
+
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>)	C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
 define(<X1>, <r3>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
-define(<MASK>, <r0>)	C Overlaps CTX input
-define(<CTX>, <[sp]>)
+
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
 
 C	AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
 define(<AES_ENCRYPT_ROUND>, <
@@ -112,29 +118,30 @@
 	.file "aes-encrypt-internal.asm"
 
-	C _aes_encrypt(struct aes_context *ctx,
+	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_encrypt)
-	teq	LENGTH, #0
+	teq	PARAM_LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
-	push	{r0, r4,r5,r6,r7,r8,r10,r11,lr}
+	push	{r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 	mov	MASK, #0x3fc
 	ALIGN(16)
 .Lblock_loop:
-	ldr	KEY, CTX
-	ldr	ROUND, [KEY, #+AES_NROUNDS]
-	AES_LOAD(SRC,KEY,W0)
-	AES_LOAD(SRC,KEY,W1)
-	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
-
-	push	{LENGTH, DST, SRC}
+	ldr	X0, FRAME_SRC	C Use X0 as SRC pointer
+	ldm	sp, {COUNT, KEY}
+
+	AES_LOAD(X0,KEY,W0)
+	AES_LOAD(X0,KEY,W1)
+	AES_LOAD(X0,KEY,W2)
+	AES_LOAD(X0,KEY,W3)
+
+	str	X0, FRAME_SRC
+
 	add	TABLE, TABLE, #AES_TABLE0
 
 	b	.Lentry
 
@@ -144,31 +151,35 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C	Transform W -> X
 	AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
-	lsr	ROUND, MASK, #2	C Put the needed mask in the unused ROUND register
+	lsr	COUNT, MASK, #2	C Put the needed mask in the unused COUNT register
 	sub	TABLE, TABLE, #AES_TABLE0
 
 	C	Final round
-	AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, ROUND)
-	AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, ROUND)
-	AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, ROUND)
-	AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, ROUND)
+	AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, COUNT)
+	AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, COUNT)
+	AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, COUNT)
+	AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, COUNT)
 
-	pop	{LENGTH, DST, SRC}
-
-	AES_STORE(DST,W0)
-	AES_STORE(DST,W1)
-	AES_STORE(DST,W2)
-	AES_STORE(DST,W3)
+	ldr	X0, FRAME_DST
+	ldr	X1, FRAME_LENGTH
+
+	AES_STORE(X0,W0)
+	AES_STORE(X0,W1)
+	AES_STORE(X0,W2)
+	AES_STORE(X0,W3)
+
+	subs	X1, X1, #16
+	str	X0, FRAME_DST
+	str	X1, FRAME_LENGTH
 
-	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
-	add	sp, sp, #4	C Drop saved r0
+	add	sp, sp, #12	C Drop saved r0, r1, r3
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
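All four files share the same round-loop shape: AES_LOAD folds the initial key addition into the block load (which is why exactly COUNT = rounds key additions remain afterwards), the code between .Lround_loop and .Lentry then retires two rounds per pass while COUNT is counted down by 2, and AES_FINAL_ROUND_* handles the last, MixColumns-free round. A rough C model of that control flow, where round_step and final_step are placeholder helpers (declared but not implemented here) rather than Nettle functions:

#include <stdint.h>

/* Placeholders only: one table-driven AES round reading 'in', writing
 * 'out', consuming 16 bytes of round key at *keys and advancing it. */
void round_step(uint32_t out[4], const uint32_t in[4], const uint32_t **keys);
void final_step(uint32_t out[4], const uint32_t in[4], const uint32_t **keys);

/* One block, mirroring the .Lentry / .Lround_loop structure. */
void
one_block(uint32_t W[4], unsigned rounds, const uint32_t *keys)
{
	uint32_t X[4];
	unsigned count = rounds;        /* the COUNT register */

	count -= 2;                     /* .Lentry: subs COUNT, COUNT, #2 */
	round_step(X, W, &keys);        /* transform W -> X */
	while (count != 0) {            /* bne .Lround_loop */
		round_step(W, X, &keys);        /* .Lround_loop: X -> W */
		count -= 2;
		round_step(X, W, &keys);        /* .Lentry: W -> X */
	}
	final_step(W, X, &keys);        /* AES_FINAL_ROUND_V5/V6: X -> W */
}

For rounds = 10 (AES-128) this performs 1 + 4*2 = 9 full rounds plus the final round, i.e. 10 key additions after the one done during AES_LOAD, matching the 11 subkeys of the schedule.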
diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm
index f550506d..f9f0b7ad 100644
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -19,25 +19,33 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>)	C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <%r12>)	C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
 
 C	AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
 define(<AES_DECRYPT_ROUND>, <
@@ -102,7 +110,7 @@ define(<AES_DECRYPT_ROUND>, <
 	.file "aes-decrypt-internal.asm"
 
-	C _aes_decrypt(struct aes_context *ctx,
+	C _aes_decrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
@@ -111,22 +119,23 @@ define(<AES_DECRYPT_ROUND>, <
 PROLOGUE(_nettle_aes_decrypt)
 	teq	LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
-	push	{r4,r5,r6,r7,r8,r10,r11,lr}
-	nop	C For some mysterious reason, taking out this nop
-	C	slows this function down by 10(!) % on Cortex-A9.
+	ldr	SRC, [sp, #+4]
+
+	push	{r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
 	ALIGN(16)
 .Lblock_loop:
-	mov	KEY, CTX
+	ldm	sp, {COUNT, KEY}
+
+	add	TABLE, TABLE, #AES_TABLE0
+
 	AES_LOAD(SRC,KEY,W0)
 	AES_LOAD(SRC,KEY,W1)
 	AES_LOAD(SRC,KEY,W2)
 	AES_LOAD(SRC,KEY,W3)
 
-	push	{LENGTH, DST, SRC}
-	ldr	ROUND, [CTX, #+AES_NROUNDS]
-	add	TABLE, TABLE, #AES_TABLE0
+	str	SRC, FRAME_SRC
 
 	b	.Lentry
 	ALIGN(16)
@@ -135,29 +144,34 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C	Transform W -> X
 	AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
 	sub	TABLE, TABLE, #AES_TABLE0
 
+	C	Final round
+	ldr	DST, FRAME_DST
+
 	AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
 	AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
 	AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
 	AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	SRC, FRAME_SRC
 
 	AES_STORE(DST,W0)
 	AES_STORE(DST,W1)
 	AES_STORE(DST,W2)
 	AES_STORE(DST,W3)
+	str	DST, FRAME_DST
 
 	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
+	add	sp, sp, #8	C Drop saved r0, r1
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
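The v6 versions push only ten registers ({r0,r1, r4-r8, r10, r11, lr}), so the stacked dst/src arguments land at offsets 40 and 44 rather than 44 and 48; and because SRC and DST are aliased onto registers that the inner loop clobbers (r12 and COUNT), they are spilled to those slots around each block while LENGTH stays live in r3. The same kind of model as above, again with my own struct and field names:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Stack right after "push {r0,r1, r4,...,r11,lr}" in the arm/v6 versions. */
struct aes_v6_frame {
	uint32_t rounds;          /* [sp]       saved r0, reloaded into COUNT */
	uint32_t keys;            /* [sp, #+4]  saved r1, reloaded into KEY */
	uint32_t callee_saved[7]; /* r4-r8, r10, r11 */
	uint32_t lr;              /* return address */
	uint32_t dst;             /* [sp, #+40] the FRAME_DST slot */
	uint32_t src;             /* [sp, #+44] the FRAME_SRC slot */
};

static_assert(offsetof(struct aes_v6_frame, dst) == 40, "matches FRAME_DST");
static_assert(offsetof(struct aes_v6_frame, src) == 44, "matches FRAME_SRC");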
diff --git a/arm/v6/aes-encrypt-internal.asm b/arm/v6/aes-encrypt-internal.asm
index 3cf13072..3c817de1 100644
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
@@ -19,31 +19,39 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-C	Benchmarked at at 680, 818, 929 cycles/block on cortex A9,
+C	Benchmarked at at 706, 870, 963 cycles/block on cortex A9,
 C	for 128, 192 and 256 bit key sizes.
 
 C	Possible improvements: More efficient load and store with
 C	aligned accesses. Better scheduling.
 
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>)	C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <%r12>)	C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
 
 C	53 instr.
 C	It's tempting to use eor with rotation, but that's slower.
@@ -110,7 +118,7 @@ define(<AES_ENCRYPT_ROUND>, <
 	.file "aes-encrypt-internal.asm"
 
-	C _aes_encrypt(struct aes_context *ctx,
+	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
@@ -119,20 +127,23 @@ define(<AES_ENCRYPT_ROUND>, <
 PROLOGUE(_nettle_aes_encrypt)
 	teq	LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
-	push	{r4,r5,r6,r7,r8,r10,r11,lr}
+	ldr	SRC, [sp, #+4]
+
+	push	{r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
 	ALIGN(16)
 .Lblock_loop:
-	mov	KEY, CTX
+	ldm	sp, {COUNT, KEY}
+
+	add	TABLE, TABLE, #AES_TABLE0
+
 	AES_LOAD(SRC,KEY,W0)
 	AES_LOAD(SRC,KEY,W1)
 	AES_LOAD(SRC,KEY,W2)
 	AES_LOAD(SRC,KEY,W3)
 
-	push	{LENGTH, DST, SRC}
-	ldr	ROUND, [CTX, #+AES_NROUNDS]
-	add	TABLE, TABLE, #AES_TABLE0
+	str	SRC, FRAME_SRC
 
 	b	.Lentry
 	ALIGN(16)
@@ -141,29 +152,34 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C	Transform W -> X
 	AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
 	sub	TABLE, TABLE, #AES_TABLE0
 
+	C	Final round
+	ldr	DST, FRAME_DST
+
 	AES_FINAL_ROUND_V6(X0, X1, X2, X3, KEY, W0)
 	AES_FINAL_ROUND_V6(X1, X2, X3, X0, KEY, W1)
 	AES_FINAL_ROUND_V6(X2, X3, X0, X1, KEY, W2)
 	AES_FINAL_ROUND_V6(X3, X0, X1, X2, KEY, W3)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	SRC, FRAME_SRC
 
 	AES_STORE(DST,W0)
 	AES_STORE(DST,W1)
 	AES_STORE(DST,W2)
 	AES_STORE(DST,W3)
+	str	DST, FRAME_DST
 
 	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
+	add	sp, sp, #8	C Drop saved r0, r1
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
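As a cross-check on the ChangeLog entry, the updated benchmark comments in the v6 file give (706 - 680) / 680 ≈ 3.8 % for 128-bit keys and (963 - 929) / 929 ≈ 3.7 % for 256-bit keys, matching the quoted "4% slowdown on Cortex-A9"; the 192-bit case is worse at (870 - 818) / 818 ≈ 6.4 %. The armv5 encrypt comment, by contrast, changes its 192-bit figure from 930 to 815 cycles/block. These figures are read directly off the comments in this patch.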