From da51566b256e0c0536d5b986e676863b0526bf5e Mon Sep 17 00:00:00 2001
From: Ben Avison
Date: Wed, 10 Mar 2021 15:54:44 +0000
Subject: [PATCH] ARM assembly pack: translate bit-sliced AES implementation to AArch64

Reviewed-by: Tomas Mraz
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/14592)
---
 crypto/aes/asm/bsaes-armv8.S | 2338 ++++++++++++++++++++++++++++++++++
 crypto/aes/build.info        |    5 +-
 2 files changed, 2341 insertions(+), 2 deletions(-)
 create mode 100644 crypto/aes/asm/bsaes-armv8.S

diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
new file mode 100644
index 00000000000..9bd02d0c8a9
--- /dev/null
+++ b/crypto/aes/asm/bsaes-armv8.S
@@ -0,0 +1,2338 @@
+// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// ====================================================================
+// Written by Ben Avison for the OpenSSL
+// project. Rights for redistribution and usage in source and binary
+// forms are granted according to the OpenSSL license.
+// ====================================================================
+//
+// This implementation is a translation of bsaes-armv7 for AArch64.
+// No attempt has been made to carry across the build switches for
+// kernel targets, since the Linux kernel crypto support has moved on
+// from when it was based on OpenSSL.
+
+// A lot of hand-scheduling has been performed. Consequently, this code
+// doesn't factor out neatly into macros in the same way that the
+// AArch32 version did, and there is little to be gained by wrapping it
+// up in Perl, so it is presented as pure assembly.
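+
+// For reference, the external entry points defined in this file
+// (bsaes_cbc_encrypt, bsaes_ctr32_encrypt_blocks, bsaes_xts_encrypt and
+// bsaes_xts_decrypt) are assumed to be called from C with the same
+// prototypes as their bsaes-armv7 counterparts, along these lines
+// (AES_KEY as declared in aes.h; names and exact types here are
+// illustrative, matching the register conventions documented at each
+// function below rather than stated in this patch):
+//
+//   void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+//                          size_t length, const AES_KEY *key,
+//                          unsigned char ivec[16], int enc);
+//   void bsaes_ctr32_encrypt_blocks(const unsigned char *in,
+//                                   unsigned char *out, size_t blocks,
+//                                   const AES_KEY *key,
+//                                   const unsigned char ivec[16]);
+//   void bsaes_xts_encrypt(const unsigned char *in, unsigned char *out,
+//                          size_t len, const AES_KEY *key1,
+//                          const AES_KEY *key2, const unsigned char iv[16]);
+//   void bsaes_xts_decrypt(const unsigned char *in, unsigned char *out,
+//                          size_t len, const AES_KEY *key1,
+//                          const AES_KEY *key2, const unsigned char iv[16]);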
+ + +#include "crypto/arm_arch.h" + +.text + +.type _bsaes_decrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_decrypt8: + ldr q8, [x9], #16 + adr x11, .LM0ISR + movi v9.16b, #0x55 + ldr q10, [x11], #16 + movi v16.16b, #0x33 + movi v17.16b, #0x0f + sub x10, x10, #1 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v10.16b + tbl v1.16b, {v1.16b}, v10.16b + tbl v2.16b, {v2.16b}, v10.16b + tbl v4.16b, {v4.16b}, v10.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v10.16b + tbl v5.16b, {v5.16b}, v10.16b + tbl v6.16b, {v6.16b}, v10.16b + ushr v8.2d, v0.2d, #1 + tbl v7.16b, {v7.16b}, v10.16b + ushr v10.2d, v4.2d, #1 + ushr v18.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + ushr v19.2d, v6.2d, #1 + eor v10.16b, v10.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v9.16b + eor v19.16b, v19.16b, v7.16b + and v10.16b, v10.16b, v9.16b + and v18.16b, v18.16b, v9.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v9.16b, v19.16b, v9.16b + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #1 + eor v3.16b, v3.16b, v18.16b + shl v18.2d, v18.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v9.2d, #1 + eor v7.16b, v7.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v18.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v18.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v10.16b, v10.16b, v17.16b + and v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Ldec_sbox +.align 4 +.Ldec_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, 
{v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Ldec_sbox: + eor v1.16b, v1.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + subs x10, x10, #1 + eor v4.16b, v4.16b, v7.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v1.16b, v6.16b + eor v6.16b, v6.16b, v4.16b + eor v2.16b, v2.16b, v5.16b + eor v0.16b, v0.16b, v1.16b + eor v7.16b, v7.16b, v6.16b + eor v8.16b, v6.16b, v2.16b + and v9.16b, v4.16b, v6.16b + eor v10.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v16.16b, v7.16b, v4.16b + eor v17.16b, v4.16b, v0.16b + and v18.16b, v0.16b, v2.16b + eor v19.16b, v7.16b, v4.16b + eor v1.16b, v1.16b, v3.16b + eor v20.16b, v3.16b, v0.16b + eor v21.16b, v5.16b, v2.16b + eor v22.16b, v3.16b, v7.16b + and v8.16b, v17.16b, v8.16b + orr v17.16b, v3.16b, v5.16b + eor v23.16b, v1.16b, v6.16b + eor v24.16b, v20.16b, v16.16b + eor v25.16b, v1.16b, v5.16b + orr v26.16b, v20.16b, v21.16b + and v20.16b, v20.16b, v21.16b + and v27.16b, v7.16b, v1.16b + eor v21.16b, v21.16b, v23.16b + orr v28.16b, v16.16b, v23.16b + orr v29.16b, v22.16b, v25.16b + eor v26.16b, v26.16b, v8.16b + and v16.16b, v16.16b, v23.16b + and v22.16b, v22.16b, v25.16b + and v21.16b, v24.16b, v21.16b + eor v8.16b, v28.16b, v8.16b + eor v23.16b, v5.16b, v2.16b + eor v24.16b, v1.16b, v6.16b + eor v16.16b, v16.16b, v22.16b + eor v22.16b, v3.16b, v0.16b + eor v25.16b, v29.16b, v21.16b + eor v21.16b, v26.16b, v21.16b + eor v8.16b, v8.16b, v20.16b + eor v26.16b, v23.16b, v24.16b + eor v16.16b, v16.16b, v20.16b + eor v28.16b, v22.16b, v19.16b + eor v20.16b, v25.16b, v20.16b + eor v9.16b, v21.16b, v9.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v5.16b, v1.16b + eor v21.16b, v16.16b, v17.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v20.16b, v27.16b + eor v20.16b, v3.16b, v7.16b + eor v25.16b, v9.16b, v8.16b + eor v27.16b, v0.16b, v4.16b + and v29.16b, v9.16b, v17.16b + eor v30.16b, v8.16b, v29.16b + eor v31.16b, v21.16b, v29.16b + eor v29.16b, v21.16b, v29.16b + bsl v30.16b, v17.16b, v21.16b + bsl v31.16b, v9.16b, v8.16b + bsl v16.16b, v30.16b, v29.16b + bsl v21.16b, v29.16b, v30.16b + eor v8.16b, v31.16b, v30.16b + and v1.16b, v1.16b, v31.16b + and v9.16b, v16.16b, v31.16b + and v6.16b, v6.16b, v30.16b + eor v16.16b, v17.16b, v21.16b + and v4.16b, v4.16b, v30.16b + eor v17.16b, v8.16b, v30.16b + and v21.16b, v24.16b, v8.16b + eor v9.16b, v9.16b, v25.16b + and v19.16b, v19.16b, v8.16b + eor v24.16b, v30.16b, v16.16b + eor v25.16b, v30.16b, v16.16b + and v7.16b, v7.16b, v17.16b + and v10.16b, v10.16b, v16.16b + eor v29.16b, v9.16b, v16.16b + eor v30.16b, v31.16b, v9.16b + and v0.16b, v24.16b, v0.16b + and v9.16b, v18.16b, v9.16b + and v2.16b, v25.16b, v2.16b + eor v10.16b, v10.16b, v6.16b + eor v18.16b, v29.16b, v16.16b + and v5.16b, v30.16b, v5.16b + eor v24.16b, v8.16b, v29.16b + and v25.16b, v26.16b, v29.16b + and v26.16b, v28.16b, v29.16b + eor v8.16b, v8.16b, v29.16b + eor v17.16b, v17.16b, v18.16b + eor v5.16b, v1.16b, v5.16b + and v23.16b, v24.16b, v23.16b + eor v21.16b, v21.16b, v25.16b + eor v19.16b, v19.16b, v26.16b + eor v0.16b, v4.16b, v0.16b + and v3.16b, v17.16b, v3.16b + eor v1.16b, v9.16b, v1.16b + eor v9.16b, v25.16b, v23.16b + eor v5.16b, v5.16b, v21.16b + eor v2.16b, v6.16b, v2.16b + and v6.16b, v8.16b, v22.16b + eor v3.16b, v7.16b, v3.16b + and v8.16b, v20.16b, v18.16b + eor v10.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v19.16b + eor v9.16b, v1.16b, v9.16b + eor v1.16b, v2.16b, v21.16b + eor 
v3.16b, v3.16b, v19.16b + and v16.16b, v27.16b, v16.16b + eor v17.16b, v26.16b, v6.16b + eor v6.16b, v8.16b, v7.16b + eor v7.16b, v1.16b, v9.16b + eor v1.16b, v5.16b, v3.16b + eor v2.16b, v10.16b, v3.16b + eor v4.16b, v16.16b, v4.16b + eor v8.16b, v6.16b, v17.16b + eor v5.16b, v9.16b, v3.16b + eor v9.16b, v0.16b, v1.16b + eor v6.16b, v7.16b, v1.16b + eor v0.16b, v4.16b, v17.16b + eor v4.16b, v8.16b, v7.16b + eor v7.16b, v9.16b, v2.16b + eor v8.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + eor v4.16b, v7.16b, v0.16b + eor v7.16b, v8.16b, v3.16b + bcc .Ldec_done + ext v8.16b, v0.16b, v0.16b, #8 + ext v9.16b, v1.16b, v1.16b, #8 + ldr q28, [x11] // load from .LISR in common case (x10 > 0) + ext v10.16b, v6.16b, v6.16b, #8 + ext v16.16b, v3.16b, v3.16b, #8 + ext v17.16b, v5.16b, v5.16b, #8 + ext v18.16b, v4.16b, v4.16b, #8 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v6.16b + eor v16.16b, v16.16b, v3.16b + eor v17.16b, v17.16b, v5.16b + ext v19.16b, v2.16b, v2.16b, #8 + ext v20.16b, v7.16b, v7.16b, #8 + eor v18.16b, v18.16b, v4.16b + eor v6.16b, v6.16b, v8.16b + eor v8.16b, v2.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v19.16b, v2.16b + eor v9.16b, v20.16b, v7.16b + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v16.16b + eor v6.16b, v6.16b, v17.16b + eor v8.16b, v8.16b, v16.16b + eor v7.16b, v7.16b, v18.16b + eor v4.16b, v4.16b, v16.16b + eor v2.16b, v3.16b, v2.16b + eor v1.16b, v1.16b, v17.16b + eor v3.16b, v5.16b, v9.16b + eor v5.16b, v8.16b, v17.16b + eor v7.16b, v7.16b, v17.16b + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v6.16b, v6.16b, #12 + ext v10.16b, v4.16b, v4.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v5.16b, v5.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v3.16b, v3.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v3.16b, v3.16b, v20.16b + eor v17.16b, v17.16b, v4.16b + eor v10.16b, v10.16b, v6.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v3.16b + eor v16.16b, v16.16b, v3.16b + eor v18.16b, v18.16b, v5.16b + eor v19.16b, v19.16b, v7.16b + ext v21.16b, v5.16b, v5.16b, #8 + ext v5.16b, v7.16b, v7.16b, #8 + eor v7.16b, v20.16b, v2.16b + ext v4.16b, v4.16b, v4.16b, #8 + ext v20.16b, v3.16b, v3.16b, #8 + eor v17.16b, v17.16b, v3.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v3.16b, v10.16b, v3.16b + ext v10.16b, v6.16b, v6.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v18.16b + eor v3.16b, v3.16b, v4.16b + eor v7.16b, v20.16b, v7.16b + eor v6.16b, v2.16b, v19.16b + eor v4.16b, v21.16b, v17.16b + eor v2.16b, v10.16b, v9.16b + bne .Ldec_loop + ldr q28, [x11, #16]! 
// load from .LISRM0 on last round (x10 == 0) + b .Ldec_loop +.align 4 +.Ldec_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v2.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v6.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v3.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v4.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v4.16b, v4.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v6.16b, v6.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v3.16b, v3.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v2.2d, #2 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v6.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v4.16b, v4.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v2.16b, v2.16b, v17.16b + ushr v8.2d, v4.2d, #4 + ushr v9.2d, v6.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v3.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v2.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v4.16b, v4.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v2.16b, v2.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v4.16b, v4.16b, v10.16b + eor v6.16b, v6.16b, v10.16b + eor v3.16b, v3.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +// InvShiftRows constants +// Used in _bsaes_decrypt8, which assumes contiguity +// .LM0ISR used with round 0 key +// .LISR used with middle round keys +// .LISRM0 used with final round key +.LM0ISR: +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d + +// ShiftRows constants +// Used in _bsaes_encrypt8, which assumes contiguity +// .LM0SR used with round 0 key +// .LSR used with middle round keys +// .LSRM0 used with final round key +.LM0SR: +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d + +.LM0_bigendian: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LM0_littleendian: +.quad 0x0105090d0004080c, 0x03070b0f02060a0e + +// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into +// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR +.LREVM0SR: +.quad 0x090d01050c000408, 0x03070b0f060a0e02 + 
+.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_encrypt8: + ldr q8, [x9], #16 + adr x11, .LM0SR + ldr q9, [x11], #16 +_bsaes_encrypt8_alt: + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + sub x10, x10, #1 + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v9.16b + tbl v1.16b, {v1.16b}, v9.16b + tbl v2.16b, {v2.16b}, v9.16b + tbl v4.16b, {v4.16b}, v9.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v9.16b + tbl v5.16b, {v5.16b}, v9.16b + tbl v6.16b, {v6.16b}, v9.16b + ushr v8.2d, v0.2d, #1 + movi v10.16b, #0x55 + tbl v7.16b, {v7.16b}, v9.16b + ushr v9.2d, v4.2d, #1 + movi v16.16b, #0x33 + ushr v17.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + movi v18.16b, #0x0f + ushr v19.2d, v6.2d, #1 + eor v9.16b, v9.16b, v5.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v10.16b + eor v19.16b, v19.16b, v7.16b + and v9.16b, v9.16b, v10.16b + and v17.16b, v17.16b, v10.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v10.16b, v19.16b, v10.16b + eor v5.16b, v5.16b, v9.16b + shl v9.2d, v9.2d, #1 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v10.2d, #1 + eor v7.16b, v7.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v17.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v17.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v17.16b, v17.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v17.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v18.16b + and v9.16b, v9.16b, v18.16b + and v10.16b, v10.16b, v18.16b + and v16.16b, v16.16b, v18.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Lenc_sbox +.align 4 +.Lenc_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl 
v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Lenc_sbox: + eor v5.16b, v5.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + subs x10, x10, #1 + eor v2.16b, v2.16b, v1.16b + eor v5.16b, v5.16b, v0.16b + eor v8.16b, v3.16b, v7.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v5.16b + eor v8.16b, v8.16b, v4.16b + eor v3.16b, v6.16b, v3.16b + eor v4.16b, v4.16b, v5.16b + eor v6.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v8.16b, v1.16b + eor v8.16b, v7.16b, v4.16b + eor v9.16b, v3.16b, v0.16b + eor v10.16b, v7.16b, v6.16b + eor v16.16b, v5.16b, v3.16b + eor v17.16b, v6.16b, v2.16b + eor v18.16b, v5.16b, v1.16b + eor v19.16b, v2.16b, v4.16b + eor v20.16b, v1.16b, v0.16b + orr v21.16b, v8.16b, v9.16b + orr v22.16b, v10.16b, v16.16b + eor v23.16b, v8.16b, v17.16b + eor v24.16b, v9.16b, v18.16b + and v19.16b, v19.16b, v20.16b + orr v20.16b, v17.16b, v18.16b + and v8.16b, v8.16b, v9.16b + and v9.16b, v17.16b, v18.16b + and v17.16b, v23.16b, v24.16b + and v10.16b, v10.16b, v16.16b + eor v16.16b, v21.16b, v19.16b + eor v18.16b, v20.16b, v19.16b + and v19.16b, v2.16b, v1.16b + and v20.16b, v6.16b, v5.16b + eor v21.16b, v22.16b, v17.16b + eor v9.16b, v9.16b, v10.16b + eor v10.16b, v16.16b, v17.16b + eor v16.16b, v18.16b, v8.16b + and v17.16b, v4.16b, v0.16b + orr v18.16b, v7.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v8.16b, v9.16b, v8.16b + eor v9.16b, v10.16b, v19.16b + eor v10.16b, v3.16b, v0.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v5.16b, v1.16b + eor v19.16b, v21.16b, v20.16b + eor v20.16b, v8.16b, v18.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v7.16b, v4.16b + eor v21.16b, v9.16b, v16.16b + eor v22.16b, v6.16b, v2.16b + and v23.16b, v9.16b, v19.16b + eor v24.16b, v10.16b, v17.16b + eor v25.16b, v0.16b, v1.16b + eor v26.16b, v7.16b, v6.16b + eor v27.16b, v18.16b, v22.16b + eor v28.16b, v3.16b, v5.16b + eor v29.16b, v16.16b, v23.16b + eor v30.16b, v20.16b, v23.16b + eor v23.16b, v20.16b, v23.16b + eor v31.16b, v4.16b, v2.16b + bsl v29.16b, v19.16b, v20.16b + bsl v30.16b, v9.16b, v16.16b + bsl v8.16b, v29.16b, v23.16b + bsl v20.16b, v23.16b, v29.16b + eor v9.16b, v30.16b, v29.16b + and v5.16b, v5.16b, v30.16b + and v8.16b, v8.16b, v30.16b + and v1.16b, v1.16b, v29.16b + eor v16.16b, v19.16b, v20.16b + and v2.16b, v2.16b, v29.16b + eor v19.16b, v9.16b, v29.16b + and v17.16b, v17.16b, v9.16b + eor v8.16b, v8.16b, v21.16b + and v20.16b, v22.16b, v9.16b + eor v21.16b, v29.16b, v16.16b + eor v22.16b, v29.16b, v16.16b + and v23.16b, v25.16b, v16.16b + and v6.16b, v6.16b, v19.16b + eor v25.16b, v8.16b, v16.16b + eor v29.16b, v30.16b, v8.16b + and v4.16b, v21.16b, v4.16b + and v8.16b, v28.16b, v8.16b + and v0.16b, v22.16b, v0.16b + eor v21.16b, v23.16b, v1.16b + eor v22.16b, v9.16b, v25.16b + eor v9.16b, v9.16b, v25.16b + eor v23.16b, v25.16b, v16.16b + and v3.16b, v29.16b, v3.16b + and v24.16b, v24.16b, v25.16b + and v25.16b, v27.16b, v25.16b + and v10.16b, v22.16b, v10.16b + and v9.16b, v9.16b, v18.16b + eor v18.16b, v19.16b, v23.16b + and v19.16b, v26.16b, v23.16b + eor v3.16b, v5.16b, v3.16b + eor v17.16b, v17.16b, v24.16b + eor v10.16b, v24.16b, v10.16b + and v16.16b, v31.16b, v16.16b + eor v20.16b, v20.16b, v25.16b + eor v9.16b, v25.16b, v9.16b + eor v4.16b, v2.16b, v4.16b + and v7.16b, v18.16b, v7.16b + eor v18.16b, v19.16b, v6.16b + eor v5.16b, v8.16b, v5.16b + eor v0.16b, v1.16b, v0.16b + eor v1.16b, v21.16b, v10.16b + eor v8.16b, v3.16b, v17.16b + 
eor v2.16b, v16.16b, v2.16b + eor v3.16b, v6.16b, v7.16b + eor v6.16b, v18.16b, v9.16b + eor v4.16b, v4.16b, v20.16b + eor v10.16b, v5.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v9.16b, v2.16b, v9.16b + eor v3.16b, v3.16b, v20.16b + eor v7.16b, v6.16b, v1.16b + eor v5.16b, v8.16b, v4.16b + eor v6.16b, v10.16b, v1.16b + eor v2.16b, v4.16b, v0.16b + eor v4.16b, v3.16b, v10.16b + eor v9.16b, v9.16b, v7.16b + eor v3.16b, v0.16b, v5.16b + eor v0.16b, v1.16b, v4.16b + eor v1.16b, v4.16b, v8.16b + eor v4.16b, v9.16b, v5.16b + eor v6.16b, v6.16b, v3.16b + bcc .Lenc_done + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v4.16b, v4.16b, #12 + ldr q28, [x11] + ext v10.16b, v6.16b, v6.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v3.16b, v3.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v5.16b, v5.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v5.16b, v5.16b, v20.16b + eor v17.16b, v17.16b, v6.16b + eor v10.16b, v10.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v5.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v7.16b + ext v3.16b, v3.16b, v3.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 + eor v20.16b, v20.16b, v2.16b + ext v6.16b, v6.16b, v6.16b, #8 + ext v21.16b, v5.16b, v5.16b, #8 + eor v17.16b, v17.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v10.16b, v10.16b, v5.16b + ext v22.16b, v4.16b, v4.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v7.16b, v18.16b + eor v4.16b, v3.16b, v17.16b + eor v3.16b, v6.16b, v10.16b + eor v7.16b, v21.16b, v20.16b + eor v6.16b, v2.16b, v19.16b + eor v2.16b, v22.16b, v9.16b + bne .Lenc_loop + ldr q28, [x11, #16]! 
// load from .LSRM0 on last round (x10 == 0) + b .Lenc_loop +.align 4 +.Lenc_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v3.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v4.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v2.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v6.16b, v6.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v3.16b, v3.16b, v9.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v3.2d, #2 + eor v8.16b, v8.16b, v6.16b + eor v9.16b, v9.16b, v4.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v2.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v6.16b, v6.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v4.16b, v4.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + ushr v8.2d, v6.2d, #4 + ushr v9.2d, v4.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v2.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v2.16b, v2.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v6.16b, v6.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v6.16b, v6.16b, v10.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_key_convert,%function +.align 4 +// On entry: +// x9 -> input key (big-endian) +// x10 = number of rounds +// x17 -> output key (native endianness) +// On exit: +// x9, x10 corrupted +// x11 -> .LM0_bigendian +// x17 -> last quadword of output key +// other general-purpose registers preserved +// v2-v6 preserved +// v7.16b[] = 0x63 +// v8-v14 preserved +// v15 = last round key (converted to native endianness) +// other SIMD registers corrupted +_bsaes_key_convert: +#ifdef __ARMEL__ + adr x11, .LM0_littleendian +#else + adr x11, .LM0_bigendian +#endif + ldr q0, [x9], #16 // load round 0 key + ldr q1, [x11] // .LM0 + ldr q15, [x9], #16 // load round 1 key + + movi v7.16b, #0x63 // compose .L63 + movi v16.16b, #0x01 // bit masks + movi v17.16b, #0x02 + movi v18.16b, #0x04 + movi v19.16b, #0x08 + movi v20.16b, #0x10 + movi v21.16b, #0x20 + movi v22.16b, #0x40 + movi v23.16b, #0x80 + +#ifdef __ARMEL__ + rev32 v0.16b, v0.16b +#endif + sub x10, x10, #1 + str q0, [x17], #16 // save round 0 key + +.align 4 +.Lkey_loop: + tbl v0.16b, {v15.16b}, v1.16b + ldr 
q15, [x9], #16 // load next round key + + eor v0.16b, v0.16b, v7.16b + cmtst v24.16b, v0.16b, v16.16b + cmtst v25.16b, v0.16b, v17.16b + cmtst v26.16b, v0.16b, v18.16b + cmtst v27.16b, v0.16b, v19.16b + cmtst v28.16b, v0.16b, v20.16b + cmtst v29.16b, v0.16b, v21.16b + cmtst v30.16b, v0.16b, v22.16b + cmtst v31.16b, v0.16b, v23.16b + sub x10, x10, #1 + st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key + st1 {v28.16b-v31.16b}, [x17], #64 + cbnz x10, .Lkey_loop + + // don't save last round key +#ifdef __ARMEL__ + rev32 v15.16b, v15.16b + adr x11, .LM0_bigendian +#endif + ret +.size _bsaes_key_convert,.-_bsaes_key_convert + +.globl bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,%function +.align 4 +// On entry: +// x0 -> input ciphertext +// x1 -> output plaintext +// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) +// x3 -> key +// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) +// w5 must be == 0 +// On exit: +// Output plaintext filled in +// Initialisation vector overwritten with last quadword of ciphertext +// No output registers, usual AAPCS64 register preservation +bsaes_cbc_encrypt: + cmp x2, #128 + blo AES_cbc_encrypt + + // it is up to the caller to make sure we are called with enc == 0 + + stp fp, lr, [sp, #-48]! + stp d8, d9, [sp, #16] + stp d10, d15, [sp, #32] + lsr x2, x2, #4 // len in 16 byte blocks + + ldr w15, [x3, #240] // get # of rounds + mov x14, sp + + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x3 // pass key + mov x10, x15 // pass # of rounds + mov sp, x17 // sp is sp + bl _bsaes_key_convert + ldr q6, [sp] + str q15, [x17] // save last round key + eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) + str q6, [sp] + + ldr q15, [x4] // load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs x2, x2, #0x8 + bmi .Lcbc_dec_loop_finish + + ldr q0, [x0], #16 // load input + mov x9, sp // pass the key + ldr q1, [x0], #16 + mov x10, x15 + ldr q2, [x0], #16 + ldr q3, [x0], #16 + ldr q4, [x0], #16 + ldr q5, [x0], #16 + ldr q6, [x0], #16 + ldr q7, [x0], #-7*16 + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + eor v1.16b, v1.16b, v16.16b + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + str q1, [x1], #16 + ldr q1, [x0], #16 + eor v1.16b, v4.16b, v1.16b + ldr q4, [x0], #16 + eor v2.16b, v2.16b, v4.16b + eor v0.16b, v6.16b, v0.16b + ldr q4, [x0], #16 + str q0, [x1], #16 + str q1, [x1], #16 + eor v0.16b, v7.16b, v4.16b + ldr q1, [x0], #16 + str q2, [x1], #16 + ldr q2, [x0], #16 + ldr q15, [x0], #16 + str q0, [x1], #16 + eor v0.16b, v5.16b, v2.16b + eor v1.16b, v3.16b, v1.16b + str q1, [x1], #16 + str q0, [x1], #16 + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds x2, x2, #8 + beq .Lcbc_dec_done + + ldr q0, [x0], #16 // load input + cmp x2, #2 + blo .Lcbc_dec_one + ldr q1, [x0], #16 + mov x9, sp // pass the key + mov x10, x15 + beq .Lcbc_dec_two + ldr q2, [x0], #16 + cmp x2, #4 + blo .Lcbc_dec_three + ldr q3, [x0], #16 + beq .Lcbc_dec_four + ldr q4, [x0], #16 + cmp x2, #6 + blo .Lcbc_dec_five + ldr q5, [x0], #16 + beq .Lcbc_dec_six + ldr q6, [x0], #-6*16 + + bl _bsaes_decrypt8 + + ldr q5, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q8, [x0], #16 + ldr q9, [x0], #16 + ldr q10, [x0], #16 + str q0, [x1], #16 // write output + ldr q0, [x0], 
#16 + eor v1.16b, v1.16b, v5.16b + ldr q5, [x0], #16 + eor v6.16b, v6.16b, v8.16b + ldr q15, [x0] + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + str q1, [x1], #16 + eor v0.16b, v7.16b, v0.16b + str q6, [x1], #16 + eor v1.16b, v3.16b, v5.16b + str q4, [x1], #16 + str q2, [x1], #16 + str q0, [x1], #16 + str q1, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub x0, x0, #0x60 + bl _bsaes_decrypt8 + ldr q3, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q5, [x0], #16 + ldr q8, [x0], #16 + ldr q9, [x0], #16 + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + eor v1.16b, v1.16b, v3.16b + ldr q15, [x0] + eor v3.16b, v6.16b, v5.16b + eor v4.16b, v4.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + str q1, [x1], #16 + eor v0.16b, v7.16b, v0.16b + str q3, [x1], #16 + str q4, [x1], #16 + str q2, [x1], #16 + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub x0, x0, #0x50 + bl _bsaes_decrypt8 + ldr q3, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q5, [x0], #16 + ldr q7, [x0], #16 + ldr q8, [x0], #16 + str q0, [x1], #16 // write output + ldr q15, [x0] + eor v0.16b, v1.16b, v3.16b + eor v1.16b, v6.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + str q0, [x1], #16 + eor v0.16b, v2.16b, v8.16b + str q1, [x1], #16 + str q3, [x1], #16 + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub x0, x0, #0x40 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q3, [x0], #16 + ldr q5, [x0], #16 + str q0, [x1], #16 // write output + ldr q15, [x0] + eor v0.16b, v1.16b, v2.16b + eor v1.16b, v6.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + str q0, [x1], #16 + str q1, [x1], #16 + str q2, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub x0, x0, #0x30 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q3, [x0], #16 + ldr q15, [x0] + str q0, [x1], #16 // write output + eor v0.16b, v1.16b, v2.16b + eor v1.16b, v6.16b, v3.16b + str q0, [x1], #16 + str q1, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub x0, x0, #0x20 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q15, [x0] + str q0, [x1], #16 // write output + eor v0.16b, v1.16b, v2.16b + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub x0, x0, #0x10 + stp x1, x4, [sp, #-32]! 
+ str x14, [sp, #16] + mov v8.16b, v15.16b + mov v15.16b, v0.16b + mov x2, x3 + bl AES_decrypt + ldr x14, [sp, #16] + ldp x1, x4, [sp], #32 + ldr q0, [x1] // load result + eor v0.16b, v0.16b, v8.16b // ^= IV + str q0, [x1] // write output + +.align 4 +.Lcbc_dec_done: + movi v0.16b, #0 + movi v1.16b, #0 +.Lcbc_dec_bzero:// wipe key schedule [if any] + stp q0, q1, [sp], #32 + cmp sp, x14 + bne .Lcbc_dec_bzero + str q15, [x4] // return IV + ldp d8, d9, [sp, #16] + ldp d10, d15, [sp, #32] + ldp fp, lr, [sp], #48 + ret +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt + +.globl bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 4 +// On entry: +// x0 -> input text (whole 16-byte blocks) +// x1 -> output text (whole 16-byte blocks) +// x2 = number of 16-byte blocks to encrypt/decrypt (> 0) +// x3 -> key +// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block +// On exit: +// Output text filled in +// No output registers, usual AAPCS64 register preservation +bsaes_ctr32_encrypt_blocks: + + cmp x2, #8 // use plain AES for + blo .Lctr_enc_short // small sizes + + stp fp, lr, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldr w15, [x3, #240] // get # of rounds + mov x14, sp + + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x3 // pass key + mov x10, x15 // pass # of rounds + mov sp, x17 // sp is sp + bl _bsaes_key_convert + eor v7.16b, v7.16b, v15.16b // fix up last round key + str q7, [x17] // save last round key + + ldr q0, [x4] // load counter + add x13, x11, #.LREVM0SR-.LM0_bigendian + ldr q4, [sp] // load round0 key + + movi v8.4s, #1 // compose 1<<96 + movi v9.16b, #0 + rev32 v15.16b, v0.16b + rev32 v0.16b, v0.16b + ext v11.16b, v9.16b, v8.16b, #4 + rev32 v4.16b, v4.16b + add v12.4s, v11.4s, v11.4s // compose 2<<96 + str q4, [sp] // save adjusted round0 key + add v13.4s, v11.4s, v12.4s // compose 3<<96 + add v14.4s, v12.4s, v12.4s // compose 4<<96 + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + // Intermix prologue from _bsaes_encrypt8 to use the opportunity + // to flip byte order in 32-bit counter + + add v1.4s, v15.4s, v11.4s // +1 + add x9, sp, #0x10 // pass next round key + add v2.4s, v15.4s, v12.4s // +2 + ldr q9, [x13] // .LREVM0SR + ldr q8, [sp] // load round0 key + add v3.4s, v15.4s, v13.4s // +3 + mov x10, x15 // pass rounds + sub x11, x13, #.LREVM0SR-.LSR // pass constants + add v6.4s, v2.4s, v14.4s + add v4.4s, v15.4s, v14.4s // +4 + add v7.4s, v3.4s, v14.4s + add v15.4s, v4.4s, v14.4s // next counter + add v5.4s, v1.4s, v14.4s + + bl _bsaes_encrypt8_alt + + subs x2, x2, #8 + blo .Lctr_enc_loop_done + + ldr q16, [x0], #16 + ldr q17, [x0], #16 + eor v1.16b, v1.16b, v17.16b + ldr q17, [x0], #16 + eor v0.16b, v0.16b, v16.16b + eor v4.16b, v4.16b, v17.16b + str q0, [x1], #16 + ldr q16, [x0], #16 + str q1, [x1], #16 + mov v0.16b, v15.16b + str q4, [x1], #16 + ldr q1, [x0], #16 + eor v4.16b, v6.16b, v16.16b + eor v1.16b, v3.16b, v1.16b + ldr q3, [x0], #16 + eor v3.16b, v7.16b, v3.16b + ldr q6, [x0], #16 + eor v2.16b, v2.16b, v6.16b + ldr q6, [x0], #16 + eor v5.16b, v5.16b, v6.16b + str q4, [x1], #16 + str q1, [x1], #16 + str q3, [x1], #16 + str q2, [x1], #16 + str q5, [x1], #16 + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add x2, x2, #8 + ldr q16, [x0], #16 // load input + eor 
v0.16b, v0.16b, v16.16b + str q0, [x1], #16 // write output + cmp x2, #2 + blo .Lctr_enc_done + ldr q17, [x0], #16 + eor v1.16b, v1.16b, v17.16b + str q1, [x1], #16 + beq .Lctr_enc_done + ldr q18, [x0], #16 + eor v4.16b, v4.16b, v18.16b + str q4, [x1], #16 + cmp x2, #4 + blo .Lctr_enc_done + ldr q19, [x0], #16 + eor v6.16b, v6.16b, v19.16b + str q6, [x1], #16 + beq .Lctr_enc_done + ldr q20, [x0], #16 + eor v3.16b, v3.16b, v20.16b + str q3, [x1], #16 + cmp x2, #6 + blo .Lctr_enc_done + ldr q21, [x0], #16 + eor v7.16b, v7.16b, v21.16b + str q7, [x1], #16 + beq .Lctr_enc_done + ldr q22, [x0] + eor v2.16b, v2.16b, v22.16b + str q2, [x1], #16 + +.Lctr_enc_done: + movi v0.16b, #0 + movi v1.16b, #0 +.Lctr_enc_bzero: // wipe key schedule [if any] + stp q0, q1, [sp], #32 + cmp sp, x14 + bne .Lctr_enc_bzero + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp fp, lr, [sp], #80 + ret + +.Lctr_enc_short: + stp fp, lr, [sp, #-96]! + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + str x23, [sp, #48] + + mov x19, x0 // copy arguments + mov x20, x1 + mov x21, x2 + mov x22, x3 + ldr w23, [x4, #12] // load counter .LSW + ldr q1, [x4] // load whole counter value +#ifdef __ARMEL__ + rev w23, w23 +#endif + str q1, [sp, #80] // copy counter value + +.Lctr_enc_short_loop: + add x0, sp, #80 // input counter value + add x1, sp, #64 // output on the stack + mov x2, x22 // key + + bl AES_encrypt + + ldr q0, [x19], #16 // load input + ldr q1, [sp, #64] // load encrypted counter + add x23, x23, #1 +#ifdef __ARMEL__ + rev w0, w23 + str w0, [sp, #80+12] // next counter value +#else + str w23, [sp, #80+12] // next counter value +#endif + eor v0.16b, v0.16b, v1.16b + str q0, [x20], #16 // store output + subs x21, x21, #1 + bne .Lctr_enc_short_loop + + movi v0.16b, #0 + movi v1.16b, #0 + stp q0, q1, [sp, #64] + + ldr x23, [sp, #48] + ldp x21, x22, [sp, #32] + ldp x19, x20, [sp, #16] + ldp fp, lr, [sp], #96 + ret +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks + +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,%function +.align 4 +// On entry: +// x0 -> input plaintext +// x1 -> output ciphertext +// x2 -> length of text in bytes (must be at least 16) +// x3 -> key1 (used to encrypt the XORed plaintext blocks) +// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) +// x5 -> 16-byte initial vector (typically, sector number) +// On exit: +// Output ciphertext filled in +// No output registers, usual AAPCS64 register preservation +bsaes_xts_encrypt: + // Stack layout: + // sp -> + // nrounds*128-96 bytes: key schedule + // x19 -> + // 16 bytes: frame record + // 4*16 bytes: tweak storage across _bsaes_encrypt8 + // 6*8 bytes: storage for 5 callee-saved general-purpose registers + // 8*8 bytes: storage for 8 callee-saved SIMD registers + stp fp, lr, [sp, #-192]! 
+ stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + str x23, [sp, #112] + stp d8, d9, [sp, #128] + stp d10, d11, [sp, #144] + stp d12, d13, [sp, #160] + stp d14, d15, [sp, #176] + + mov x19, sp + mov x20, x0 + mov x21, x1 + mov x22, x2 + mov x23, x3 + + // generate initial tweak + sub sp, sp, #16 + mov x0, x5 // iv[] + mov x1, sp + mov x2, x4 // key2 + bl AES_encrypt + ldr q11, [sp], #16 + + ldr w1, [x23, #240] // get # of rounds + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x23 // pass key + mov x10, x1 // pass # of rounds + mov sp, x17 + bl _bsaes_key_convert + eor v15.16b, v15.16b, v7.16b // fix up last round key + str q15, [x17] // save last round key + + subs x22, x22, #0x80 + blo .Lxts_enc_short + b .Lxts_enc_loop + +.align 4 +.Lxts_enc_loop: + ldr q8, .Lxts_magic + mov x10, x1 // pass rounds + add x2, x19, #16 + ldr q0, [x20], #16 + sshr v1.2d, v11.2d, #63 + mov x9, sp // pass key schedule + ldr q6, .Lxts_magic+16 + add v2.2d, v11.2d, v11.2d + cmtst v3.2d, v11.2d, v6.2d + and v1.16b, v1.16b, v8.16b + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + ldr q4, [x20], #16 + eor v12.16b, v2.16b, v1.16b + eor v1.16b, v4.16b, v12.16b + eor v0.16b, v0.16b, v11.16b + cmtst v2.2d, v12.2d, v6.2d + add v4.2d, v12.2d, v12.2d + add x0, x19, #16 + ext v3.16b, v3.16b, v3.16b, #8 + and v2.16b, v2.16b, v8.16b + eor v13.16b, v4.16b, v3.16b + ldr q3, [x20], #16 + ext v4.16b, v2.16b, v2.16b, #8 + eor v2.16b, v3.16b, v13.16b + ldr q3, [x20], #16 + add v5.2d, v13.2d, v13.2d + cmtst v7.2d, v13.2d, v6.2d + and v7.16b, v7.16b, v8.16b + ldr q9, [x20], #16 + ext v7.16b, v7.16b, v7.16b, #8 + ldr q10, [x20], #16 + eor v14.16b, v5.16b, v4.16b + ldr q16, [x20], #16 + add v4.2d, v14.2d, v14.2d + eor v3.16b, v3.16b, v14.16b + eor v15.16b, v4.16b, v7.16b + add v5.2d, v15.2d, v15.2d + ldr q7, [x20], #16 + cmtst v4.2d, v14.2d, v6.2d + and v17.16b, v4.16b, v8.16b + cmtst v18.2d, v15.2d, v6.2d + eor v4.16b, v9.16b, v15.16b + ext v9.16b, v17.16b, v17.16b, #8 + eor v9.16b, v5.16b, v9.16b + add v17.2d, v9.2d, v9.2d + and v18.16b, v18.16b, v8.16b + eor v5.16b, v10.16b, v9.16b + str q9, [x2], #16 + ext v10.16b, v18.16b, v18.16b, #8 + cmtst v9.2d, v9.2d, v6.2d + and v9.16b, v9.16b, v8.16b + eor v10.16b, v17.16b, v10.16b + cmtst v17.2d, v10.2d, v6.2d + eor v6.16b, v16.16b, v10.16b + str q10, [x2], #16 + ext v9.16b, v9.16b, v9.16b, #8 + add v10.2d, v10.2d, v10.2d + eor v9.16b, v10.16b, v9.16b + str q9, [x2], #16 + eor v7.16b, v7.16b, v9.16b + add v9.2d, v9.2d, v9.2d + and v8.16b, v17.16b, v8.16b + ext v8.16b, v8.16b, v8.16b, #8 + eor v8.16b, v9.16b, v8.16b + str q8, [x2] // next round tweak + + bl _bsaes_encrypt8 + + ldr q8, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q9, [x0], #16 + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + ldr q10, [x0], #16 + eor v3.16b, v3.16b, v15.16b + subs x22, x22, #0x80 + str q0, [x21], #16 + ldr q11, [x0] // next round tweak + str q1, [x21], #16 + eor v0.16b, v7.16b, v8.16b + eor v1.16b, v2.16b, v9.16b + str q4, [x21], #16 + eor v2.16b, v5.16b, v10.16b + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + str q2, [x21], #16 + bpl .Lxts_enc_loop + +.Lxts_enc_short: + adds x22, x22, #0x70 + bmi .Lxts_enc_done + + ldr q8, .Lxts_magic + sshr v1.2d, v11.2d, #63 + add v2.2d, v11.2d, v11.2d + ldr q9, .Lxts_magic+16 + subs x22, x22, #0x10 + ldr q0, [x20], #16 + and v1.16b, v1.16b, 
v8.16b + cmtst v3.2d, v11.2d, v9.2d + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + eor v12.16b, v2.16b, v1.16b + ext v1.16b, v3.16b, v3.16b, #8 + add v2.2d, v12.2d, v12.2d + cmtst v3.2d, v12.2d, v9.2d + eor v13.16b, v2.16b, v1.16b + and v22.16b, v3.16b, v8.16b + bmi .Lxts_enc_1 + + ext v2.16b, v22.16b, v22.16b, #8 + add v3.2d, v13.2d, v13.2d + ldr q1, [x20], #16 + cmtst v4.2d, v13.2d, v9.2d + subs x22, x22, #0x10 + eor v14.16b, v3.16b, v2.16b + and v23.16b, v4.16b, v8.16b + bmi .Lxts_enc_2 + + ext v3.16b, v23.16b, v23.16b, #8 + add v4.2d, v14.2d, v14.2d + ldr q2, [x20], #16 + cmtst v5.2d, v14.2d, v9.2d + eor v0.16b, v0.16b, v11.16b + subs x22, x22, #0x10 + eor v15.16b, v4.16b, v3.16b + and v24.16b, v5.16b, v8.16b + bmi .Lxts_enc_3 + + ext v4.16b, v24.16b, v24.16b, #8 + add v5.2d, v15.2d, v15.2d + ldr q3, [x20], #16 + cmtst v6.2d, v15.2d, v9.2d + eor v1.16b, v1.16b, v12.16b + subs x22, x22, #0x10 + eor v16.16b, v5.16b, v4.16b + and v25.16b, v6.16b, v8.16b + bmi .Lxts_enc_4 + + ext v5.16b, v25.16b, v25.16b, #8 + add v6.2d, v16.2d, v16.2d + add x0, x19, #16 + cmtst v7.2d, v16.2d, v9.2d + ldr q4, [x20], #16 + eor v2.16b, v2.16b, v13.16b + str q16, [x0], #16 + subs x22, x22, #0x10 + eor v17.16b, v6.16b, v5.16b + and v26.16b, v7.16b, v8.16b + bmi .Lxts_enc_5 + + ext v7.16b, v26.16b, v26.16b, #8 + add v18.2d, v17.2d, v17.2d + ldr q5, [x20], #16 + eor v3.16b, v3.16b, v14.16b + str q17, [x0], #16 + subs x22, x22, #0x10 + eor v18.16b, v18.16b, v7.16b + bmi .Lxts_enc_6 + + ldr q6, [x20], #16 + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + str q18, [x0] // next round tweak + mov x9, sp // pass key schedule + mov x10, x1 + add x0, x19, #16 + sub x22, x22, #0x10 + eor v6.16b, v6.16b, v17.16b + + bl _bsaes_encrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q17, [x0], #16 + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + ldr q11, [x0] // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + eor v1.16b, v2.16b, v17.16b + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_6: + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + ldr q11, [x0] // next round tweak + eor v3.16b, v3.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_5: + eor v3.16b, v3.16b, v14.16b + eor v4.16b, v4.16b, v15.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q11, [x0] // next round tweak + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_4: + eor v2.16b, v2.16b, v13.16b + eor v3.16b, v3.16b, v14.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor 
v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + mov v11.16b, v15.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + str q6, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_3: + eor v1.16b, v1.16b, v12.16b + eor v2.16b, v2.16b, v13.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v13.16b + mov v11.16b, v14.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_2: + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov v11.16b, v13.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_1: + eor v0.16b, v0.16b, v11.16b + sub x0, sp, #16 + sub x1, sp, #16 + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + str q0, [sp, #-16]! + + bl AES_encrypt + + ldr q0, [sp], #16 + trn1 v13.2d, v11.2d, v13.2d + trn1 v11.2d, v12.2d, v14.2d // next round tweak + eor v0.16b, v0.16b, v13.16b + str q0, [x21], #16 + +.Lxts_enc_done: + adds x22, x22, #0x10 + beq .Lxts_enc_ret + + sub x6, x21, #0x10 + // Penultimate plaintext block produces final ciphertext part-block + // plus remaining part of final plaintext block. Move ciphertext part + // to final position and re-use penultimate ciphertext block buffer to + // construct final plaintext block +.Lxts_enc_steal: + ldrb w0, [x20], #1 + ldrb w1, [x21, #-0x10] + strb w0, [x21, #-0x10] + strb w1, [x21], #1 + + subs x22, x22, #1 + bhi .Lxts_enc_steal + + // Finally encrypt the penultimate ciphertext block using the + // last tweak + ldr q0, [x6] + eor v0.16b, v0.16b, v11.16b + str q0, [sp, #-16]! 
+ mov x0, sp + mov x1, sp + mov x2, x23 + mov x21, x6 + mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers + + bl AES_encrypt + + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v11.16b + str q0, [x21] + +.Lxts_enc_ret: + + movi v0.16b, #0 + movi v1.16b, #0 +.Lxts_enc_bzero: // wipe key schedule + stp q0, q1, [sp], #32 + cmp sp, x19 + bne .Lxts_enc_bzero + + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldr x23, [sp, #112] + ldp d8, d9, [sp, #128] + ldp d10, d11, [sp, #144] + ldp d12, d13, [sp, #160] + ldp d14, d15, [sp, #176] + ldp fp, lr, [sp], #192 + ret +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +// The assembler doesn't seem capable of de-duplicating these when expressed +// using `ldr qd,=` syntax, so assign a symbolic address +.align 5 +.Lxts_magic: +.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,%function +.align 4 +// On entry: +// x0 -> input ciphertext +// x1 -> output plaintext +// x2 -> length of text in bytes (must be at least 16) +// x3 -> key1 (used to decrypt the XORed ciphertext blocks) +// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) +// x5 -> 16-byte initial vector (typically, sector number) +// On exit: +// Output plaintext filled in +// No output registers, usual AAPCS64 register preservation +bsaes_xts_decrypt: + // Stack layout: + // sp -> + // nrounds*128-96 bytes: key schedule + // x19 -> + // 16 bytes: frame record + // 4*16 bytes: tweak storage across _bsaes_decrypt8 + // 6*8 bytes: storage for 5 callee-saved general-purpose registers + // 8*8 bytes: storage for 8 callee-saved SIMD registers + stp fp, lr, [sp, #-192]! + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + str x23, [sp, #112] + stp d8, d9, [sp, #128] + stp d10, d11, [sp, #144] + stp d12, d13, [sp, #160] + stp d14, d15, [sp, #176] + + mov x19, sp + mov x20, x0 + mov x21, x1 + mov x22, x2 + mov x23, x3 + + // generate initial tweak + sub sp, sp, #16 + mov x0, x5 // iv[] + mov x1, sp + mov x2, x4 // key2 + bl AES_encrypt + ldr q11, [sp], #16 + + ldr w1, [x23, #240] // get # of rounds + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x23 // pass key + mov x10, x1 // pass # of rounds + mov sp, x17 + bl _bsaes_key_convert + ldr q6, [sp] + str q15, [x17] // save last round key + eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) + str q6, [sp] + + sub x30, x22, #0x10 + tst x22, #0xf // if not multiple of 16 + csel x22, x30, x22, ne // subtract another 16 bytes + subs x22, x22, #0x80 + + blo .Lxts_dec_short + b .Lxts_dec_loop + +.align 4 +.Lxts_dec_loop: + ldr q8, .Lxts_magic + mov x10, x1 // pass rounds + add x2, x19, #16 + ldr q0, [x20], #16 + sshr v1.2d, v11.2d, #63 + mov x9, sp // pass key schedule + ldr q6, .Lxts_magic+16 + add v2.2d, v11.2d, v11.2d + cmtst v3.2d, v11.2d, v6.2d + and v1.16b, v1.16b, v8.16b + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + ldr q4, [x20], #16 + eor v12.16b, v2.16b, v1.16b + eor v1.16b, v4.16b, v12.16b + eor v0.16b, v0.16b, v11.16b + cmtst v2.2d, v12.2d, v6.2d + add v4.2d, v12.2d, v12.2d + add x0, x19, #16 + ext v3.16b, v3.16b, v3.16b, #8 + and v2.16b, v2.16b, v8.16b + eor v13.16b, v4.16b, v3.16b + ldr q3, [x20], #16 + ext v4.16b, v2.16b, v2.16b, #8 + eor v2.16b, v3.16b, v13.16b + ldr q3, [x20], #16 + add v5.2d, v13.2d, v13.2d + cmtst 
v7.2d, v13.2d, v6.2d + and v7.16b, v7.16b, v8.16b + ldr q9, [x20], #16 + ext v7.16b, v7.16b, v7.16b, #8 + ldr q10, [x20], #16 + eor v14.16b, v5.16b, v4.16b + ldr q16, [x20], #16 + add v4.2d, v14.2d, v14.2d + eor v3.16b, v3.16b, v14.16b + eor v15.16b, v4.16b, v7.16b + add v5.2d, v15.2d, v15.2d + ldr q7, [x20], #16 + cmtst v4.2d, v14.2d, v6.2d + and v17.16b, v4.16b, v8.16b + cmtst v18.2d, v15.2d, v6.2d + eor v4.16b, v9.16b, v15.16b + ext v9.16b, v17.16b, v17.16b, #8 + eor v9.16b, v5.16b, v9.16b + add v17.2d, v9.2d, v9.2d + and v18.16b, v18.16b, v8.16b + eor v5.16b, v10.16b, v9.16b + str q9, [x2], #16 + ext v10.16b, v18.16b, v18.16b, #8 + cmtst v9.2d, v9.2d, v6.2d + and v9.16b, v9.16b, v8.16b + eor v10.16b, v17.16b, v10.16b + cmtst v17.2d, v10.2d, v6.2d + eor v6.16b, v16.16b, v10.16b + str q10, [x2], #16 + ext v9.16b, v9.16b, v9.16b, #8 + add v10.2d, v10.2d, v10.2d + eor v9.16b, v10.16b, v9.16b + str q9, [x2], #16 + eor v7.16b, v7.16b, v9.16b + add v9.2d, v9.2d, v9.2d + and v8.16b, v17.16b, v8.16b + ext v8.16b, v8.16b, v8.16b, #8 + eor v8.16b, v9.16b, v8.16b + str q8, [x2] // next round tweak + + bl _bsaes_decrypt8 + + eor v6.16b, v6.16b, v13.16b + eor v0.16b, v0.16b, v11.16b + ldr q8, [x0], #16 + eor v7.16b, v7.16b, v8.16b + str q0, [x21], #16 + eor v0.16b, v1.16b, v12.16b + ldr q1, [x0], #16 + eor v1.16b, v3.16b, v1.16b + subs x22, x22, #0x80 + eor v2.16b, v2.16b, v15.16b + eor v3.16b, v4.16b, v14.16b + ldr q4, [x0], #16 + str q0, [x21], #16 + ldr q11, [x0] // next round tweak + eor v0.16b, v5.16b, v4.16b + str q6, [x21], #16 + str q3, [x21], #16 + str q2, [x21], #16 + str q7, [x21], #16 + str q1, [x21], #16 + str q0, [x21], #16 + bpl .Lxts_dec_loop + +.Lxts_dec_short: + adds x22, x22, #0x70 + bmi .Lxts_dec_done + + ldr q8, .Lxts_magic + sshr v1.2d, v11.2d, #63 + add v2.2d, v11.2d, v11.2d + ldr q9, .Lxts_magic+16 + subs x22, x22, #0x10 + ldr q0, [x20], #16 + and v1.16b, v1.16b, v8.16b + cmtst v3.2d, v11.2d, v9.2d + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + eor v12.16b, v2.16b, v1.16b + ext v1.16b, v3.16b, v3.16b, #8 + add v2.2d, v12.2d, v12.2d + cmtst v3.2d, v12.2d, v9.2d + eor v13.16b, v2.16b, v1.16b + and v22.16b, v3.16b, v8.16b + bmi .Lxts_dec_1 + + ext v2.16b, v22.16b, v22.16b, #8 + add v3.2d, v13.2d, v13.2d + ldr q1, [x20], #16 + cmtst v4.2d, v13.2d, v9.2d + subs x22, x22, #0x10 + eor v14.16b, v3.16b, v2.16b + and v23.16b, v4.16b, v8.16b + bmi .Lxts_dec_2 + + ext v3.16b, v23.16b, v23.16b, #8 + add v4.2d, v14.2d, v14.2d + ldr q2, [x20], #16 + cmtst v5.2d, v14.2d, v9.2d + eor v0.16b, v0.16b, v11.16b + subs x22, x22, #0x10 + eor v15.16b, v4.16b, v3.16b + and v24.16b, v5.16b, v8.16b + bmi .Lxts_dec_3 + + ext v4.16b, v24.16b, v24.16b, #8 + add v5.2d, v15.2d, v15.2d + ldr q3, [x20], #16 + cmtst v6.2d, v15.2d, v9.2d + eor v1.16b, v1.16b, v12.16b + subs x22, x22, #0x10 + eor v16.16b, v5.16b, v4.16b + and v25.16b, v6.16b, v8.16b + bmi .Lxts_dec_4 + + ext v5.16b, v25.16b, v25.16b, #8 + add v6.2d, v16.2d, v16.2d + add x0, x19, #16 + cmtst v7.2d, v16.2d, v9.2d + ldr q4, [x20], #16 + eor v2.16b, v2.16b, v13.16b + str q16, [x0], #16 + subs x22, x22, #0x10 + eor v17.16b, v6.16b, v5.16b + and v26.16b, v7.16b, v8.16b + bmi .Lxts_dec_5 + + ext v7.16b, v26.16b, v26.16b, #8 + add v18.2d, v17.2d, v17.2d + ldr q5, [x20], #16 + eor v3.16b, v3.16b, v14.16b + str q17, [x0], #16 + subs x22, x22, #0x10 + eor v18.16b, v18.16b, v7.16b + bmi .Lxts_dec_6 + + ldr q6, [x20], #16 + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + str q18, [x0] // next round tweak + mov x9, sp // pass key 
schedule + mov x10, x1 + add x0, x19, #16 + sub x22, x22, #0x10 + eor v6.16b, v6.16b, v17.16b + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q17, [x0], #16 + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + eor v2.16b, v2.16b, v15.16b + ldr q11, [x0] // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + eor v1.16b, v3.16b, v17.16b + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_6: + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + ldr q11, [x0] // next round tweak + eor v2.16b, v2.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + str q0, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_5: + eor v3.16b, v3.16b, v14.16b + eor v4.16b, v4.16b, v15.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q11, [x0] // next round tweak + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + eor v2.16b, v2.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_4: + eor v2.16b, v2.16b, v13.16b + eor v3.16b, v3.16b, v14.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + mov v11.16b, v15.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + str q4, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_3: + eor v1.16b, v1.16b, v12.16b + eor v2.16b, v2.16b, v13.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + mov v11.16b, v14.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_2: + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov v11.16b, v13.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_1: + eor v0.16b, v0.16b, v11.16b + sub x0, sp, #16 + sub x1, sp, #16 + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + str q0, [sp, #-16]! 
+ + bl AES_decrypt + + ldr q0, [sp], #16 + trn1 v13.2d, v11.2d, v13.2d + trn1 v11.2d, v12.2d, v14.2d // next round tweak + eor v0.16b, v0.16b, v13.16b + str q0, [x21], #16 + +.Lxts_dec_done: + adds x22, x22, #0x10 + beq .Lxts_dec_ret + + // calculate one round of extra tweak for the stolen ciphertext + ldr q8, .Lxts_magic + sshr v6.2d, v11.2d, #63 + and v6.16b, v6.16b, v8.16b + add v12.2d, v11.2d, v11.2d + ext v6.16b, v6.16b, v6.16b, #8 + eor v12.16b, v12.16b, v6.16b + + // perform the final decryption with the last tweak value + ldr q0, [x20], #16 + eor v0.16b, v0.16b, v12.16b + str q0, [sp, #-16]! + mov x0, sp + mov x1, sp + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + + bl AES_decrypt + + trn1 v12.2d, v12.2d, v14.2d + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v12.16b + str q0, [x21] + + mov x6, x21 + // Penultimate ciphertext block produces final plaintext part-block + // plus remaining part of final ciphertext block. Move plaintext part + // to final position and re-use penultimate plaintext block buffer to + // construct final ciphertext block +.Lxts_dec_steal: + ldrb w1, [x21] + ldrb w0, [x20], #1 + strb w1, [x21, #0x10] + strb w0, [x21], #1 + + subs x22, x22, #1 + bhi .Lxts_dec_steal + + // Finally decrypt the penultimate plaintext block using the + // penultimate tweak + ldr q0, [x6] + eor v0.16b, v0.16b, v11.16b + str q0, [sp, #-16]! + mov x0, sp + mov x1, sp + mov x2, x23 + mov x21, x6 + + bl AES_decrypt + + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v11.16b + str q0, [x21] + +.Lxts_dec_ret: + + movi v0.16b, #0 + movi v1.16b, #0 +.Lxts_dec_bzero: // wipe key schedule + stp q0, q1, [sp], #32 + cmp sp, x19 + bne .Lxts_dec_bzero + + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldr x23, [sp, #112] + ldp d8, d9, [sp, #128] + ldp d10, d11, [sp, #144] + ldp d12, d13, [sp, #160] + ldp d14, d15, [sp, #176] + ldp fp, lr, [sp], #192 + ret +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt diff --git a/crypto/aes/build.info b/crypto/aes/build.info index 66bff1ae73a..b17f7e5c1f7 100644 --- a/crypto/aes/build.info +++ b/crypto/aes/build.info @@ -30,8 +30,8 @@ IF[{- !$disabled{asm} -}] $AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S $AESDEF_armv4=AES_ASM BSAES_ASM - $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S - $AESDEF_aarch64=VPAES_ASM + $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S + $AESDEF_aarch64=BSAES_ASM VPAES_ASM $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s $AESDEF_parisc11=AES_ASM @@ -73,6 +73,7 @@ DEFINE[../../providers/libfips.a]=$AESDEF DEFINE[../../providers/libdefault.a]=$AESDEF GENERATE[aes-ia64.s]=asm/aes-ia64.S +GENERATE[bsaes-armv8.S]=asm/bsaes-armv8.S GENERATE[aes-586.s]=asm/aes-586.pl DEPEND[aes-586.s]=../perlasm/x86asm.pl -- 2.47.2