crypto: riscv - add vector crypto accelerated AES-CBC-CTS

author Eric Biggers <ebiggers@google.com>

Tue, 13 Feb 2024 05:54:42 +0000 (21:54 -0800)

committer Palmer Dabbelt <palmer@rivosinc.com>

Wed, 20 Mar 2024 15:56:11 +0000 (08:56 -0700)
author Eric Biggers <ebiggers@google.com>
Tue, 13 Feb 2024 05:54:42 +0000 (21:54 -0800)
committer Palmer Dabbelt <palmer@rivosinc.com>
Wed, 20 Mar 2024 15:56:11 +0000 (08:56 -0700)
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig

index 2ad44e1d464afdc1bcf4633bddab8c978f08d717..ad58dad9a580761a6ab6f10365bf93666bda9e35 100644 (file)
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -3,14 +3,14 @@
  menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
  
  config CRYPTO_AES_RISCV64
-       tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS"
+       tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS"
         depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
         select CRYPTO_ALGAPI
         select CRYPTO_LIB_AES
         select CRYPTO_SKCIPHER
         help
           Block cipher: AES cipher algorithms
-         Length-preserving ciphers: AES with ECB, CBC, CTR, XTS
+         Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTS
  
           Architecture: riscv64 using:
           - Zvkned vector crypto extension
diff --git a/arch/riscv/crypto/aes-riscv64-glue.c b/arch/riscv/crypto/aes-riscv64-glue.c

index 37bc6ef0be40e5c0b28c528a26c58dd792eeaebd..f814ee048555b1cf39f50b2e8afc370a8baf88ed 100644 (file)
--- a/arch/riscv/crypto/aes-riscv64-glue.c
+++ b/arch/riscv/crypto/aes-riscv64-glue.c
@@ -1,13 +1,15 @@
  // SPDX-License-Identifier: GPL-2.0-only
  /*
   * AES using the RISC-V vector crypto extensions.  Includes the bare block
- * cipher and the ECB, CBC, CTR, and XTS modes.
+ * cipher and the ECB, CBC, CBC-CTS, CTR, and XTS modes.
   *
   * Copyright (C) 2023 VRULL GmbH
   * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
   *
   * Copyright (C) 2023 SiFive, Inc.
   * Author: Jerry Shih <jerry.shih@sifive.com>
+ *
+ * Copyright 2024 Google LLC
   */
  
  #include <asm/simd.h>
@@ -40,6 +42,10 @@ asmlinkage void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key,
                                        const u8 *in, u8 *out, size_t len,
                                        u8 iv[AES_BLOCK_SIZE]);
  
+asmlinkage void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
+                                        const u8 *in, u8 *out, size_t len,
+                                        const u8 iv[AES_BLOCK_SIZE], bool enc);
+
  asmlinkage void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key,
                                             const u8 *in, u8 *out, size_t len,
                                             u8 iv[AES_BLOCK_SIZE]);
@@ -164,7 +170,7 @@ static int riscv64_aes_ecb_decrypt(struct skcipher_request *req)
  
  /* AES-CBC */
  
-static inline int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc)
+static int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc)
  {
         struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
         const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -202,6 +208,70 @@ static int riscv64_aes_cbc_decrypt(struct skcipher_request *req)
         return riscv64_aes_cbc_crypt(req, false);
  }
  
+/* AES-CBC-CTS */
+
+static int riscv64_aes_cbc_cts_crypt(struct skcipher_request *req, bool enc)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct scatterlist sg_src[2], sg_dst[2];
+       struct skcipher_request subreq;
+       struct scatterlist *src, *dst;
+       struct skcipher_walk walk;
+       unsigned int cbc_len;
+       int err;
+
+       if (req->cryptlen < AES_BLOCK_SIZE)
+               return -EINVAL;
+
+       err = skcipher_walk_virt(&walk, req, false);
+       if (err)
+               return err;
+       /*
+        * If the full message is available in one step, decrypt it in one call
+        * to the CBC-CTS assembly function.  This reduces overhead, especially
+        * on short messages.  Otherwise, fall back to doing CBC up to the last
+        * two blocks, then invoke CTS just for the ciphertext stealing.
+        */
+       if (unlikely(walk.nbytes != req->cryptlen)) {
+               cbc_len = round_down(req->cryptlen - AES_BLOCK_SIZE - 1,
+                                    AES_BLOCK_SIZE);
+               skcipher_walk_abort(&walk);
+               skcipher_request_set_tfm(&subreq, tfm);
+               skcipher_request_set_callback(&subreq,
+                                             skcipher_request_flags(req),
+                                             NULL, NULL);
+               skcipher_request_set_crypt(&subreq, req->src, req->dst,
+                                          cbc_len, req->iv);
+               err = riscv64_aes_cbc_crypt(&subreq, enc);
+               if (err)
+                       return err;
+               dst = src = scatterwalk_ffwd(sg_src, req->src, cbc_len);
+               if (req->dst != req->src)
+                       dst = scatterwalk_ffwd(sg_dst, req->dst, cbc_len);
+               skcipher_request_set_crypt(&subreq, src, dst,
+                                          req->cryptlen - cbc_len, req->iv);
+               err = skcipher_walk_virt(&walk, &subreq, false);
+               if (err)
+                       return err;
+       }
+       kernel_vector_begin();
+       aes_cbc_cts_crypt_zvkned(ctx, walk.src.virt.addr, walk.dst.virt.addr,
+                                walk.nbytes, req->iv, enc);
+       kernel_vector_end();
+       return skcipher_walk_done(&walk, 0);
+}
+
+static int riscv64_aes_cbc_cts_encrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_cbc_cts_crypt(req, true);
+}
+
+static int riscv64_aes_cbc_cts_decrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_cbc_cts_crypt(req, false);
+}
+
  /* AES-CTR */
  
  static int riscv64_aes_ctr_crypt(struct skcipher_request *req)
@@ -434,6 +504,22 @@ static struct skcipher_alg riscv64_zvkned_aes_skcipher_algs[] = {
                         .cra_driver_name = "cbc-aes-riscv64-zvkned",
                         .cra_module = THIS_MODULE,
                 },
+       }, {
+               .setkey = riscv64_aes_setkey_skcipher,
+               .encrypt = riscv64_aes_cbc_cts_encrypt,
+               .decrypt = riscv64_aes_cbc_cts_decrypt,
+               .min_keysize = AES_MIN_KEY_SIZE,
+               .max_keysize = AES_MAX_KEY_SIZE,
+               .ivsize = AES_BLOCK_SIZE,
+               .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
+               .base = {
+                       .cra_blocksize = AES_BLOCK_SIZE,
+                       .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+                       .cra_priority = 300,
+                       .cra_name = "cts(cbc(aes))",
+                       .cra_driver_name = "cts-cbc-aes-riscv64-zvkned",
+                       .cra_module = THIS_MODULE,
+               },
         }
  };
  
@@ -540,11 +626,12 @@ static void __exit riscv64_aes_mod_exit(void)
  module_init(riscv64_aes_mod_init);
  module_exit(riscv64_aes_mod_exit);
  
-MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS (RISC-V accelerated)");
+MODULE_DESCRIPTION("AES-ECB/CBC/CTS/CTR/XTS (RISC-V accelerated)");
  MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_CRYPTO("aes");
  MODULE_ALIAS_CRYPTO("ecb(aes)");
  MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
  MODULE_ALIAS_CRYPTO("ctr(aes)");
  MODULE_ALIAS_CRYPTO("xts(aes)");
diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S

index 43541aad6386cc89f5214dcc5b7445027fdc0e35..23d063f94ce61d42646dfec778146b1c2529daf2 100644 (file)
--- a/arch/riscv/crypto/aes-riscv64-zvkned.S
+++ b/arch/riscv/crypto/aes-riscv64-zvkned.S
@@ -184,3 +184,156 @@ SYM_FUNC_START(aes_cbc_decrypt_zvkned)
  192:
         aes_cbc_decrypt 192
  SYM_FUNC_END(aes_cbc_decrypt_zvkned)
+
+.macro aes_cbc_cts_encrypt     keylen
+
+       // CBC-encrypt all blocks except the last.  But don't store the
+       // second-to-last block to the output buffer yet, since it will be
+       // handled specially in the ciphertext stealing step.  Exception: if the
+       // message is single-block, still encrypt the last (and only) block.
+       li              t0, 16
+       j               2f
+1:
+       vse32.v         v16, (OUTP)     // Store ciphertext block
+       addi            OUTP, OUTP, 16
+2:
+       vle32.v         v17, (INP)      // Load plaintext block
+       vxor.vv         v16, v16, v17   // XOR with IV or prev ciphertext block
+       aes_encrypt     v16, \keylen    // Encrypt
+       addi            INP, INP, 16
+       addi            LEN, LEN, -16
+       bgt             LEN, t0, 1b     // Repeat if more than one block remains
+
+       // Special case: if the message is a single block, just do CBC.
+       beqz            LEN, .Lcts_encrypt_done\@
+
+       // Encrypt the last two blocks using ciphertext stealing as follows:
+       //      C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
+       //      C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
+       //
+       // C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
+       // plaintext block.  Block n, the last block, may be partial; its length
+       // is 1 <= LEN <= 16.  If there are only 2 blocks, C[n-2] means the IV.
+       //
+       // v16 already contains Encrypt(P[n-1] ^ C[n-2]).
+       // INP points to P[n].  OUTP points to where C[n-1] should go.
+       // To support in-place encryption, load P[n] before storing C[n].
+       addi            t0, OUTP, 16    // Get pointer to where C[n] should go
+       vsetvli         zero, LEN, e8, m1, tu, ma
+       vle8.v          v17, (INP)      // Load P[n]
+       vse8.v          v16, (t0)       // Store C[n]
+       vxor.vv         v16, v16, v17   // v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
+       vsetivli        zero, 4, e32, m1, ta, ma
+       aes_encrypt     v16, \keylen
+.Lcts_encrypt_done\@:
+       vse32.v         v16, (OUTP)     // Store C[n-1] (or C[n] in single-block case)
+       ret
+.endm
+
+#define LEN32          t4 // Length of remaining full blocks in 32-bit words
+#define LEN_MOD16      t5 // Length of message in bytes mod 16
+
+.macro aes_cbc_cts_decrypt     keylen
+       andi            LEN32, LEN, ~15
+       srli            LEN32, LEN32, 2
+       andi            LEN_MOD16, LEN, 15
+
+       // Save C[n-2] in v28 so that it's available later during the ciphertext
+       // stealing step.  If there are fewer than three blocks, C[n-2] means
+       // the IV, otherwise it means the third-to-last ciphertext block.
+       vmv.v.v         v28, v16        // IV
+       add             t0, LEN, -33
+       bltz            t0, .Lcts_decrypt_loop\@
+       andi            t0, t0, ~15
+       add             t0, t0, INP
+       vle32.v         v28, (t0)
+
+       // CBC-decrypt all full blocks.  For the last full block, or the last 2
+       // full blocks if the message is block-aligned, this doesn't write the
+       // correct output blocks (unless the message is only a single block),
+       // because it XORs the wrong values with the raw AES plaintexts.  But we
+       // fix this after this loop without redoing the AES decryptions.  This
+       // approach allows more of the AES decryptions to be parallelized.
+.Lcts_decrypt_loop\@:
+       vsetvli         t0, LEN32, e32, m4, ta, ma
+       addi            t1, t0, -4
+       vle32.v         v20, (INP)      // Load next set of ciphertext blocks
+       vmv.v.v         v24, v16        // Get IV or last ciphertext block of prev set
+       vslideup.vi     v24, v20, 4     // Setup prev ciphertext blocks
+       vslidedown.vx   v16, v20, t1    // Save last ciphertext block of this set
+       aes_decrypt     v20, \keylen    // Decrypt this set of blocks
+       vxor.vv         v24, v24, v20   // XOR prev ciphertext blocks with decrypted blocks
+       vse32.v         v24, (OUTP)     // Store this set of plaintext blocks
+       sub             LEN32, LEN32, t0
+       slli            t0, t0, 2       // Words to bytes
+       add             INP, INP, t0
+       add             OUTP, OUTP, t0
+       bnez            LEN32, .Lcts_decrypt_loop\@
+
+       vsetivli        zero, 4, e32, m4, ta, ma
+       vslidedown.vx   v20, v20, t1    // Extract raw plaintext of last full block
+       addi            t0, OUTP, -16   // Get pointer to last full plaintext block
+       bnez            LEN_MOD16, .Lcts_decrypt_non_block_aligned\@
+
+       // Special case: if the message is a single block, just do CBC.
+       li              t1, 16
+       beq             LEN, t1, .Lcts_decrypt_done\@
+
+       // Block-aligned message.  Just fix up the last 2 blocks.  We need:
+       //
+       //      P[n-1] = Decrypt(C[n]) ^ C[n-2]
+       //      P[n] = Decrypt(C[n-1]) ^ C[n]
+       //
+       // We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
+       // Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
+       // is everything needed to fix the output without re-decrypting blocks.
+       addi            t1, OUTP, -32   // Get pointer to where P[n-1] should go
+       vxor.vv         v20, v20, v28   // Decrypt(C[n]) ^ C[n-2] == P[n-1]
+       vle32.v         v24, (t1)       // Decrypt(C[n-1]) ^ C[n-2]
+       vse32.v         v20, (t1)       // Store P[n-1]
+       vxor.vv         v20, v24, v16   // Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
+       j               .Lcts_decrypt_finish\@
+
+.Lcts_decrypt_non_block_aligned\@:
+       // Decrypt the last two blocks using ciphertext stealing as follows:
+       //
+       //      P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
+       //      P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
+       //
+       // We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
+       vmv.v.v         v16, v20        // v16 = Decrypt(C[n-1])
+       vsetvli         zero, LEN_MOD16, e8, m1, tu, ma
+       vle8.v          v20, (INP)      // v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
+       vxor.vv         v16, v16, v20   // v16 = Decrypt(C[n-1]) ^ C[n]
+       vse8.v          v16, (OUTP)     // Store P[n]
+       vsetivli        zero, 4, e32, m1, ta, ma
+       aes_decrypt     v20, \keylen    // v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
+.Lcts_decrypt_finish\@:
+       vxor.vv         v20, v20, v28   // XOR with C[n-2]
+       vse32.v         v20, (t0)       // Store last full plaintext block
+.Lcts_decrypt_done\@:
+       ret
+.endm
+
+.macro aes_cbc_cts_crypt       keylen
+       vle32.v         v16, (IVP)      // Load IV
+       beqz            a5, .Lcts_decrypt\@
+       aes_cbc_cts_encrypt \keylen
+.Lcts_decrypt\@:
+       aes_cbc_cts_decrypt \keylen
+.endm
+
+// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
+//                              const u8 *in, u8 *out, size_t len,
+//                              const u8 iv[16], bool enc);
+//
+// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
+// This is the variant that unconditionally swaps the last two blocks.
+SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
+       aes_begin       KEYP, 128f, 192f
+       aes_cbc_cts_crypt 256
+128:
+       aes_cbc_cts_crypt 128
+192:
+       aes_cbc_cts_crypt 192
+SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)
author	Eric Biggers <ebiggers@google.com>
	Tue, 13 Feb 2024 05:54:42 +0000 (21:54 -0800)
committer	Palmer Dabbelt <palmer@rivosinc.com>
	Wed, 20 Mar 2024 15:56:11 +0000 (08:56 -0700)
arch/riscv/crypto/Kconfig		patch \| blob \| blame \| history
arch/riscv/crypto/aes-riscv64-glue.c		patch \| blob \| blame \| history
arch/riscv/crypto/aes-riscv64-zvkned.S		patch \| blob \| blame \| history