]>
Commit | Line | Data |
---|---|---|
2874c5fd | 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
c9320b6d | 2 | /* |
8b65f34c EB |
3 | * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, |
4 | * including ChaCha20 (RFC7539) | |
c9320b6d MW |
5 | * |
6 | * Copyright (C) 2015 Martin Willi | |
c9320b6d MW |
7 | */ |
8 | ||
9 | #include <crypto/algapi.h> | |
5fb8ef25 | 10 | #include <crypto/internal/chacha.h> |
f2abe0d7 | 11 | #include <crypto/internal/simd.h> |
9ae433bc | 12 | #include <crypto/internal/skcipher.h> |
c9320b6d MW |
13 | #include <linux/kernel.h> |
14 | #include <linux/module.h> | |
c9320b6d MW |
15 | #include <asm/simd.h> |
16 | ||
8b65f34c | 17 | #define CHACHA_STATE_ALIGN 16 |
c9320b6d | 18 | |
8b65f34c EB |
19 | asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, |
20 | unsigned int len, int nrounds); | |
21 | asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, | |
22 | unsigned int len, int nrounds); | |
23 | asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); | |
84e03fa3 | 24 | |
8b65f34c EB |
25 | asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, |
26 | unsigned int len, int nrounds); | |
27 | asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, | |
28 | unsigned int len, int nrounds); | |
29 | asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, | |
30 | unsigned int len, int nrounds); | |
84e03fa3 | 31 | |
8b65f34c EB |
32 | asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, |
33 | unsigned int len, int nrounds); | |
34 | asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, | |
35 | unsigned int len, int nrounds); | |
36 | asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, | |
37 | unsigned int len, int nrounds); | |
84e03fa3 AB |
38 | |
39 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); | |
40 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); | |
41 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); | |
c9320b6d | 42 | |
8b65f34c | 43 | static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) |
9b17608f | 44 | { |
1ca1b917 EB |
45 | len = min(len, maxblocks * CHACHA_BLOCK_SIZE); |
46 | return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; | |
9b17608f MW |
47 | } |
48 | ||
8b65f34c EB |
49 | static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, |
50 | unsigned int bytes, int nrounds) | |
c9320b6d | 51 | { |
84e03fa3 AB |
52 | if (IS_ENABLED(CONFIG_AS_AVX512) && |
53 | static_branch_likely(&chacha_use_avx512vl)) { | |
cee7a36e | 54 | while (bytes >= CHACHA_BLOCK_SIZE * 8) { |
8b65f34c EB |
55 | chacha_8block_xor_avx512vl(state, dst, src, bytes, |
56 | nrounds); | |
cee7a36e MW |
57 | bytes -= CHACHA_BLOCK_SIZE * 8; |
58 | src += CHACHA_BLOCK_SIZE * 8; | |
59 | dst += CHACHA_BLOCK_SIZE * 8; | |
60 | state[12] += 8; | |
61 | } | |
62 | if (bytes > CHACHA_BLOCK_SIZE * 4) { | |
8b65f34c EB |
63 | chacha_8block_xor_avx512vl(state, dst, src, bytes, |
64 | nrounds); | |
65 | state[12] += chacha_advance(bytes, 8); | |
cee7a36e MW |
66 | return; |
67 | } | |
180def6c | 68 | if (bytes > CHACHA_BLOCK_SIZE * 2) { |
8b65f34c EB |
69 | chacha_4block_xor_avx512vl(state, dst, src, bytes, |
70 | nrounds); | |
71 | state[12] += chacha_advance(bytes, 4); | |
180def6c MW |
72 | return; |
73 | } | |
29a47b54 | 74 | if (bytes) { |
8b65f34c EB |
75 | chacha_2block_xor_avx512vl(state, dst, src, bytes, |
76 | nrounds); | |
77 | state[12] += chacha_advance(bytes, 2); | |
29a47b54 MW |
78 | return; |
79 | } | |
cee7a36e | 80 | } |
84e03fa3 | 81 | |
e6abef61 | 82 | if (static_branch_likely(&chacha_use_avx2)) { |
1ca1b917 | 83 | while (bytes >= CHACHA_BLOCK_SIZE * 8) { |
8b65f34c | 84 | chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); |
1ca1b917 EB |
85 | bytes -= CHACHA_BLOCK_SIZE * 8; |
86 | src += CHACHA_BLOCK_SIZE * 8; | |
87 | dst += CHACHA_BLOCK_SIZE * 8; | |
3d1e93cd MW |
88 | state[12] += 8; |
89 | } | |
1ca1b917 | 90 | if (bytes > CHACHA_BLOCK_SIZE * 4) { |
8b65f34c EB |
91 | chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); |
92 | state[12] += chacha_advance(bytes, 8); | |
9b17608f MW |
93 | return; |
94 | } | |
1ca1b917 | 95 | if (bytes > CHACHA_BLOCK_SIZE * 2) { |
8b65f34c EB |
96 | chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); |
97 | state[12] += chacha_advance(bytes, 4); | |
8a5a79d5 MW |
98 | return; |
99 | } | |
1ca1b917 | 100 | if (bytes > CHACHA_BLOCK_SIZE) { |
8b65f34c EB |
101 | chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); |
102 | state[12] += chacha_advance(bytes, 2); | |
a5dd97f8 MW |
103 | return; |
104 | } | |
3d1e93cd | 105 | } |
84e03fa3 | 106 | |
1ca1b917 | 107 | while (bytes >= CHACHA_BLOCK_SIZE * 4) { |
8b65f34c | 108 | chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); |
1ca1b917 EB |
109 | bytes -= CHACHA_BLOCK_SIZE * 4; |
110 | src += CHACHA_BLOCK_SIZE * 4; | |
111 | dst += CHACHA_BLOCK_SIZE * 4; | |
274f938e MW |
112 | state[12] += 4; |
113 | } | |
1ca1b917 | 114 | if (bytes > CHACHA_BLOCK_SIZE) { |
8b65f34c EB |
115 | chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); |
116 | state[12] += chacha_advance(bytes, 4); | |
9b17608f | 117 | return; |
c9320b6d MW |
118 | } |
119 | if (bytes) { | |
8b65f34c | 120 | chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); |
9b17608f | 121 | state[12]++; |
c9320b6d MW |
122 | } |
123 | } | |
124 | ||
84e03fa3 AB |
125 | void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) |
126 | { | |
127 | state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); | |
128 | ||
129 | if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { | |
130 | hchacha_block_generic(state, stream, nrounds); | |
131 | } else { | |
132 | kernel_fpu_begin(); | |
133 | hchacha_block_ssse3(state, stream, nrounds); | |
134 | kernel_fpu_end(); | |
135 | } | |
136 | } | |
137 | EXPORT_SYMBOL(hchacha_block_arch); | |
138 | ||
139 | void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) | |
140 | { | |
141 | state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); | |
142 | ||
143 | chacha_init_generic(state, key, iv); | |
144 | } | |
145 | EXPORT_SYMBOL(chacha_init_arch); | |
146 | ||
147 | void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, | |
148 | int nrounds) | |
149 | { | |
150 | state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); | |
151 | ||
152 | if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || | |
153 | bytes <= CHACHA_BLOCK_SIZE) | |
154 | return chacha_crypt_generic(state, dst, src, bytes, nrounds); | |
155 | ||
706024a5 JD |
156 | do { |
157 | unsigned int todo = min_t(unsigned int, bytes, SZ_4K); | |
158 | ||
159 | kernel_fpu_begin(); | |
160 | chacha_dosimd(state, dst, src, todo, nrounds); | |
161 | kernel_fpu_end(); | |
162 | ||
163 | bytes -= todo; | |
164 | src += todo; | |
165 | dst += todo; | |
166 | } while (bytes); | |
84e03fa3 AB |
167 | } |
168 | EXPORT_SYMBOL(chacha_crypt_arch); | |
169 | ||
28e8d89b | 170 | static int chacha_simd_stream_xor(struct skcipher_request *req, |
860ab2e5 | 171 | const struct chacha_ctx *ctx, const u8 *iv) |
c9320b6d | 172 | { |
b8fbe71f | 173 | u32 *state, state_buf[16 + 2] __aligned(8); |
28e8d89b AB |
174 | struct skcipher_walk walk; |
175 | int err; | |
176 | ||
177 | err = skcipher_walk_virt(&walk, req, false); | |
c9320b6d | 178 | |
8b65f34c EB |
179 | BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); |
180 | state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); | |
b8fbe71f | 181 | |
28e8d89b | 182 | chacha_init_generic(state, ctx->key, iv); |
c9320b6d | 183 | |
28e8d89b AB |
184 | while (walk.nbytes > 0) { |
185 | unsigned int nbytes = walk.nbytes; | |
9b17608f | 186 | |
28e8d89b AB |
187 | if (nbytes < walk.total) |
188 | nbytes = round_down(nbytes, walk.stride); | |
c9320b6d | 189 | |
84e03fa3 AB |
190 | if (!static_branch_likely(&chacha_use_simd) || |
191 | !crypto_simd_usable()) { | |
28e8d89b AB |
192 | chacha_crypt_generic(state, walk.dst.virt.addr, |
193 | walk.src.virt.addr, nbytes, | |
194 | ctx->nrounds); | |
195 | } else { | |
a033aed5 | 196 | kernel_fpu_begin(); |
28e8d89b AB |
197 | chacha_dosimd(state, walk.dst.virt.addr, |
198 | walk.src.virt.addr, nbytes, | |
199 | ctx->nrounds); | |
200 | kernel_fpu_end(); | |
a033aed5 | 201 | } |
28e8d89b | 202 | err = skcipher_walk_done(&walk, walk.nbytes - nbytes); |
c9320b6d MW |
203 | } |
204 | ||
4af78261 EB |
205 | return err; |
206 | } | |
207 | ||
8b65f34c | 208 | static int chacha_simd(struct skcipher_request *req) |
4af78261 EB |
209 | { |
210 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); | |
211 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); | |
4af78261 | 212 | |
28e8d89b | 213 | return chacha_simd_stream_xor(req, ctx, req->iv); |
4af78261 EB |
214 | } |
215 | ||
8b65f34c | 216 | static int xchacha_simd(struct skcipher_request *req) |
4af78261 EB |
217 | { |
218 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); | |
219 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); | |
4af78261 | 220 | u32 *state, state_buf[16 + 2] __aligned(8); |
28e8d89b | 221 | struct chacha_ctx subctx; |
4af78261 | 222 | u8 real_iv[16]; |
f9c9bdb5 | 223 | |
8b65f34c EB |
224 | BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); |
225 | state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); | |
28e8d89b AB |
226 | chacha_init_generic(state, ctx->key, req->iv); |
227 | ||
228 | if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { | |
229 | kernel_fpu_begin(); | |
230 | hchacha_block_ssse3(state, subctx.key, ctx->nrounds); | |
231 | kernel_fpu_end(); | |
232 | } else { | |
233 | hchacha_block_generic(state, subctx.key, ctx->nrounds); | |
234 | } | |
8b65f34c | 235 | subctx.nrounds = ctx->nrounds; |
4af78261 EB |
236 | |
237 | memcpy(&real_iv[0], req->iv + 24, 8); | |
238 | memcpy(&real_iv[8], req->iv + 16, 8); | |
28e8d89b | 239 | return chacha_simd_stream_xor(req, &subctx, real_iv); |
c9320b6d MW |
240 | } |
241 | ||
4af78261 EB |
242 | static struct skcipher_alg algs[] = { |
243 | { | |
244 | .base.cra_name = "chacha20", | |
245 | .base.cra_driver_name = "chacha20-simd", | |
246 | .base.cra_priority = 300, | |
247 | .base.cra_blocksize = 1, | |
248 | .base.cra_ctxsize = sizeof(struct chacha_ctx), | |
249 | .base.cra_module = THIS_MODULE, | |
250 | ||
251 | .min_keysize = CHACHA_KEY_SIZE, | |
252 | .max_keysize = CHACHA_KEY_SIZE, | |
253 | .ivsize = CHACHA_IV_SIZE, | |
254 | .chunksize = CHACHA_BLOCK_SIZE, | |
28e8d89b | 255 | .setkey = chacha20_setkey, |
8b65f34c EB |
256 | .encrypt = chacha_simd, |
257 | .decrypt = chacha_simd, | |
4af78261 EB |
258 | }, { |
259 | .base.cra_name = "xchacha20", | |
260 | .base.cra_driver_name = "xchacha20-simd", | |
261 | .base.cra_priority = 300, | |
262 | .base.cra_blocksize = 1, | |
263 | .base.cra_ctxsize = sizeof(struct chacha_ctx), | |
264 | .base.cra_module = THIS_MODULE, | |
265 | ||
266 | .min_keysize = CHACHA_KEY_SIZE, | |
267 | .max_keysize = CHACHA_KEY_SIZE, | |
268 | .ivsize = XCHACHA_IV_SIZE, | |
269 | .chunksize = CHACHA_BLOCK_SIZE, | |
28e8d89b | 270 | .setkey = chacha20_setkey, |
8b65f34c EB |
271 | .encrypt = xchacha_simd, |
272 | .decrypt = xchacha_simd, | |
7a507d62 EB |
273 | }, { |
274 | .base.cra_name = "xchacha12", | |
275 | .base.cra_driver_name = "xchacha12-simd", | |
276 | .base.cra_priority = 300, | |
277 | .base.cra_blocksize = 1, | |
278 | .base.cra_ctxsize = sizeof(struct chacha_ctx), | |
279 | .base.cra_module = THIS_MODULE, | |
280 | ||
281 | .min_keysize = CHACHA_KEY_SIZE, | |
282 | .max_keysize = CHACHA_KEY_SIZE, | |
283 | .ivsize = XCHACHA_IV_SIZE, | |
284 | .chunksize = CHACHA_BLOCK_SIZE, | |
28e8d89b | 285 | .setkey = chacha12_setkey, |
7a507d62 EB |
286 | .encrypt = xchacha_simd, |
287 | .decrypt = xchacha_simd, | |
4af78261 | 288 | }, |
c9320b6d MW |
289 | }; |
290 | ||
8b65f34c | 291 | static int __init chacha_simd_mod_init(void) |
c9320b6d | 292 | { |
362f924b | 293 | if (!boot_cpu_has(X86_FEATURE_SSSE3)) |
84e03fa3 AB |
294 | return 0; |
295 | ||
296 | static_branch_enable(&chacha_use_simd); | |
297 | ||
e6abef61 | 298 | if (boot_cpu_has(X86_FEATURE_AVX) && |
84e03fa3 AB |
299 | boot_cpu_has(X86_FEATURE_AVX2) && |
300 | cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { | |
301 | static_branch_enable(&chacha_use_avx2); | |
302 | ||
303 | if (IS_ENABLED(CONFIG_AS_AVX512) && | |
304 | boot_cpu_has(X86_FEATURE_AVX512VL) && | |
305 | boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ | |
306 | static_branch_enable(&chacha_use_avx512vl); | |
307 | } | |
8394bfec JD |
308 | return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? |
309 | crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; | |
c9320b6d MW |
310 | } |
311 | ||
8b65f34c | 312 | static void __exit chacha_simd_mod_fini(void) |
c9320b6d | 313 | { |
8394bfec | 314 | if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) |
b62755ae | 315 | crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); |
c9320b6d MW |
316 | } |
317 | ||
8b65f34c EB |
318 | module_init(chacha_simd_mod_init); |
319 | module_exit(chacha_simd_mod_fini); | |
c9320b6d MW |
320 | |
321 | MODULE_LICENSE("GPL"); | |
322 | MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); | |
8b65f34c | 323 | MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); |
c9320b6d MW |
324 | MODULE_ALIAS_CRYPTO("chacha20"); |
325 | MODULE_ALIAS_CRYPTO("chacha20-simd"); | |
4af78261 EB |
326 | MODULE_ALIAS_CRYPTO("xchacha20"); |
327 | MODULE_ALIAS_CRYPTO("xchacha20-simd"); | |
7a507d62 EB |
328 | MODULE_ALIAS_CRYPTO("xchacha12"); |
329 | MODULE_ALIAS_CRYPTO("xchacha12-simd"); |