From: Danny Tsen Date: Tue, 7 Apr 2026 13:19:05 +0000 (-0400) Subject: ppc64le: Optimized MLKEM NTT, supports p8 (ISA 2.07) and above architectures. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=91f7d4e76eb4d21a277df27a34ef10a7540e81a8;p=thirdparty%2Fopenssl.git ppc64le: Optimized MLKEM NTT, supports p8 (ISA 2.07) and above architectures. Optimized MKEM NTT implementation for ppc64le for ISA 2.07 and above architectures. Supporting files include, asm/mlkem_ntt_ppc64le.S: supports NTT. asm/mlkem_intt_ppc64le.S: supports inverse NTT. asm/mlkem_ppc_macros_asm.S: PPC64LE macros. Modified build.info to support ppc64le assembly implementation. Added new definitions of MLKEM_NTT_ASM for NTT and inverse NTT for optimized assembly implementation. This is the initial archtecture specific implementation so can be mdified to adapt to a new build structures. Baseline speed test: keygen encaps decaps keygens/s encaps/s decaps/s ML-KEM-512 0.000037s 0.000030s 0.000046s 26744.7 33529.5 21875.6 ML-KEM-768 0.000059s 0.000043s 0.000066s 16836.6 23118.8 15198.3 ML-KEM-1024 0.000088s 0.000060s 0.000089s 11406.2 16749.7 11265.8 Optimized: keygen encaps decaps keygens/s encaps/s decaps/s ML-KEM-512 0.000023s 0.000015s 0.000022s 42789.9 65006.8 46064.6 ML-KEM-768 0.000038s 0.000023s 0.000032s 25983.3 43731.1 31254.7 ML-KEM-1024 0.000060s 0.000033s 0.000045s 16662.7 30708.2 22034.6 The optimized code runs around 1.9 times faster than the original C implementation. Tested-by: Eugene Syromiatnikov Signed-off-by: Danny Tsen Reviewed-by: Neil Horman Reviewed-by: Eugene Syromiatnikov MergeDate: Tue Apr 14 08:23:42 2026 (Merged from https://github.com/openssl/openssl/pull/30709) --- diff --git a/crypto/ml_kem/asm/mlkem_intt_ppc64le.S b/crypto/ml_kem/asm/mlkem_intt_ppc64le.S new file mode 100644 index 00000000000..ff14c456267 --- /dev/null +++ b/crypto/ml_kem/asm/mlkem_intt_ppc64le.S @@ -0,0 +1,676 @@ +/* + * Copyright 2024-2025 The OpenSSL Project Authors. 
All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright IBM Corp. 2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "mlkem_ppc_macros_asm.inc" + +.machine "any" +.text + +#define ZETA_INTT_OFFSET 0 + +#define PEER 17 +#define CURR 18 + +#define V_Z1 28 +#define V_Z2 29 +#define V_Z3 30 +#define V_Z4 31 +#define V_ZETA 31 + +.macro SAVE_REGS + stdu 1, -336(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + + mtlr 0 + addi 1, 1, 336 +.endm + +/* ===================================================================== */ +/* Delayed writes resulting curr and peer coefficients */ +.macro delayed_writes_curr_peer _c1, _p1, _c2, _p2, _c3, _p3, _c4, _p4 + stxvd2x 32+\_c1, 0, 5 + stxvd2x 32+\_p1, 10, 5 + 
stxvd2x 32+\_c2, 11, 5 + stxvd2x 32+\_p2, 12, 5 + stxvd2x 32+\_c3, 15, 5 + stxvd2x 32+\_p3, 16, 5 + stxvd2x 32+\_c4, 17, 5 + stxvd2x 32+\_p4, 18, 5 +.endm + +/* + * _intt_layer_reduce_4x- common code for layer 6 and 7 + * + * Assuming input registers and output vectors as follows, + * input offsets - curr (0, r11, r15, r17), peer (r10, r12, r16, r18) + * output vectors - v20, v21, v22, v23, v24, v25, v26 + */ +.macro _intt_layer_reduce_4x + /* even - curr, odd - peer, odd is the updated peer */ + /* Compute (even - odd + kPrime) */ + vsubuhm 0, 10, 14 + vadduhm 20, 14, 10 + vadduhm 21, 0, V_kPrime16 + vsubuhm 0, 11, 15 + vadduhm 22, 15, 11 + vadduhm 23, 0, V_kPrime16 + vsubuhm 0, 12, 16 + vadduhm 24, 16, 12 + vadduhm 25, 0, V_kPrime16 + vsubuhm 0, 13, 17 + vadduhm 26, 17, 13 + vadduhm 27, 0, V_kPrime16 + zeta_scalar_mul 21, V_Z1, 8, 9 /* peer * zeta */ + zeta_scalar_mul 23, V_Z2, 18, 19 /* peer * zeta */ + Barrett_reduce_delayed_2x 8, 9, 18, 19, 21, 23 + zeta_scalar_mul 25, V_Z3, 8, 9 /* peer * zeta */ + zeta_scalar_mul 27, V_Z4, 18, 19 /* peer * zeta */ + Barrett_reduce_delayed_2x 8, 9, 18, 19, 25, 27 + reduce_once_4x 21, 23, 25, 27 +.endm + +/* Layer 1 layout - + * each load contains 2 curr and 2 peer elements. + * -> curr1 curr1 peer1 peer1 curr2 curr2 peer2 peer2 + * vmrgew and vmrgow -> 8 curr elements and 8 peer elements + */ +/* + * INTT layer 1, Offset between legs = 2. 
+ */ +.macro _load_transpose_layer1 + lxvd2x 32+0, 0, 5 + lxvd2x 32+6, 10, 5 + lxvd2x 32+7, 11, 5 + lxvd2x 32+8, 12, 5 + lxvd2x 32+9, 15, 5 + lxvd2x 32+10, 16, 5 + lxvd2x 32+11, 17, 5 + lxvd2x 32+12, 18, 5 + vmrgew 14, 0, 6 + vmrgew 15, 7, 8 + vmrgew 16, 9, 10 + vmrgew 17, 11, 12 + vmrgow 13, 11, 12 + vmrgow 12, 9, 10 + vmrgow 11, 7, 8 + vmrgow 10, 0, 6 +.endm + +.macro _intt_layer1_reduce_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + lvx V_Z3, 0, 14 + addi 14, 14, 16 + lvx V_Z4, 0, 14 + addi 14, 14, 16 + _load_transpose_layer1 + _intt_layer_reduce_4x +.endm + +.macro INTT_Layer1_4x + _intt_layer1_reduce_4x + reduce_once_4x 20, 22, 24, 26 + vmrgow 6, 21, 20 + vmrgew 7, 21, 20 + vmrgow 8, 23, 22 + vmrgew 9, 23, 22 + vmrgow 10, 25, 24 + vmrgew 11, 25, 24 + vmrgow 12, 27, 26 + vmrgew 13, 27, 26 + delayed_writes_curr_peer 7, 6, 9, 8, 11, 10, 13, 12 + addi 5, 5, 128 +.endm + +/* Layer 2 layout - + * each load contains 4 curr and 4 peer elements + * xxpermidi -> 8 curr elements and 8 peer elements + */ +/* + * INTT layer 2, Offset between legs = 4. 
+ */ +.macro _load_transpose_layer2 + lxvd2x 4, 0, 5 + lxvd2x 5, 10, 5 + lxvd2x 6, 11, 5 + lxvd2x 7, 12, 5 + lxvd2x 8, 15, 5 + lxvd2x 9, 16, 5 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+14, 5, 4, 3 + xxpermdi 32+10, 5, 4, 0 + xxpermdi 32+15, 7, 6, 3 + xxpermdi 32+11, 7, 6, 0 + xxpermdi 32+16, 9, 8, 3 + xxpermdi 32+12, 9, 8, 0 + xxpermdi 32+17, 11, 10, 3 + xxpermdi 32+13, 11, 10, 0 +.endm + +.macro _intt_layer2_reduce_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + lvx V_Z3, 0, 14 + addi 14, 14, 16 + lvx V_Z4, 0, 14 + addi 14, 14, 16 + _load_transpose_layer2 + _intt_layer_reduce_4x +.endm + +.macro INTT_Layer2_4x + _intt_layer2_reduce_4x + reduce_once_4x 20, 22, 24, 26 + xxpermdi 32+6, 32+20, 32+21, 3 + xxpermdi 32+7, 32+20, 32+21, 0 + xxpermdi 32+8, 32+22, 32+23, 3 + xxpermdi 32+9, 32+22, 32+23, 0 + xxpermdi 32+10, 32+24, 32+25, 3 + xxpermdi 32+11, 32+24, 32+25, 0 + xxpermdi 32+12, 32+26, 32+27, 3 + xxpermdi 32+13, 32+26, 32+27, 0 + delayed_writes_curr_peer 6, 7, 8, 9, 10, 11, 12, 13 + addi 5, 5, 128 +.endm + +/* ===================================================================== */ +/* + * INTT computation for layer 3, 4, 5, 6 and 7. 
+ */ + +/* Load 8 vectors with 4x pipeline + * + * Assuming input registers and output vectors as follows, + * input offsets - curr (0, r11, r15, r17), peer (r10, r12, r16, r18) + * output vectors - v20, v21, v22, v23, v24, v25, v26 + */ +.macro _intt_scalar_reduce_4x _zeta1, _zeta2, _zeta3, _zeta4 + /* Load peer */ + lxvd2x 32+14, 10, 5 + lxvd2x 32+15, 12, 5 + lxvd2x 32+16, 16, 5 + lxvd2x 32+17, 18, 5 + + /* Load curr */ + lxvd2x 32+10, 0, 5 + lxvd2x 32+11, 11, 5 + lxvd2x 32+12, 15, 5 + lxvd2x 32+13, 17, 5 + + /* even - curr, odd - peer, odd is the updated peer */ + /* Compute (even - odd + kPrime) */ + vsubuhm 0, 10, 14 + vadduhm 20, 14, 10 + vadduhm 21, 0, V_kPrime16 + vsubuhm 0, 11, 15 + vadduhm 22, 15, 11 + vadduhm 23, 0, V_kPrime16 + vsubuhm 0, 12, 16 + vadduhm 24, 16, 12 + vadduhm 25, 0, V_kPrime16 + vsubuhm 0, 13, 17 + vadduhm 26, 17, 13 + vadduhm 27, 0, V_kPrime16 + zeta_scalar_mul 21, \_zeta1, 8, 9 /* peer * zeta */ + zeta_scalar_mul 23, \_zeta2, 18, 19 /* peer * zeta */ + Barrett_reduce_delayed_2x 8, 9, 18, 19, 21, 23 + zeta_scalar_mul 25, \_zeta3, 8, 9 /* peer * zeta */ + zeta_scalar_mul 27, \_zeta4, 18, 19 /* peer * zeta */ + Barrett_reduce_delayed_2x 8, 9, 18, 19, 25, 27 + reduce_once_4x 21, 23, 25, 27 +.endm + +/* + * INTT layer 3, Offset between legs = 8. + */ +.macro INTT_Layer3_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + lvx V_Z3, 0, 14 + addi 14, 14, 16 + lvx V_Z4, 0, 14 + addi 14, 14, 16 + _intt_scalar_reduce_4x V_Z1, V_Z2, V_Z3, V_Z4 + reduce_once_4x 20, 22, 24, 26 + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 + addi 5, 5, 128 +.endm + +/* + * INTT layer 4, Offset between legs = 16. 
+ */ +.macro INTT_Layer4_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + _intt_scalar_reduce_4x V_Z1, V_Z1, V_Z2, V_Z2 + reduce_once_4x 20, 22, 24, 26 + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 + addi 5, 5, 128 +.endm + +.macro intt_reduce_delayed_write + _intt_scalar_reduce_4x V_ZETA, V_ZETA, V_ZETA, V_ZETA + reduce_once_4x 20, 22, 24, 26 + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 +.endm + +/* + * INTT layer 5, Offset between legs = 32. + */ +.macro INTT_Layer5_4x + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + intt_reduce_delayed_write + addi 5, 5, 128 +.endm + +/* + * INTT layer 6, Offset between legs = 64. + */ +.macro INTT_Layer6_4x + intt_reduce_delayed_write + addi 5, 5, 64 +.endm + +/* + * INTT layer 7, Offset between legs = 128. + */ +.macro INTT_Layer7_4x + intt_reduce_delayed_write + addi 5, 5, 64 +.endm + +/* ===================================================================== */ +/* Multiply a scalar by a constant (kInverseDegree=3303). 
*/ +.macro _mul_const_reduce_2x _x1, _x2 + vmulouh 6, \_x1, V_kInverseDegree + vmuleuh 7, \_x1, V_kInverseDegree + xxmrglw 32+28, 32+7, 32+6 + xxmrghw 32+29, 32+7, 32+6 + vmulouh 6, \_x2, V_kInverseDegree + vmuleuh 7, \_x2, V_kInverseDegree + xxmrglw 32+30, 32+7, 32+6 + xxmrghw 32+31, 32+7, 32+6 + Barrett_reduce_delayed_2x 28, 29, 30, 31, \_x1, \_x2 +.endm + +.macro scalar_mul_const_8x + lxvd2x 32+20, 0, 5 + lxvd2x 32+21, 10, 5 + lxvd2x 32+22, 11, 5 + lxvd2x 32+23, 12, 5 + lxvd2x 32+24, 15, 5 + lxvd2x 32+25, 16, 5 + lxvd2x 32+26, 17, 5 + lxvd2x 32+27, 18, 5 + _mul_const_reduce_2x 20, 21 + _mul_const_reduce_2x 22, 23 + _mul_const_reduce_2x 24, 25 + _mul_const_reduce_2x 26, 27 + reduce_once_4x 20, 21, 22, 23 + reduce_once_4x 24, 25, 26, 27 + /* overload delayed_writes_curr_peer */ + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 + addi 5, 5, 128 +.endm + +.macro Load_consts + addis 10,2,ntt_consts@toc@ha + addi 10,10,ntt_consts@toc@l + lvx V_kPrime16, 0, 10 + li 7, 16 + lvx V_kPrime32, 7, 10 + li 7, 32 + lvx V_kBarrettMultiplier, 7, 10 + li 7, 48 + lvx V_kBarrettShift, 7, 10 + li 7, 64 + lvx V_kInverseDegree, 7, 10 +.endm + +/* ===================================================================== */ +/* + * mlkem_inverse_ntt_ppc(int16_t *r) + * Compute inverse NTT based on the following 7 layers - + * len = 2, 4, 8, 16, 32, 64, 128 + * + * Each layer compute the coefficients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * The resulting coefficients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coefficients have the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coefficients have the same zeta. + * e.g. 
+ * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. + */ +.global mlkem_inverse_ntt_ppc +.align 4 +mlkem_inverse_ntt_ppc: + + SAVE_REGS + + Load_consts + addis 8,2,mlkem_intt_zetas@toc@ha + addi 8,8,mlkem_intt_zetas@toc@l + addi 14, 8, ZETA_INTT_OFFSET + + li 10, 16 // offset to next peer + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + +.align 4 + /* + * Layer 1. len = 2 + * Load zeta vectors in 2-2-2-2 layout + */ + mr 5, 3 + + INTT_Layer1_4x + INTT_Layer1_4x + INTT_Layer1_4x + INTT_Layer1_4x + +.align 4 + /* + * Layer 22. len = 4 + * Load zeta vectors in 4-4 layout + */ + mr 5, 3 + + INTT_Layer2_4x + INTT_Layer2_4x + INTT_Layer2_4x + INTT_Layer2_4x + +.align 4 + /* + * Layer 3. len = 8 + */ + mr 5, 3 + + INTT_Layer3_4x + INTT_Layer3_4x + INTT_Layer3_4x + INTT_Layer3_4x + +.align 4 + /* + * Layer 4. len = 16 + */ + mr 5, 3 + li 10, 32 // offset to next peer + li 11, 16 + li 12, 32+16 + li 15, 64 + li 16, 64+32 + li 17, 64+16 + li 18, 96+16 + + INTT_Layer4_4x + INTT_Layer4_4x + INTT_Layer4_4x + INTT_Layer4_4x + +.align 4 + /* + * Layer 5. len = 32 + */ + mr 5, 3 + li 10, 64 // offset to next peer + li 11, 16 + li 12, 64+16 + li 15, 32 + li 16, 64+32 + li 17, 48 + li 18, 64+48 + + INTT_Layer5_4x + INTT_Layer5_4x + INTT_Layer5_4x + INTT_Layer5_4x + +.align 4 + /* + * Layer 6. len = 64 + */ + mr 5, 3 + li 10, 128 // offset to next peer + li 11, 16 + li 12, 128+16 + li 15, 32 + li 16, 128+32 + li 17, 48 + li 18, 128+48 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + INTT_Layer6_4x + INTT_Layer6_4x + addi 5, 5, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + INTT_Layer6_4x + INTT_Layer6_4x + addi 5, 5, 128 + +.align 4 + /* + * Layer 7. 
len = 128 + */ + mr 5, 3 + li 10, 256 // offset to next peer + li 11, 16 + li 12, 256+16 + li 15, 32 + li 16, 256+32 + li 17, 48 + li 18, 256+48 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + INTT_Layer7_4x + INTT_Layer7_4x + INTT_Layer7_4x + INTT_Layer7_4x + + /* Multiply a scalar by a constant. */ + mr 5, 3 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + + scalar_mul_const_8x + scalar_mul_const_8x + scalar_mul_const_8x + scalar_mul_const_8x + + RESTORE_REGS + blr +.size mlkem_inverse_ntt_ppc,.-mlkem_inverse_ntt_ppc + +.rodata +.align 4 +ntt_consts: +.short kPrime, kPrime, kPrime, kPrime, kPrime, kPrime, kPrime, kPrime +.long kPrime, kPrime, kPrime, kPrime +.long kBarrettMultiplier, kBarrettMultiplier, kBarrettMultiplier, kBarrettMultiplier +.long kBarrettShift, 0, kBarrettShift, 0 +.short kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree + +mlkem_intt_zetas: +/* + * For intt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (4, 2, 3, 1) + * Transpose z[0], z[1], z[2], z[3] + * -> z[4], z[4], z[2], z[2], z[3], z[3], z[1], z[1] + */ +.short 1219, 1219, 2444, 2444, 394, 394, 1175, 1175 +.short 1607, 1607, 1455, 1455, 2117, 2117, 2300, 2300 +.short 2186, 2186, 554, 554, 1179, 1179, 2443, 2443 +.short 525, 525, 2926, 2926, 2237, 2237, 2303, 2303 +.short 1230, 1230, 863, 863, 2768, 2768, 735, 735 +.short 2266, 2266, 556, 556, 3010, 3010, 2572, 2572 +.short 2954, 2954, 1239, 1239, 780, 780, 1684, 1684 +.short 1745, 1745, 1292, 1292, 1031, 1031, 109, 109 +.short 2596, 2596, 3061, 3061, 992, 992, 2688, 2688 +.short 2390, 2390, 892, 892, 1021, 1021, 941, 941 +.short 1482, 1482, 1868, 1868, 2377, 2377, 642, 642 +.short 1626, 1626, 540, 540, 1678, 1678, 1540, 1540 +.short 2573, 2573, 314, 314, 1173, 1173, 279, 279 +.short 1920, 1920, 48, 48, 667, 667, 3096, 3096 +.short 1692, 1692, 1041, 1041, 2606, 2606, 2229, 2229 +.short 3312, 3312, 2746, 2746, 568, 568, 680, 680 +/* 
For intt Len=4 */ +.short 2419, 2419, 2419, 2419, 2102, 2102, 2102, 2102 +.short 219, 219, 219, 219, 855, 855, 855, 855 +.short 2681, 2681, 2681, 2681, 1848, 1848, 1848, 1848 +.short 712, 712, 712, 712, 682, 682, 682, 682 +.short 927, 927, 927, 927, 1795, 1795, 1795, 1795 +.short 461, 461, 461, 461, 1891, 1891, 1891, 1891 +.short 2877, 2877, 2877, 2877, 2522, 2522, 2522, 2522 +.short 1894, 1894, 1894, 1894, 1010, 1010, 1010, 1010 +.short 1414, 1414, 1414, 1414, 2009, 2009, 2009, 2009 +.short 3296, 3296, 3296, 3296, 464, 464, 464, 464 +.short 2697, 2697, 2697, 2697, 816, 816, 816, 816 +.short 1352, 1352, 1352, 1352, 2679, 2679, 2679, 2679 +.short 1274, 1274, 1274, 1274, 1052, 1052, 1052, 1052 +.short 1025, 1025, 1025, 1025, 2132, 2132, 2132, 2132 +.short 1573, 1573, 1573, 1573, 76, 76, 76, 76 +.short 2998, 2998, 2998, 2998, 3040, 3040, 3040, 3040 +/* For intt Len=8 and others */ +.short 2508, 2508, 2508, 2508, 2508, 2508, 2508, 2508 +.short 1355, 1355, 1355, 1355, 1355, 1355, 1355, 1355 +.short 450, 450, 450, 450, 450, 450, 450, 450 +.short 936, 936, 936, 936, 936, 936, 936, 936 +.short 447, 447, 447, 447, 447, 447, 447, 447 +.short 2794, 2794, 2794, 2794, 2794, 2794, 2794, 2794 +.short 1235, 1235, 1235, 1235, 1235, 1235, 1235, 1235 +.short 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1903 +.short 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996 +.short 1089, 1089, 1089, 1089, 1089, 1089, 1089, 1089 +.short 3273, 3273, 3273, 3273, 3273, 3273, 3273, 3273 +.short 283, 283, 283, 283, 283, 283, 283, 283 +.short 1853, 1853, 1853, 1853, 1853, 1853, 1853, 1853 +.short 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990 +.short 882, 882, 882, 882, 882, 882, 882, 882 +.short 3033, 3033, 3033, 3033, 3033, 3033, 3033, 3033 +.short 1583, 1583, 1583, 1583, 1583, 1583, 1583, 1583 +.short 2760, 2760, 2760, 2760, 2760, 2760, 2760, 2760 +.short 69, 69, 69, 69, 69, 69, 69, 69 +.short 543, 543, 543, 543, 543, 543, 543, 543 +.short 2532, 2532, 2532, 2532, 2532, 2532, 2532, 2532 +.short 3136, 
3136, 3136, 3136, 3136, 3136, 3136, 3136 +.short 1410, 1410, 1410, 1410, 1410, 1410, 1410, 1410 +.short 2267, 2267, 2267, 2267, 2267, 2267, 2267, 2267 +.short 2481, 2481, 2481, 2481, 2481, 2481, 2481, 2481 +.short 1432, 1432, 1432, 1432, 1432, 1432, 1432, 1432 +.short 2699, 2699, 2699, 2699, 2699, 2699, 2699, 2699 +.short 687, 687, 687, 687, 687, 687, 687, 687 +.short 40, 40, 40, 40, 40, 40, 40, 40 +.short 749, 749, 749, 749, 749, 749, 749, 749 +.short 1600, 1600, 1600, 1600, 1600, 1600, 1600, 1600 diff --git a/crypto/ml_kem/asm/mlkem_ntt_ppc64le.S b/crypto/ml_kem/asm/mlkem_ntt_ppc64le.S new file mode 100644 index 00000000000..1aefed531e7 --- /dev/null +++ b/crypto/ml_kem/asm/mlkem_ntt_ppc64le.S @@ -0,0 +1,647 @@ +/* + * Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright IBM Corp. 
2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "mlkem_ppc_macros_asm.inc" + +.machine "any" +.text + +#define ZETA_NTT_OFFSET 0 + +#define V_Z1 28 +#define V_Z2 29 +#define V_Z3 30 +#define V_Z4 31 +#define V_ZETA 31 + +.macro SAVE_REGS + stdu 1, -336(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + + mtlr 0 + addi 1, 1, 336 +.endm + +/* ===================================================================== */ +/* + * NTT computation for layer 1, 2, 3, 4 and 5. 
+ */ +/* Load 8 vectors with 4x pipeline + * + * Assuming input registers and output vectors as follows, + * input offsets - curr (0, r11, r15, r17), peer (r10, r12, r16, r18) + * output vectors - v20, v21, v22, v23, v24, v25, v26 + */ +.macro _ntt_scalar_reduce_4x _zeta1, _zeta2, _zeta3, _zeta4 + /* Load peer */ + lxvd2x 32+14, 10, 5 + lxvd2x 32+15, 12, 5 + lxvd2x 32+16, 16, 5 + lxvd2x 32+17, 18, 5 + + zeta_scalar_mul 14, \_zeta1, 20, 21 /* peer * zeta */ + zeta_scalar_mul 15, \_zeta2, 22, 23 /* peer * zeta */ + zeta_scalar_mul 16, \_zeta3, 24, 25 /* peer * zeta */ + zeta_scalar_mul 17, \_zeta4, 26, 27 /* peer * zeta */ + Barrett_reduce_delayed 20, 21, 14 + Barrett_reduce_delayed 22, 23, 15 + Barrett_reduce_delayed 24, 25, 16 + Barrett_reduce_delayed 26, 27, 17 + reduce_once_4x 14, 15, 16, 17 + + /* Load curr */ + lxvd2x 32+10, 0, 5 + lxvd2x 32+11, 11, 5 + lxvd2x 32+12, 15, 5 + lxvd2x 32+13, 17, 5 + + /* even - curr, odd - peer, odd is the updated peer */ + /* Compute (even - odd + kPrime) */ + vsubuhm 0, 10, 14 + vadduhm 21, 0, V_kPrime16 + vadduhm 20, 14, 10 + vsubuhm 0, 11, 15 + vadduhm 23, 0, V_kPrime16 + vadduhm 22, 15, 11 + vsubuhm 0, 12, 16 + vadduhm 25, 0, V_kPrime16 + vadduhm 24, 16, 12 + vsubuhm 0, 13, 17 + vadduhm 27, 0, V_kPrime16 + vadduhm 26, 17, 13 +.endm + +/* Delayed writes resulting curr and peer coefficients */ +.macro delayed_writes_curr_peer _c1, _p1, _c2, _p2, _c3, _p3, _c4, _p4 + stxvd2x 32+\_c1, 0, 5 + stxvd2x 32+\_p1, 10, 5 + stxvd2x 32+\_c2, 11, 5 + stxvd2x 32+\_p2, 12, 5 + stxvd2x 32+\_c3, 15, 5 + stxvd2x 32+\_p3, 16, 5 + stxvd2x 32+\_c4, 17, 5 + stxvd2x 32+\_p4, 18, 5 +.endm + +.macro ntt_reduce_delayed_write + _ntt_scalar_reduce_4x V_ZETA, V_ZETA, V_ZETA, V_ZETA + reduce_once_4x 20, 22, 24, 26 + reduce_once_4x 21, 23, 25, 27 + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 +.endm + +/* + * NTT layer 1, Offset between legs = 128. 
+ */ +.macro NTT_Layer1_4x + ntt_reduce_delayed_write + addi 5, 5, 64 +.endm + +/* + * NTT layer 2, Offset between legs = 64. + */ +.macro NTT_Layer2_4x + ntt_reduce_delayed_write + addi 5, 5, 64 +.endm + +/* + * NTT layer 3, Offset between legs = 32. + */ +.macro NTT_Layer3_4x + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + ntt_reduce_delayed_write + addi 5, 5, 128 +.endm + +/* + * NTT layer 4, Offset between legs = 16. + */ +.macro NTT_Layer4_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + _ntt_scalar_reduce_4x V_Z1, V_Z1, V_Z2, V_Z2 + reduce_once_4x 20, 22, 24, 26 + reduce_once_4x 21, 23, 25, 27 + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 + addi 5, 5, 128 +.endm + +/* + * NTT layer 5, Offset between legs = 8. + */ +.macro NTT_Layer5_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + lvx V_Z3, 0, 14 + addi 14, 14, 16 + lvx V_Z4, 0, 14 + addi 14, 14, 16 + _ntt_scalar_reduce_4x V_Z1, V_Z2, V_Z3, V_Z4 + reduce_once_4x 20, 22, 24, 26 + reduce_once_4x 21, 23, 25, 27 + delayed_writes_curr_peer 20, 21, 22, 23, 24, 25, 26, 27 + addi 5, 5, 128 +.endm + +/* ===================================================================== */ +/* Layer 6 layout - + * each load contains 4 curr and 4 peer elements + * xxpermidi -> 8 curr elements and 8 peer elements + */ +.macro _load_transpose_layer6 + lxvd2x 4, 0, 5 + lxvd2x 5, 10, 5 + lxvd2x 6, 11, 5 + lxvd2x 7, 12, 5 + lxvd2x 8, 15, 5 + lxvd2x 9, 16, 5 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+14, 5, 4, 3 + xxpermdi 0, 5, 4, 0 + xxpermdi 32+15, 7, 6, 3 + xxpermdi 1, 7, 6, 0 + xxpermdi 32+16, 9, 8, 3 + xxpermdi 2, 9, 8, 0 + xxpermdi 32+17, 11, 10, 3 + xxpermdi 3, 11, 10, 0 +.endm + +/* + * _layer_reduce_4x- common code for layer 6 and 7 + * + * Assuming input registers and output vectors as follows, + * input offsets - curr (0, r11, r15, r17), peer (r10, r12, r16, r18) + * output vectors - v20, v21, v22, v23, v24, v25, v26 + */ +.macro _layer_reduce_4x + 
zeta_scalar_mul 14, V_Z1, 20, 21 /* peer * zeta */ + zeta_scalar_mul 15, V_Z2, 22, 23 /* peer * zeta */ + zeta_scalar_mul 16, V_Z3, 24, 25 /* peer * zeta */ + zeta_scalar_mul 17, V_Z4, 26, 27 /* peer * zeta */ + Barrett_reduce_delayed 20, 21, 14 + Barrett_reduce_delayed 22, 23, 15 + Barrett_reduce_delayed 24, 25, 16 + Barrett_reduce_delayed 26, 27, 17 + reduce_once_4x 14, 15, 16, 17 + + /* Load curr */ + xxlor 32+10, 0, 0 + xxlor 32+11, 1, 1 + xxlor 32+12, 2, 2 + xxlor 32+13, 3, 3 + + /* even - curr, odd - peer, odd is the updated peer */ + /* Compute (even - odd + kPrime) */ + vsubuhm 0, 10, 14 + vadduhm 21, 0, V_kPrime16 + vadduhm 20, 14, 10 + vsubuhm 0, 11, 15 + vadduhm 23, 0, V_kPrime16 + vadduhm 22, 15, 11 + vsubuhm 0, 12, 16 + vadduhm 25, 0, V_kPrime16 + vadduhm 24, 16, 12 + vsubuhm 0, 13, 17 + vadduhm 27, 0, V_kPrime16 + vadduhm 26, 17, 13 +.endm + +.macro _ntt_layer6_reduce_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + lvx V_Z3, 0, 14 + addi 14, 14, 16 + lvx V_Z4, 0, 14 + addi 14, 14, 16 + _load_transpose_layer6 + _layer_reduce_4x +.endm + +/* + * NTT layer 6, Offset between legs = 4. + */ +.macro NTT_Layer6_4x + _ntt_layer6_reduce_4x + reduce_once_4x 20, 22, 24, 26 + reduce_once_4x 21, 23, 25, 27 + + xxpermdi 32+6, 32+20, 32+21, 3 + xxpermdi 32+7, 32+20, 32+21, 0 + xxpermdi 32+8, 32+22, 32+23, 3 + xxpermdi 32+9, 32+22, 32+23, 0 + xxpermdi 32+10, 32+24, 32+25, 3 + xxpermdi 32+11, 32+24, 32+25, 0 + xxpermdi 32+12, 32+26, 32+27, 3 + xxpermdi 32+13, 32+26, 32+27, 0 + delayed_writes_curr_peer 6, 7, 8, 9, 10, 11, 12, 13 + addi 5, 5, 128 +.endm + +/* Layer 7 layout - + * each load contains 2 curr and 2 peer elements. 
+ * -> curr1 curr1 peer1 peer1 curr2 curr2 peer2 peer2 + * vmrgew and vmrgow -> 8 curr elements and 8 peer elements + */ +.macro _load_transpose_layer7 + lxvd2x 32+0, 0, 5 + lxvd2x 32+6, 10, 5 + lxvd2x 32+7, 11, 5 + lxvd2x 32+8, 12, 5 + lxvd2x 32+9, 15, 5 + lxvd2x 32+10, 16, 5 + lxvd2x 32+11, 17, 5 + lxvd2x 32+12, 18, 5 + vmrgew 14, 0, 6 + vmrgow 0, 0, 6 + vmrgew 15, 7, 8 + vmrgow 6, 7, 8 + vmrgew 16, 9, 10 + vmrgow 8, 9, 10 + vmrgew 17, 11, 12 + vmrgow 13, 11, 12 + xxlor 0, 32+0, 32+0 + xxlor 1, 32+6, 32+6 + xxlor 2, 32+8, 32+8 + xxlor 3, 32+13, 32+13 +.endm + +.macro _ntt_layer7_reduce_4x + lvx V_Z1, 0, 14 + addi 14, 14, 16 + lvx V_Z2, 0, 14 + addi 14, 14, 16 + lvx V_Z3, 0, 14 + addi 14, 14, 16 + lvx V_Z4, 0, 14 + addi 14, 14, 16 + _load_transpose_layer7 + _layer_reduce_4x +.endm + +/* + * NTT layer 7, Offset between legs = 2. + */ +.macro NTT_Layer7_4x + _ntt_layer7_reduce_4x + reduce_once_4x 20, 22, 24, 26 + reduce_once_4x 21, 23, 25, 27 + + vmrgow 6, 21, 20 + vmrgew 7, 21, 20 + vmrgow 8, 23, 22 + vmrgew 9, 23, 22 + vmrgow 10, 25, 24 + vmrgew 11, 25, 24 + vmrgow 12, 27, 26 + vmrgew 13, 27, 26 + delayed_writes_curr_peer 7, 6, 9, 8, 11, 10, 13, 12 + addi 5, 5, 128 +.endm + +.macro Load_consts + addis 10,2,ntt_consts@toc@ha + addi 10,10,ntt_consts@toc@l + lvx V_kPrime16, 0, 10 + li 7, 16 + lvx V_kPrime32, 7, 10 + li 7, 32 + lvx V_kBarrettMultiplier, 7, 10 + li 7, 48 + lvx V_kBarrettShift, 7, 10 + li 7, 64 + lvx V_kInverseDegree, 7, 10 +.endm + +/* ===================================================================== */ +/* + * mlkem_ntt_ppc(int16_t *r) + * Compute forward NTT based on the following 7 layers - + * len = 128, 64, 32, 16, 8, 4, 2. + * + * Each layer compute the coefficients on 2 legs, start and start + len*2 offsets. 
+ * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * The resulting coefficients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coefficients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coefficients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. + * + */ +.global mlkem_ntt_ppc +.align 4 +mlkem_ntt_ppc: + SAVE_REGS + + Load_consts + addis 8,2,mlkem_ntt_zetas@toc@ha + addi 8,8,mlkem_ntt_zetas@toc@l + addi 14, 8, ZETA_NTT_OFFSET + +.align 4 + /* + * Layer 1. len = 128 + */ + mr 5, 3 + li 10, 256 // offset to next peer + li 11, 16 + li 12, 256+16 + li 15, 32 + li 16, 256+32 + li 17, 48 + li 18, 256+48 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_Layer1_4x + NTT_Layer1_4x + NTT_Layer1_4x + NTT_Layer1_4x + +.align 4 + /* + * Layer 2. len = 64 + */ + mr 5, 3 + li 10, 128 // offset to next peer + li 11, 16 + li 12, 128+16 + li 15, 32 + li 16, 128+32 + li 17, 48 + li 18, 128+48 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_Layer2_4x + NTT_Layer2_4x + addi 5, 5, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_Layer2_4x + NTT_Layer2_4x + addi 5, 5, 128 + +.align 4 + /* + * Layer 3. len = 32 + */ + mr 5, 3 + li 10, 64 // offset to next peer + li 11, 16 + li 12, 64+16 + li 15, 32 + li 16, 64+32 + li 17, 48 + li 18, 64+48 + + NTT_Layer3_4x + NTT_Layer3_4x + NTT_Layer3_4x + NTT_Layer3_4x + +.align 4 + /* + * Layer 4. 
len = 16 + */ + mr 5, 3 + li 10, 32 // offset to next peer + li 11, 16 + li 12, 32+16 + li 15, 64 + li 16, 64+32 + li 17, 64+16 + li 18, 96+16 + + NTT_Layer4_4x + NTT_Layer4_4x + NTT_Layer4_4x + NTT_Layer4_4x + +.align 4 + /* + * Layer 5. len = 8 + */ + mr 5, 3 + li 10, 16 // offset to next peer + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + + NTT_Layer5_4x + NTT_Layer5_4x + NTT_Layer5_4x + NTT_Layer5_4x + +.align 4 + /* + * Layer 6. len = 4 + * Load zeta vectors in 4-4 layout + */ + mr 5, 3 + + NTT_Layer6_4x + NTT_Layer6_4x + NTT_Layer6_4x + NTT_Layer6_4x + +.align 4 + /* + * Layer 7. len = 2 + * Load zeta vectors in 2-2-2-2 layout + */ + mr 5, 3 + li 10, 16 + + NTT_Layer7_4x + NTT_Layer7_4x + NTT_Layer7_4x + NTT_Layer7_4x + + RESTORE_REGS + blr +.size mlkem_ntt_ppc,.-mlkem_ntt_ppc + +.rodata +.align 4 +ntt_consts: +.short kPrime, kPrime, kPrime, kPrime, kPrime, kPrime, kPrime, kPrime +.long kPrime, kPrime, kPrime, kPrime +.long kBarrettMultiplier, kBarrettMultiplier, kBarrettMultiplier, kBarrettMultiplier +.long kBarrettShift, 0, kBarrettShift, 0 +.short kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree, kInverseDegree , kInverseDegree + +/* zetas */ +mlkem_ntt_zetas: +/* For ntt len = 128, 64, 32, 16, 8 */ +.short 1729, 1729, 1729, 1729, 1729, 1729, 1729, 1729 +.short 2580, 2580, 2580, 2580, 2580, 2580, 2580, 2580 +.short 3289, 3289, 3289, 3289, 3289, 3289, 3289, 3289 +.short 2642, 2642, 2642, 2642, 2642, 2642, 2642, 2642 +.short 630, 630, 630, 630, 630, 630, 630, 630 +.short 1897, 1897, 1897, 1897, 1897, 1897, 1897, 1897 +.short 848, 848, 848, 848, 848, 848, 848, 848 +.short 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062 +.short 1919, 1919, 1919, 1919, 1919, 1919, 1919, 1919 +.short 193, 193, 193, 193, 193, 193, 193, 193 +.short 797, 797, 797, 797, 797, 797, 797, 797 +.short 2786, 2786, 2786, 2786, 2786, 2786, 2786, 2786 +.short 3260, 3260, 3260, 3260, 3260, 3260, 3260, 3260 +.short 569, 
569, 569, 569, 569, 569, 569, 569 +.short 1746, 1746, 1746, 1746, 1746, 1746, 1746, 1746 +.short 296, 296, 296, 296, 296, 296, 296, 296 +.short 2447, 2447, 2447, 2447, 2447, 2447, 2447, 2447 +.short 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339 +.short 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476 +.short 3046, 3046, 3046, 3046, 3046, 3046, 3046, 3046 +.short 56, 56, 56, 56, 56, 56, 56, 56 +.short 2240, 2240, 2240, 2240, 2240, 2240, 2240, 2240 +.short 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333 +.short 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426 +.short 2094, 2094, 2094, 2094, 2094, 2094, 2094, 2094 +.short 535, 535, 535, 535, 535, 535, 535, 535 +.short 2882, 2882, 2882, 2882, 2882, 2882, 2882, 2882 +.short 2393, 2393, 2393, 2393, 2393, 2393, 2393, 2393 +.short 2879, 2879, 2879, 2879, 2879, 2879, 2879, 2879 +.short 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974 +.short 821, 821, 821, 821, 821, 821, 821, 821 +/* For ntt len = 4 */ +.short 289, 289, 289, 289, 331, 331, 331, 331 +.short 3253, 3253, 3253, 3253, 1756, 1756, 1756, 1756 +.short 1197, 1197, 1197, 1197, 2304, 2304, 2304, 2304 +.short 2277, 2277, 2277, 2277, 2055, 2055, 2055, 2055 +.short 650, 650, 650, 650, 1977, 1977, 1977, 1977 +.short 2513, 2513, 2513, 2513, 632, 632, 632, 632 +.short 2865, 2865, 2865, 2865, 33, 33, 33, 33 +.short 1320, 1320, 1320, 1320, 1915, 1915, 1915, 1915 +.short 2319, 2319, 2319, 2319, 1435, 1435, 1435, 1435 +.short 807, 807, 807, 807, 452, 452, 452, 452 +.short 1438, 1438, 1438, 1438, 2868, 2868, 2868, 2868 +.short 1534, 1534, 1534, 1534, 2402, 2402, 2402, 2402 +.short 2647, 2647, 2647, 2647, 2617, 2617, 2617, 2617 +.short 1481, 1481, 1481, 1481, 648, 648, 648, 648 +.short 2474, 2474, 2474, 2474, 3110, 3110, 3110, 3110 +.short 1227, 1227, 1227, 1227, 910, 910, 910, 910 +/* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (4, 2, 3, 1) + * Transpose z[0], z[1], z[2], z[3] + * -> z[4], z[4], z[2], z[2], z[3], z[3], z[1], z[1] + */ +.short 2649, 2649, 2761, 2761, 583, 
583, 17, 17 +.short 1100, 1100, 723, 723, 2288, 2288, 1637, 1637 +.short 233, 233, 2662, 2662, 3281, 3281, 1409, 1409 +.short 3050, 3050, 2156, 2156, 3015, 3015, 756, 756 +.short 1789, 1789, 1651, 1651, 2789, 2789, 1703, 1703 +.short 2687, 2687, 952, 952, 1461, 1461, 1847, 1847 +.short 2388, 2388, 2308, 2308, 2437, 2437, 939, 939 +.short 641, 641, 2337, 2337, 268, 268, 733, 733 +.short 3220, 3220, 2298, 2298, 2037, 2037, 1584, 1584 +.short 1645, 1645, 2549, 2549, 2090, 2090, 375, 375 +.short 757, 757, 319, 319, 2773, 2773, 1063, 1063 +.short 2594, 2594, 561, 561, 2466, 2466, 2099, 2099 +.short 1026, 1026, 1092, 1092, 403, 403, 2804, 2804 +.short 886, 886, 2150, 2150, 2775, 2775, 1143, 1143 +.short 1029, 1029, 1212, 1212, 1874, 1874, 1722, 1722 +.short 2154, 2154, 2935, 2935, 885, 885, 2110, 2110 diff --git a/crypto/ml_kem/asm/mlkem_ppc_macros_asm.inc b/crypto/ml_kem/asm/mlkem_ppc_macros_asm.inc new file mode 100644 index 00000000000..58598ccf588 --- /dev/null +++ b/crypto/ml_kem/asm/mlkem_ppc_macros_asm.inc @@ -0,0 +1,152 @@ +/* + * Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright IBM Corp. 2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* ===================================================================== */ +/* Barrett reduce vectors and constants */ +#define V_kPrime16 1 +#define V_kPrime32 2 +#define V_kBarrettMultiplier 3 +#define V_kBarrettShift 4 +#define V_kInverseDegree 5 + +#define ML_KEM_PRIME 3329 +#define kPrime 3329 +#define kBarrettShift 24 +#define kBarrettMultiplier 5039 +#define kInverseDegree 3303 + +/* + * reduce_once reduces 0 <= x < 2*kPrime, mod kPrime. 
+ * + * Subtract |q| if the input is larger, without exposing a side-channel, + * avoiding the "clangover" attack. See |constish_time_non_zero| for a + * discussion on why the value barrier is by default omitted. + */ +.macro reduce_once _x + vsubuhm 6, \_x, V_kPrime16 + vcmpgtuh 7, V_kPrime16, 6 + xxsel 32+\_x, 32+\_x, 32+6, 32+7 +.endm + +/* reduce_once_4x: pipeline reduce_once in 4x */ +.macro reduce_once_4x _x1, _x2, _x3, _x4 + vsubuhm 6, \_x1, V_kPrime16 + vsubuhm 8, \_x2, V_kPrime16 + vsubuhm 10, \_x3, V_kPrime16 + vsubuhm 12, \_x4, V_kPrime16 + vcmpgtuh 7, V_kPrime16, 6 + vcmpgtuh 9, V_kPrime16, 8 + vcmpgtuh 11, V_kPrime16, 10 + vcmpgtuh 13, V_kPrime16, 12 + xxsel 32+\_x1, 32+\_x1, 32+6, 32+7 + xxsel 32+\_x2, 32+\_x2, 32+8, 32+9 + xxsel 32+\_x3, 32+\_x3, 32+10, 32+11 + xxsel 32+\_x4, 32+\_x4, 32+12, 32+13 +.endm + +/* scalar multiplication = coeff * zeta */ +.macro zeta_scalar_mul _coeff, _zeta, _x0, _x1 + vmulouh 6, \_coeff, \_zeta + vmuleuh 7, \_coeff, \_zeta + xxmrglw 32+\_x0, 32+7, 32+6 + xxmrghw 32+\_x1, 32+7, 32+6 +.endm + +/* + * Constant-time reduce x mod kPrime using Barrett reduction. x must be less + * than kPrime + 2 * kPrime^2. This is sufficient to reduce a product of + * two already reduced u_int16 values, in fact it is sufficient for each + * to be less than 2^12, because (kPrime * (2 * kPrime + 1)) > 2^24. + * + * This macro handles 8 coefficients elements. 
+ * Input vectors: x0, x1 + * Output vector: updated peer + * Scratch vectors: v6, v7, v10, v11, v12, v13 + * + * Barrett_reduce_delayed- + * We delay reduce_once in 4x pipeline in each layer + */ +.macro Barrett_reduce_delayed _x0, _x1, _updated_peer + /* uint64_t product = (uint64_t)x * kBarrettMultiplier; */ + vmulouw 10, \_x0, V_kBarrettMultiplier + vmuleuw 11, \_x0, V_kBarrettMultiplier + vmulouw 12, \_x1, V_kBarrettMultiplier + vmuleuw 13, \_x1, V_kBarrettMultiplier + + /* uint32_t quotient = (uint32_t)(product >> kBarrettShift); */ + vsrd 10, 10, V_kBarrettShift + vsrd 11, 11, V_kBarrettShift + vsrd 12, 12, V_kBarrettShift + vsrd 13, 13, V_kBarrettShift + vmrgow 6, 11, 10 + vmrgow 7, 13, 12 + + /* uint32_t remainder = x - quotient * kPrime; */ + vmuluwm 10, 6, V_kPrime32 + vmuluwm 11, 7, V_kPrime32 + vsubuwm 6, \_x0, 10 + vsubuwm 7, \_x1, 11 + vpkuwus \_updated_peer, 7, 6 +.endm + +.macro Barrett_reduce _x0, _x1, _updated_peer + Barrett_reduce_delayed \_x0, \_x1, \_updated_peer + reduce_once \_updated_peer +.endm + +/* + * Barrett_reduce_delayed_2x - 2 Barrett_reduce_delayed in parallel + * This is only 2x since we don't have enough vectors to support 4x pipeline. 
+ */ +.macro Barrett_reduce_delayed_2x _x0, _x1, _y0, _y1, _updated_peer1, _updated_peer2 + /* uint64_t product = (uint64_t)x * kBarrettMultiplier; */ + vmulouw 10, \_x0, V_kBarrettMultiplier + vmuleuw 11, \_x0, V_kBarrettMultiplier + vmulouw 12, \_x1, V_kBarrettMultiplier + vmuleuw 13, \_x1, V_kBarrettMultiplier + + vmulouw 14, \_y0, V_kBarrettMultiplier + vmuleuw 15, \_y0, V_kBarrettMultiplier + vmulouw 16, \_y1, V_kBarrettMultiplier + vmuleuw 17, \_y1, V_kBarrettMultiplier + + /* uint32_t quotient = (uint32_t)(product >> kBarrettShift); */ + vsrd 10, 10, V_kBarrettShift + vsrd 11, 11, V_kBarrettShift + vsrd 12, 12, V_kBarrettShift + vsrd 13, 13, V_kBarrettShift + + vsrd 14, 14, V_kBarrettShift + vsrd 15, 15, V_kBarrettShift + vsrd 16, 16, V_kBarrettShift + vsrd 17, 17, V_kBarrettShift + vmrgow 6, 11, 10 + vmrgow 7, 13, 12 + vmrgow 12, 15, 14 + vmrgow 13, 17, 16 + + /* uint32_t remainder = x - quotient * kPrime; */ + vmuluwm 10, 6, V_kPrime32 + vmuluwm 11, 7, V_kPrime32 + vmuluwm 14, 12, V_kPrime32 + vmuluwm 15, 13, V_kPrime32 + vsubuwm 6, \_x0, 10 + vsubuwm 7, \_x1, 11 + vsubuwm 16, \_y0, 14 + vsubuwm 17, \_y1, 15 + vpkuwus \_updated_peer1, 7, 6 + vpkuwus \_updated_peer2, 17, 16 +.endm +/* ===================================================================== */ diff --git a/crypto/ml_kem/build.info b/crypto/ml_kem/build.info index e2ea88b35dc..e5937019185 100644 --- a/crypto/ml_kem/build.info +++ b/crypto/ml_kem/build.info @@ -1,6 +1,29 @@ LIBS = ../../libcrypto +$MLKEMASM= +IF[{- !$disabled{asm} -}] + $MLKEMDEF_ppc64=MLKEM_NTT_PPC_ASM + + IF[{- $target{sys_id} ne "AIX" && $target{sys_id} ne "MACOSX" -}] + $MLKEMASM_ppc64=asm/mlkem_ntt_ppc64le.S asm/mlkem_intt_ppc64le.S + ENDIF + + # Now that we have defined all the arch specific variables, use the + # appropriate one, and define the appropriate macros + IF[$MLKEMASM_{- $target{asm_arch} -}] + $MLKEMASM=$MLKEMASM_{- $target{asm_arch} -} + $MLKEMDEF=$MLKEMDEF_{- $target{asm_arch} -} + ENDIF +ENDIF + 
+$COMMON=ml_kem.c $MLKEMASM + IF[{- !$disabled{'ml-kem'} -}] - SOURCE[../../libcrypto]=ml_kem.c - SOURCE[../../providers/libfips.a]=ml_kem.c + SOURCE[../../libcrypto]=$COMMON + SOURCE[../../providers/libfips.a]=$COMMON ENDIF + +# Implementations are now spread across several libraries, so the defines +# need to be applied to all affected libraries and modules. +DEFINE[../../libcrypto]=$MLKEMDEF +DEFINE[../../providers/libfips.a]=$MLKEMDEF diff --git a/crypto/ml_kem/ml_kem.c b/crypto/ml_kem/ml_kem.c index 9ea824ccffd..04ba04cdc9c 100644 --- a/crypto/ml_kem/ml_kem.c +++ b/crypto/ml_kem/ml_kem.c @@ -456,6 +456,58 @@ static __owur int sample_scalar(scalar *out, EVP_MD_CTX *mdctx) return 1; } +static CRYPTO_ONCE ml_kem_ntt_once = CRYPTO_ONCE_STATIC_INIT; + +#if defined(_ARCH_PPC64) +#include "crypto/ppc_arch.h" +#endif + +#if defined(MLKEM_NTT_PPC_ASM) && defined(_ARCH_PPC64) +/* + * PPC64LE Platform supports. + */ +typedef void (*ml_kem_scalar_ntt_fn)(scalar *p); +typedef void (*ml_kem_scalar_inverse_ntt_fn)(scalar *p); + +static void scalar_ntt_generic(scalar *p); +static void scalar_inverse_ntt_generic(scalar *p); + +static ml_kem_scalar_ntt_fn scalar_ntt = scalar_ntt_generic; +static ml_kem_scalar_inverse_ntt_fn scalar_inverse_ntt = scalar_inverse_ntt_generic; + +void mlkem_ntt_ppc(uint16_t *c); +void mlkem_inverse_ntt_ppc(uint16_t *c); + +static void scalar_ntt_ppc(scalar *s) +{ + mlkem_ntt_ppc(s->c); +} + +static void scalar_inverse_ntt_ppc(scalar *s) +{ + mlkem_inverse_ntt_ppc(s->c); +} +#else +#define scalar_ntt_generic scalar_ntt +#define scalar_inverse_ntt_generic scalar_inverse_ntt +#endif + +/* + * Initialize NTT function pointers to PPC64le implementations if available. + * Scalar implementations are used by default. 
+ */
+static void ml_kem_ntt_init(void)
+{
+#if defined(MLKEM_NTT_PPC_ASM) && defined(_ARCH_PPC64)
+#if defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) /* defined() guard: if both macros were undefined, 0 == 0 would wrongly match */
+    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
+        scalar_ntt = scalar_ntt_ppc;
+        scalar_inverse_ntt = scalar_inverse_ntt_ppc;
+    }
+#endif
+#endif
+}
+
 /*-
  * reduce_once reduces 0 <= x < 2*kPrime, mod kPrime.
  *
@@ -506,7 +558,7 @@ static void scalar_mult_const(scalar *s, uint16_t a)
  * elements in GF(3329^2), with the coefficients of the elements being
  * consecutive entries in |s->c|.
  */
-static void scalar_ntt(scalar *s)
+static void scalar_ntt_generic(scalar *s)
 {
     const uint16_t *roots = kNTTRoots;
     uint16_t *end = s->c + DEGREE;
@@ -538,7 +590,7 @@
  * iFFT to account for the fact that 3329 does not have a 512th root of unity,
  * using the precomputed 128 roots of unity stored in InverseNTTRoots.
  */
-static void scalar_inverse_ntt(scalar *s)
+static void scalar_inverse_ntt_generic(scalar *s)
 {
     const uint16_t *roots = kInverseNTTRoots;
     uint16_t *end = s->c + DEGREE;
@@ -1616,6 +1668,8 @@ void ossl_ml_kem_key_reset(ML_KEM_KEY *key)
 /* Retrieve the parameters of one of the ML-KEM variants */
 const ML_KEM_VINFO *ossl_ml_kem_get_vinfo(int evp_type)
 {
+    (void)CRYPTO_THREAD_run_once(&ml_kem_ntt_once, ml_kem_ntt_init);
+
     switch (evp_type) {
     case EVP_PKEY_ML_KEM_512:
         return &vinfo_map[ML_KEM_512_VINFO];