/* Optimized SSE2 implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher

   Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
*/
/* Based on D. J. Bernstein reference implementation at
   http://cr.yp.to/chacha.html:

   chacha-regs.c version 20080118
   D. J. Bernstein
   Public domain.  */
#include <sysdep.h>
#include <isa-level.h>
#if MINIMUM_X86_ISA_LEVEL <= 2

#ifdef PIC
#  define rRIP (%rip)
#else
#  define rRIP
#endif
/* 'ret' instruction replacement for straight-line speculation mitigation */
#define ret_spec_stop \
	ret; int3;
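/* Note: the int3 is never reached architecturally; it only keeps the CPU
   from speculatively executing the instructions that follow a mispredicted
   'ret' (straight-line speculation).  */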
/* register macros */
#define INPUT %rdi
#define DST   %rsi
#define SRC   %rdx
#define NBLKS %rcx
#define ROUND %eax

/* stack structure */
#define STACK_VEC_X12 (16)
#define STACK_VEC_X13 (16 + STACK_VEC_X12)
#define STACK_TMP     (16 + STACK_VEC_X13)
#define STACK_TMP1    (16 + STACK_TMP)
#define STACK_TMP2    (16 + STACK_TMP1)

#define STACK_MAX     (16 + STACK_TMP2)
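/* The two STACK_VEC slots keep the per-lane counter vectors X12/X13 live
   across the round loop; the three STACK_TMP slots are used to spill xmm
   registers, since the 4-way state occupies all 16 of them.  */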
/* vector registers */
#define X0 %xmm0
#define X1 %xmm1
#define X2 %xmm2
#define X3 %xmm3
#define X4 %xmm4
#define X5 %xmm5
#define X6 %xmm6
#define X7 %xmm7
#define X8 %xmm8
#define X9 %xmm9
#define X10 %xmm10
#define X11 %xmm11
#define X12 %xmm12
#define X13 %xmm13
#define X14 %xmm14
#define X15 %xmm15
/**********************************************************************
  helper macros
 **********************************************************************/
/* 4x4 32-bit integer matrix transpose */
#define TRANSPOSE_4x4(x0, x1, x2, x3, t1, t2, t3) \
	movdqa    x0, t2; \
	punpckhdq x1, t2; \
	punpckldq x1, x0; \
	\
	movdqa    x2, t1; \
	punpckldq x3, t1; \
	punpckhdq x3, x2; \
	\
	movdqa    x0, x1; \
	punpckhqdq t1, x1; \
	punpcklqdq t1, x0; \
	\
	movdqa    t2, x3; \
	punpckhqdq x2, x3; \
	punpcklqdq x2, t2; \
	movdqa    t2, x2;
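/* The punpckldq/punpckhdq pass interleaves the 32-bit lanes of row pairs;
   the punpcklqdq/punpckhqdq pass then interleaves their 64-bit halves.
   The two passes together realize the 4x4 transpose.  */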
/* fill xmm register with 32-bit value from memory */
#define PBROADCASTD(mem32, xreg) \
	movd mem32, xreg; \
	pshufd $0, xreg, xreg;
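/* SSE2 has no broadcast instruction, so a movd load followed by a
   0,0,0,0 shuffle stands in for AVX2's vpbroadcastd.  */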
/**********************************************************************
  4-way chacha20
 **********************************************************************/
#define ROTATE2(v1,v2,c,tmp1,tmp2) \
	movdqa v1, tmp1; \
	movdqa v2, tmp2; \
	psrld $(32 - (c)), v1; \
	pslld $(c), tmp1; \
	por tmp1, v1; \
	psrld $(32 - (c)), v2; \
	pslld $(c), tmp2; \
	por tmp2, v2;

#define XOR(ds,s) \
	pxor s, ds;

#define PLUS(ds,s) \
	paddd s, ds;
#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
	    ROTATE2(d1, d2, 16, tmp1, tmp2); \
	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
	    ROTATE2(b1, b2, 12, tmp1, tmp2); \
	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
	    ROTATE2(d1, d2, 8, tmp1, tmp2); \
	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
	    ROTATE2(b1, b2, 7, tmp1, tmp2);
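/* Two independent quarter-rounds are interleaved so the second pair of
   instructions can issue while the first still waits on its results; the
   16/12/8/7 rotation amounts are those of the ChaCha20 specification.  */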
	.section .text.sse2,"ax",@progbits

chacha20_data:
	.align 16
L(inc_counter):
	.long 0,1,2,3

L(unsigned_cmp):
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.hidden __chacha20_sse2_blocks4
ENTRY (__chacha20_sse2_blocks4)
	/* input:
	 *	%rdi: input
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: nblks (multiple of 4)
	 */
	pushq %rbp;
	cfi_adjust_cfa_offset(8);
	cfi_rel_offset(rbp, 0)
	movq %rsp, %rbp;
	cfi_def_cfa_register(%rbp);

	subq $STACK_MAX, %rsp;
	andq $~15, %rsp;

L(loop4):
	mov $20, ROUND;
	/* Construct counter vectors X12 and X13 */
	movdqa L(inc_counter) rRIP, X0;
	movdqa L(unsigned_cmp) rRIP, X2;
	PBROADCASTD((12 * 4)(INPUT), X12);
	PBROADCASTD((13 * 4)(INPUT), X13);
	paddd X0, X12;
	movdqa X12, X1;
	pxor X2, X0;
	pxor X2, X1;
	pcmpgtd X1, X0;
	psubd X0, X13;
	movdqa X12, (STACK_VEC_X12)(%rsp);
	movdqa X13, (STACK_VEC_X13)(%rsp);
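	/* The above computes X12 = counter_low + {0,1,2,3} and propagates the
	   carry into X13: SSE2 only has signed compares, so both operands are
	   sign-flipped with 0x80000000 to turn pcmpgtd into an unsigned
	   overflow test, and psubd subtracts the resulting all-ones mask
	   (i.e. adds 1) in each lane that wrapped.  */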
	/* Load vectors */
	PBROADCASTD((0 * 4)(INPUT), X0);
	PBROADCASTD((1 * 4)(INPUT), X1);
	PBROADCASTD((2 * 4)(INPUT), X2);
	PBROADCASTD((3 * 4)(INPUT), X3);
	PBROADCASTD((4 * 4)(INPUT), X4);
	PBROADCASTD((5 * 4)(INPUT), X5);
	PBROADCASTD((6 * 4)(INPUT), X6);
	PBROADCASTD((7 * 4)(INPUT), X7);
	PBROADCASTD((8 * 4)(INPUT), X8);
	PBROADCASTD((9 * 4)(INPUT), X9);
	PBROADCASTD((10 * 4)(INPUT), X10);
	PBROADCASTD((11 * 4)(INPUT), X11);
	PBROADCASTD((14 * 4)(INPUT), X14);
	PBROADCASTD((15 * 4)(INPUT), X15);
	movdqa X11, (STACK_TMP)(%rsp);
	movdqa X15, (STACK_TMP1)(%rsp);
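	/* All 16 state words are lane-sliced across all 16 xmm registers, so
	   the round loop has no free temporaries: X11/X15 (and in turn X8/X9)
	   are spilled to the STACK_TMP slots while their registers serve as
	   scratch for ROTATE2.  */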
L(round2):
	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15)
	movdqa (STACK_TMP)(%rsp), X11;
	movdqa (STACK_TMP1)(%rsp), X15;
	movdqa X8, (STACK_TMP)(%rsp);
	movdqa X9, (STACK_TMP1)(%rsp);
	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9)
	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9)
	movdqa (STACK_TMP)(%rsp), X8;
	movdqa (STACK_TMP1)(%rsp), X9;
	movdqa X11, (STACK_TMP)(%rsp);
	movdqa X15, (STACK_TMP1)(%rsp);
	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15)
	sub $2, ROUND;
	jnz L(round2);
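	/* Each pass through L(round2) performs one column round and one
	   diagonal round, so ROUND counts down from 20 by 2 for the ten
	   double-rounds, leaving eax at zero on loop exit.  */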
	/* tmp := X15 */
	movdqa (STACK_TMP)(%rsp), X11;
	PBROADCASTD((0 * 4)(INPUT), X15);
	PLUS(X0, X15);
	PBROADCASTD((1 * 4)(INPUT), X15);
	PLUS(X1, X15);
	PBROADCASTD((2 * 4)(INPUT), X15);
	PLUS(X2, X15);
	PBROADCASTD((3 * 4)(INPUT), X15);
	PLUS(X3, X15);
	PBROADCASTD((4 * 4)(INPUT), X15);
	PLUS(X4, X15);
	PBROADCASTD((5 * 4)(INPUT), X15);
	PLUS(X5, X15);
	PBROADCASTD((6 * 4)(INPUT), X15);
	PLUS(X6, X15);
	PBROADCASTD((7 * 4)(INPUT), X15);
	PLUS(X7, X15);
	PBROADCASTD((8 * 4)(INPUT), X15);
	PLUS(X8, X15);
	PBROADCASTD((9 * 4)(INPUT), X15);
	PLUS(X9, X15);
	PBROADCASTD((10 * 4)(INPUT), X15);
	PLUS(X10, X15);
	PBROADCASTD((11 * 4)(INPUT), X15);
	PLUS(X11, X15);
	movdqa (STACK_VEC_X12)(%rsp), X15;
	PLUS(X12, X15);
	movdqa (STACK_VEC_X13)(%rsp), X15;
	PLUS(X13, X15);
	movdqa X13, (STACK_TMP)(%rsp);
	PBROADCASTD((14 * 4)(INPUT), X15);
	PLUS(X14, X15);
	movdqa (STACK_TMP1)(%rsp), X15;
	movdqa X14, (STACK_TMP1)(%rsp);
	PBROADCASTD((15 * 4)(INPUT), X13);
	PLUS(X15, X13);
	movdqa X15, (STACK_TMP2)(%rsp);
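	/* Per the ChaCha20 definition, the input words (and, for words 12/13,
	   the saved per-lane counter vectors) have now been added back into
	   the permuted state, with X15 cycling as the broadcast temporary.  */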
	/* Update counter */
	addq $4, (12 * 4)(INPUT);
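	/* Four blocks were generated, so the block counter at words 12/13 is
	   advanced by 4; the qword add matches the 64-bit carry handling used
	   when the counter vectors were constructed above.  */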
	TRANSPOSE_4x4(X0, X1, X2, X3, X13, X14, X15);
	movdqu X0, (64 * 0 + 16 * 0)(DST)
	movdqu X1, (64 * 1 + 16 * 0)(DST)
	movdqu X2, (64 * 2 + 16 * 0)(DST)
	movdqu X3, (64 * 3 + 16 * 0)(DST)
	TRANSPOSE_4x4(X4, X5, X6, X7, X0, X1, X2);
	movdqa (STACK_TMP)(%rsp), X13;
	movdqa (STACK_TMP1)(%rsp), X14;
	movdqa (STACK_TMP2)(%rsp), X15;
	movdqu X4, (64 * 0 + 16 * 1)(DST)
	movdqu X5, (64 * 1 + 16 * 1)(DST)
	movdqu X6, (64 * 2 + 16 * 1)(DST)
	movdqu X7, (64 * 3 + 16 * 1)(DST)
	TRANSPOSE_4x4(X8, X9, X10, X11, X0, X1, X2);
	movdqu X8, (64 * 0 + 16 * 2)(DST)
	movdqu X9, (64 * 1 + 16 * 2)(DST)
	movdqu X10, (64 * 2 + 16 * 2)(DST)
	movdqu X11, (64 * 3 + 16 * 2)(DST)
	TRANSPOSE_4x4(X12, X13, X14, X15, X0, X1, X2);
	movdqu X12, (64 * 0 + 16 * 3)(DST)
	movdqu X13, (64 * 1 + 16 * 3)(DST)
	movdqu X14, (64 * 2 + 16 * 3)(DST)
	movdqu X15, (64 * 3 + 16 * 3)(DST)
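	/* Up to here each xmm register held one state word for four blocks;
	   the four transposes regroup them so DST receives four consecutive
	   64-byte blocks.  Note that SRC is only advanced below, never read:
	   this routine writes raw keystream rather than xor-ing input data. */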
	sub $4, NBLKS;
	lea (4 * 64)(DST), DST;
	lea (4 * 64)(SRC), SRC;
	jnz L(loop4);
	/* eax zeroed by round loop. */
	leave;
	cfi_adjust_cfa_offset(-8)
	cfi_def_cfa_register(%rsp);
	ret_spec_stop;
END (__chacha20_sse2_blocks4)
#endif /* if MINIMUM_X86_ISA_LEVEL <= 2 */