/* Optimized s390x implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher

   Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

#include <sysdep.h>

#ifdef HAVE_S390_VX_ASM_SUPPORT

/* CFA expressions are used for pointing CFA and registers to
 * SP relative offsets. */
# define DW_REGNO_SP 15

/* Fixed length encoding used for integers for now. */
# define DW_SLEB128_7BIT(value) \
        0x00|((value) & 0x7f)
# define DW_SLEB128_28BIT(value) \
        0x80|((value)&0x7f), \
        0x80|(((value)>>7)&0x7f), \
        0x80|(((value)>>14)&0x7f), \
        0x00|(((value)>>21)&0x7f)

# define cfi_cfa_on_stack(rsp_offs,cfa_depth) \
        .cfi_escape \
          0x0f, /* DW_CFA_def_cfa_expression */ \
            DW_SLEB128_7BIT(11), /* length */ \
            0x7f, /* DW_OP_breg15, rsp + constant */ \
              DW_SLEB128_28BIT(rsp_offs), \
            0x06, /* DW_OP_deref */ \
            0x23, /* DW_OP_plus_constu */ \
              DW_SLEB128_28BIT((cfa_depth)+160)
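
/* The escape above encodes CFA = *(%r15 + rsp_offs) + cfa_depth + 160:
 * the saved stack pointer is reloaded through the back-chain slot and the
 * 160-byte register save area defined by the s390x ELF ABI is added back
 * on top of the requested depth. */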

.machine "z13+vx"
.text

.balign 16
.Lconsts:
.Lwordswap:
        .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lbswap128:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap32:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lone:
        .long 0, 0, 0, 1
.Ladd_counter_0123:
        .long 0, 1, 2, 3
.Ladd_counter_4567:
        .long 4, 5, 6, 7
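
/* Note: .Lbswap32 reverses the bytes of each 32-bit word (used below to
 * emit the keystream in little-endian order on this big-endian machine),
 * and the .Ladd_counter_* vectors hold the per-lane block-counter
 * increments for blocks 0-3 and 4-7 of the 8-way loop. */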

/* register macros */
#define INPUT %r2
#define DST %r3
#define SRC %r4
#define NBLKS %r0
#define ROUND %r1

/* stack structure */

#define STACK_FRAME_STD (8 * 16 + 8 * 4)
#define STACK_FRAME_F8_F15 (8 * 8)
#define STACK_FRAME_Y0_Y15 (16 * 16)
#define STACK_FRAME_CTR (4 * 16)
#define STACK_FRAME_PARAMS (6 * 8)

#define STACK_MAX (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \
                   STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \
                   STACK_FRAME_PARAMS)

#define STACK_F8 (STACK_MAX - STACK_FRAME_F8_F15)
#define STACK_F9 (STACK_F8 + 8)
#define STACK_F10 (STACK_F9 + 8)
#define STACK_F11 (STACK_F10 + 8)
#define STACK_F12 (STACK_F11 + 8)
#define STACK_F13 (STACK_F12 + 8)
#define STACK_F14 (STACK_F13 + 8)
#define STACK_F15 (STACK_F14 + 8)
#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15)
#define STACK_CTR (STACK_Y0_Y15 - STACK_FRAME_CTR)
#define STACK_INPUT (STACK_CTR - STACK_FRAME_PARAMS)
#define STACK_DST (STACK_INPUT + 8)
#define STACK_SRC (STACK_DST + 8)
#define STACK_NBLKS (STACK_SRC + 8)
#define STACK_POCTX (STACK_NBLKS + 8)
#define STACK_POSRC (STACK_POCTX + 8)

#define STACK_G0_H3 STACK_Y0_Y15
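
/* Resulting frame layout (byte offsets from the realigned %r15):
 *     0 ... 159  standard register save area  (STACK_FRAME_STD)
 *   160 ... 207  spilled parameters           (STACK_INPUT .. STACK_POSRC)
 *   208 ... 271  saved counter vectors        (STACK_CTR)
 *   272 ... 527  spilled Y0-Y15 state         (STACK_Y0_Y15)
 *   528 ... 591  call-saved %f8-%f15          (STACK_F8 .. STACK_F15) */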

/* vector registers */
#define A0 %v0
#define A1 %v1
#define A2 %v2
#define A3 %v3

#define B0 %v4
#define B1 %v5
#define B2 %v6
#define B3 %v7

#define C0 %v8
#define C1 %v9
#define C2 %v10
#define C3 %v11

#define D0 %v12
#define D1 %v13
#define D2 %v14
#define D3 %v15

#define E0 %v16
#define E1 %v17
#define E2 %v18
#define E3 %v19

#define F0 %v20
#define F1 %v21
#define F2 %v22
#define F3 %v23

#define G0 %v24
#define G1 %v25
#define G2 %v26
#define G3 %v27

#define H0 %v28
#define H1 %v29
#define H2 %v30
#define H3 %v31

#define IO0 E0
#define IO1 E1
#define IO2 E2
#define IO3 E3
#define IO4 F0
#define IO5 F1
#define IO6 F2
#define IO7 F3

#define S0 G0
#define S1 G1
#define S2 G2
#define S3 G3

#define TMP0 H0
#define TMP1 H1
#define TMP2 H2
#define TMP3 H3

#define X0 A0
#define X1 A1
#define X2 A2
#define X3 A3
#define X4 B0
#define X5 B1
#define X6 B2
#define X7 B3
#define X8 C0
#define X9 C1
#define X10 C2
#define X11 C3
#define X12 D0
#define X13 D1
#define X14 D2
#define X15 D3

#define Y0 E0
#define Y1 E1
#define Y2 E2
#define Y3 E3
#define Y4 F0
#define Y5 F1
#define Y6 F2
#define Y7 F3
#define Y8 G0
#define Y9 G1
#define Y10 G2
#define Y11 G3
#define Y12 H0
#define Y13 H1
#define Y14 H2
#define Y15 H3
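
/* In the 8-way code below, X0-X15 (aliases of A0-D3, %v0-%v15) hold the 16
 * ChaCha20 state words of blocks 0-3, one block per 32-bit lane, while
 * Y0-Y15 (aliases of E0-H3, %v16-%v31) hold blocks 4-7.  IO*, S* and TMP*
 * are further aliases onto the E-H registers. */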

/**********************************************************************
  helper macros
 **********************************************************************/

#define _ /*_*/

#define START_STACK(last_r) \
        lgr %r0, %r15; \
        lghi %r1, ~15; \
        stmg %r6, last_r, 6 * 8(%r15); \
        aghi %r0, -STACK_MAX; \
        ngr %r0, %r1; \
        lgr %r1, %r15; \
        cfi_def_cfa_register(1); \
        lgr %r15, %r0; \
        stg %r1, 0(%r15); \
        cfi_cfa_on_stack(0, 0); \
        std %f8, STACK_F8(%r15); \
        std %f9, STACK_F9(%r15); \
        std %f10, STACK_F10(%r15); \
        std %f11, STACK_F11(%r15); \
        std %f12, STACK_F12(%r15); \
        std %f13, STACK_F13(%r15); \
        std %f14, STACK_F14(%r15); \
        std %f15, STACK_F15(%r15);
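
/* START_STACK saves the call-saved GPRs %r6..last_r in the caller's frame,
 * carves out a 16-byte-aligned frame of at least STACK_MAX bytes, stores
 * the old stack pointer at 0(%r15) as a back chain (with matching CFI via
 * cfi_cfa_on_stack) and spills the call-saved FPRs %f8-%f15.
 * END_STACK below undoes all of this. */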

#define END_STACK(last_r) \
        lg %r1, 0(%r15); \
        ld %f8, STACK_F8(%r15); \
        ld %f9, STACK_F9(%r15); \
        ld %f10, STACK_F10(%r15); \
        ld %f11, STACK_F11(%r15); \
        ld %f12, STACK_F12(%r15); \
        ld %f13, STACK_F13(%r15); \
        ld %f14, STACK_F14(%r15); \
        ld %f15, STACK_F15(%r15); \
        lmg %r6, last_r, 6 * 8(%r1); \
        lgr %r15, %r1; \
        cfi_def_cfa_register(DW_REGNO_SP);

#define PLUS(dst,src) \
        vaf dst, dst, src;

#define XOR(dst,src) \
        vx dst, dst, src;

#define ROTATE(v1,c) \
        verllf v1, v1, (c)(0);

#define WORD_ROTATE(v1,s) \
        vsldb v1, v1, v1, ((s) * 4);
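
/* vaf/vx/verllf/vsldb are the z/Architecture vector add (word), exclusive
 * or, element rotate left logical (word) and shift left double by byte
 * instructions, so each of these macros operates on four 32-bit state
 * words at once. */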

#define DST_8(OPER, I, J) \
        OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \
        OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J);

/**********************************************************************
  round macros
 **********************************************************************/

/**********************************************************************
  8-way chacha20 ("vertical")
 **********************************************************************/

#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
                              x8,x9,x10,x11,x12,x13,x14,x15,\
                              y0,y1,y2,y3,y4,y5,y6,y7,\
                              y8,y9,y10,y11,y12,y13,y14,y15,\
                              op1,op2,op3,op4,op5,op6,op7,op8,\
                              op9,op10,op11,op12) \
        op1; \
        PLUS(x0, x1); PLUS(x4, x5); \
        PLUS(x8, x9); PLUS(x12, x13); \
        PLUS(y0, y1); PLUS(y4, y5); \
        PLUS(y8, y9); PLUS(y12, y13); \
        op2; \
        XOR(x3, x0); XOR(x7, x4); \
        XOR(x11, x8); XOR(x15, x12); \
        XOR(y3, y0); XOR(y7, y4); \
        XOR(y11, y8); XOR(y15, y12); \
        op3; \
        ROTATE(x3, 16); ROTATE(x7, 16); \
        ROTATE(x11, 16); ROTATE(x15, 16); \
        ROTATE(y3, 16); ROTATE(y7, 16); \
        ROTATE(y11, 16); ROTATE(y15, 16); \
        op4; \
        PLUS(x2, x3); PLUS(x6, x7); \
        PLUS(x10, x11); PLUS(x14, x15); \
        PLUS(y2, y3); PLUS(y6, y7); \
        PLUS(y10, y11); PLUS(y14, y15); \
        op5; \
        XOR(x1, x2); XOR(x5, x6); \
        XOR(x9, x10); XOR(x13, x14); \
        XOR(y1, y2); XOR(y5, y6); \
        XOR(y9, y10); XOR(y13, y14); \
        op6; \
        ROTATE(x1,12); ROTATE(x5,12); \
        ROTATE(x9,12); ROTATE(x13,12); \
        ROTATE(y1,12); ROTATE(y5,12); \
        ROTATE(y9,12); ROTATE(y13,12); \
        op7; \
        PLUS(x0, x1); PLUS(x4, x5); \
        PLUS(x8, x9); PLUS(x12, x13); \
        PLUS(y0, y1); PLUS(y4, y5); \
        PLUS(y8, y9); PLUS(y12, y13); \
        op8; \
        XOR(x3, x0); XOR(x7, x4); \
        XOR(x11, x8); XOR(x15, x12); \
        XOR(y3, y0); XOR(y7, y4); \
        XOR(y11, y8); XOR(y15, y12); \
        op9; \
        ROTATE(x3,8); ROTATE(x7,8); \
        ROTATE(x11,8); ROTATE(x15,8); \
        ROTATE(y3,8); ROTATE(y7,8); \
        ROTATE(y11,8); ROTATE(y15,8); \
        op10; \
        PLUS(x2, x3); PLUS(x6, x7); \
        PLUS(x10, x11); PLUS(x14, x15); \
        PLUS(y2, y3); PLUS(y6, y7); \
        PLUS(y10, y11); PLUS(y14, y15); \
        op11; \
        XOR(x1, x2); XOR(x5, x6); \
        XOR(x9, x10); XOR(x13, x14); \
        XOR(y1, y2); XOR(y5, y6); \
        XOR(y9, y10); XOR(y13, y14); \
        op12; \
        ROTATE(x1,7); ROTATE(x5,7); \
        ROTATE(x9,7); ROTATE(x13,7); \
        ROTATE(y1,7); ROTATE(y5,7); \
        ROTATE(y9,7); ROTATE(y13,7);
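
/* Each invocation applies the ChaCha20 quarter round
 * (a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
 *  a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7)
 * to four (a,b,c,d) groups of the X block set and four of the Y block set,
 * i.e. to all eight blocks at once.  The op1..op12 slots let the caller
 * interleave unrelated instructions between the steps; they are inherited
 * from the libgcrypt original (where, presumably, Poly1305 work is slotted
 * in) and are left empty by the plain QUARTERROUND4_V8 wrapper below. */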

#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\
                         y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \
        QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
                              x8,x9,x10,x11,x12,x13,x14,x15,\
                              y0,y1,y2,y3,y4,y5,y6,y7,\
                              y8,y9,y10,y11,y12,y13,y14,y15,\
                              ,,,,,,,,,,,)

#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \
        vmrhf tmp0, v0, v1; \
        vmrhf tmp1, v2, v3; \
        vmrlf tmp2, v0, v1; \
        vmrlf v3, v2, v3; \
        vmrhf tmpa, va, vb; \
        vmrhf tmpb, vc, vd; \
        vmrlf tmpc, va, vb; \
        vmrlf vd, vc, vd; \
        vpdi v0, tmp0, tmp1, 0; \
        vpdi v1, tmp0, tmp1, 5; \
        vpdi v2, tmp2, v3, 0; \
        vpdi v3, tmp2, v3, 5; \
        vpdi va, tmpa, tmpb, 0; \
        vpdi vb, tmpa, tmpb, 5; \
        vpdi vc, tmpc, vd, 0; \
        vpdi vd, tmpc, vd, 5;
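
/* Transposes two independent 4x4 matrices of 32-bit words held in
 * (v0,v1,v2,v3) and (va,vb,vc,vd) in place, using vector merge high/low
 * word (vmrhf/vmrlf) followed by doubleword permutes (vpdi).  This turns
 * "one state word across four blocks" vectors into "four consecutive words
 * of one block" vectors before output. */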

.balign 8
.globl __chacha20_s390x_vx_blocks8
ENTRY (__chacha20_s390x_vx_blocks8)
        /* input:
         *      %r2: input
         *      %r3: dst
         *      %r4: src
         *      %r5: nblks (multiple of 8)
         */
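
        /* A roughly equivalent C prototype (names and types assumed, not
         * taken from the source):
         *   int __chacha20_s390x_vx_blocks8 (uint32_t *input, uint8_t *dst,
         *                                    const uint8_t *src, size_t nblks);
         * It XORs nblks 64-byte keystream blocks with src into dst,
         * advances the block counter in the state and returns 0. */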

        START_STACK(%r8);
        lgr NBLKS, %r5;

        larl %r7, .Lconsts;

        /* Load counter. */
        lg %r8, (12 * 4)(INPUT);
        rllg %r8, %r8, 32;
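
        /* The 64-bit load picks up state words 12 and 13; the rotate by 32
         * swaps them so that word 12 (the block counter) sits in the low
         * half of %r8, where the agfi in the loop advances it by 8 blocks
         * per iteration (any carry propagating into word 13). */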

        .balign 4
        /* Process eight chacha20 blocks per loop. */
.Lloop8:
        vlm Y0, Y3, 0(INPUT);

        slgfi NBLKS, 8;
        lghi ROUND, (20 / 2);

        /* Construct counter vectors X12/X13 & Y12/Y13. */
        vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7);
        vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7);
        vrepf Y12, Y3, 0;
        vrepf Y13, Y3, 1;
        vaccf X5, Y12, X4;
        vaccf Y5, Y12, Y4;
        vaf X12, Y12, X4;
        vaf Y12, Y12, Y4;
        vaf X13, Y13, X5;
        vaf Y13, Y13, Y5;
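
        /* Y3 elements 0/1 are state words 12/13.  vaccf computes the
         * carries of word 12 plus {0..3} and {4..7}, so X12/X13 and
         * Y12/Y13 end up holding per-lane counters for blocks 0-3 and 4-7
         * with any carry folded into word 13. */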

        vrepf X0, Y0, 0;
        vrepf X1, Y0, 1;
        vrepf X2, Y0, 2;
        vrepf X3, Y0, 3;
        vrepf X4, Y1, 0;
        vrepf X5, Y1, 1;
        vrepf X6, Y1, 2;
        vrepf X7, Y1, 3;
        vrepf X8, Y2, 0;
        vrepf X9, Y2, 1;
        vrepf X10, Y2, 2;
        vrepf X11, Y2, 3;
        vrepf X14, Y3, 2;
        vrepf X15, Y3, 3;
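
        /* Broadcast the remaining state words across the lanes, so that
         * lane n of every X vector belongs to block n and lane n of every
         * Y vector to block n + 4. */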

        /* Store counters for blocks 0-7. */
        vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
        vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);

        vlr Y0, X0;
        vlr Y1, X1;
        vlr Y2, X2;
        vlr Y3, X3;
        vlr Y4, X4;
        vlr Y5, X5;
        vlr Y6, X6;
        vlr Y7, X7;
        vlr Y8, X8;
        vlr Y9, X9;
        vlr Y10, X10;
        vlr Y11, X11;
        vlr Y14, X14;
        vlr Y15, X15;

        /* Update and store counter. */
        agfi %r8, 8;
        rllg %r5, %r8, 32;
        stg %r5, (12 * 4)(INPUT);

        .balign 4
.Lround2_8:
        QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13,
                         X2, X6, X10, X14, X3, X7, X11, X15,
                         Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
                         Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15);
        QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12,
                         X2, X7, X8, X13, X3, X4, X9, X14,
                         Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
                         Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14);
        brctg ROUND, .Lround2_8;
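
        /* Each pass applies one column round (first macro) and one
         * diagonal round (second macro) to all eight blocks; brctg
         * decrements ROUND (initialized to 20 / 2 = 10) and loops,
         * giving 20 rounds in total. */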

        /* Store blocks 4-7. */
        vstm Y0, Y15, STACK_Y0_Y15(%r15);

        /* Load counters for blocks 0-3. */
        vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);

        lghi ROUND, 1;
        j .Lfirst_output_4blks_8;

        .balign 4
.Lsecond_output_4blks_8:
        /* Load blocks 4-7. */
        vlm X0, X15, STACK_Y0_Y15(%r15);

        /* Load counters for blocks 4-7. */
        vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);

        lghi ROUND, 0;

        .balign 4
        /* Output four chacha20 blocks per loop. */
.Lfirst_output_4blks_8:
        vlm Y12, Y15, 0(INPUT);
        PLUS(X12, Y0);
        PLUS(X13, Y1);
        vrepf Y0, Y12, 0;
        vrepf Y1, Y12, 1;
        vrepf Y2, Y12, 2;
        vrepf Y3, Y12, 3;
        vrepf Y4, Y13, 0;
        vrepf Y5, Y13, 1;
        vrepf Y6, Y13, 2;
        vrepf Y7, Y13, 3;
        vrepf Y8, Y14, 0;
        vrepf Y9, Y14, 1;
        vrepf Y10, Y14, 2;
        vrepf Y11, Y14, 3;
        vrepf Y14, Y15, 2;
        vrepf Y15, Y15, 3;
        PLUS(X0, Y0);
        PLUS(X1, Y1);
        PLUS(X2, Y2);
        PLUS(X3, Y3);
        PLUS(X4, Y4);
        PLUS(X5, Y5);
        PLUS(X6, Y6);
        PLUS(X7, Y7);
        PLUS(X8, Y8);
        PLUS(X9, Y9);
        PLUS(X10, Y10);
        PLUS(X11, Y11);
        PLUS(X14, Y14);
        PLUS(X15, Y15);
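
        /* Feed-forward: add the original input state back into the working
         * state of these four blocks.  X12/X13 already received the
         * per-block counters from the saved CTR vectors; the remaining
         * words are broadcast from the state reloaded into Y12-Y15 and
         * added here. */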

        vl Y15, (.Lbswap32 - .Lconsts)(%r7);
        TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
                        Y9, Y10, Y11, Y12, Y13, Y14);
        TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
                        Y9, Y10, Y11, Y12, Y13, Y14);
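
        /* After the two transposes each X vector holds four consecutive
         * words of a single block; vperm with .Lbswap32 (loaded into Y15
         * above) then converts them to the little-endian byte order of the
         * ChaCha20 keystream. */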

        vlm Y0, Y14, 0(SRC);
        vperm X0, X0, X0, Y15;
        vperm X1, X1, X1, Y15;
        vperm X2, X2, X2, Y15;
        vperm X3, X3, X3, Y15;
        vperm X4, X4, X4, Y15;
        vperm X5, X5, X5, Y15;
        vperm X6, X6, X6, Y15;
        vperm X7, X7, X7, Y15;
        vperm X8, X8, X8, Y15;
        vperm X9, X9, X9, Y15;
        vperm X10, X10, X10, Y15;
        vperm X11, X11, X11, Y15;
        vperm X12, X12, X12, Y15;
        vperm X13, X13, X13, Y15;
        vperm X14, X14, X14, Y15;
        vperm X15, X15, X15, Y15;
        vl Y15, (15 * 16)(SRC);
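
        /* Y15 was occupied by the byte-swap pattern, so only 15 of the 16
         * source vectors could be loaded with vlm above; the last 16 bytes
         * of this 256-byte chunk are loaded here before the XORs. */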

        XOR(Y0, X0);
        XOR(Y1, X4);
        XOR(Y2, X8);
        XOR(Y3, X12);
        XOR(Y4, X1);
        XOR(Y5, X5);
        XOR(Y6, X9);
        XOR(Y7, X13);
        XOR(Y8, X2);
        XOR(Y9, X6);
        XOR(Y10, X10);
        XOR(Y11, X14);
        XOR(Y12, X3);
        XOR(Y13, X7);
        XOR(Y14, X11);
        XOR(Y15, X15);
        vstm Y0, Y15, 0(DST);

        aghi SRC, 256;
        aghi DST, 256;

        clgije ROUND, 1, .Lsecond_output_4blks_8;

        clgijhe NBLKS, 8, .Lloop8;


        END_STACK(%r8);
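        /* Clear %r2 so the routine returns 0. */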
        xgr %r2, %r2;
        br %r14;
END (__chacha20_s390x_vx_blocks8)

#endif /* HAVE_S390_VX_ASM_SUPPORT */