/* Optimized s390x implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher

   Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#ifdef HAVE_S390_VX_ASM_SUPPORT
/* CFA expressions are used for pointing CFA and registers to
 * SP relative offsets. */
# define DW_REGNO_SP 15

/* Fixed length encoding used for integers for now. */
# define DW_SLEB128_7BIT(value) \
	0x00|((value)&0x7f)
# define DW_SLEB128_28BIT(value) \
	0x80|((value)&0x7f), \
	0x80|(((value)>>7)&0x7f), \
	0x80|(((value)>>14)&0x7f), \
	0x00|(((value)>>21)&0x7f)
# define cfi_cfa_on_stack(rsp_offs,cfa_depth) \
	.cfi_escape \
	  0x0f, /* DW_CFA_def_cfa_expression */ \
	    DW_SLEB128_7BIT(11), /* length */ \
	    0x7f, /* DW_OP_breg15, rsp + constant */ \
	    DW_SLEB128_28BIT(rsp_offs), \
	    0x06, /* DW_OP_deref */ \
	    0x23, /* DW_OP_plus_constu */ \
	    DW_SLEB128_28BIT((cfa_depth)+160)
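
/* Decoded, the expression above computes
     CFA = *(uint64_t *) (%r15 + rsp_offs) + cfa_depth + 160,
   i.e. the caller's stack pointer is reloaded from the spill slot at
   rsp_offs(%r15) and biased by the 160-byte register save area that
   the s390x ABI reserves below the stack pointer, plus any extra
   depth requested.  */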
.text

.balign 16
.Lconsts:
.Lwordswap:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lbswap128:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap32:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Ladd_counter_0123:
	.long 0, 1, 2, 3
.Ladd_counter_4567:
	.long 4, 5, 6, 7
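
/* The three .byte tables are vperm masks (vperm numbers bytes 0-15
   from the most significant byte down on this big-endian target):
   .Lwordswap reverses the order of the four 32-bit words, .Lbswap128
   reverses all sixteen bytes, and .Lbswap32 byte-swaps each 32-bit
   word individually, converting between the little-endian ChaCha20
   stream format and the native lane order.  The two .long tables hold
   the per-block counter increments for blocks 0-3 and 4-7.  */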
/* register macros */
#define INPUT %r2
#define DST   %r3
#define SRC   %r4
#define NBLKS %r0
#define ROUND %r1

/* stack structure */

#define STACK_FRAME_STD    (8 * 16 + 8 * 4)
#define STACK_FRAME_F8_F15 (8 * 8)
#define STACK_FRAME_Y0_Y15 (16 * 16)
#define STACK_FRAME_CTR    (4 * 16)
#define STACK_FRAME_PARAMS (6 * 8)

#define STACK_MAX   (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \
		     STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \
		     STACK_FRAME_PARAMS)
#define STACK_F8     (STACK_MAX - STACK_FRAME_F8_F15)
#define STACK_F9     (STACK_F8 + 8)
#define STACK_F10    (STACK_F9 + 8)
#define STACK_F11    (STACK_F10 + 8)
#define STACK_F12    (STACK_F11 + 8)
#define STACK_F13    (STACK_F12 + 8)
#define STACK_F14    (STACK_F13 + 8)
#define STACK_F15    (STACK_F14 + 8)
#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15)
#define STACK_CTR    (STACK_Y0_Y15 - STACK_FRAME_CTR)
#define STACK_INPUT  (STACK_CTR - STACK_FRAME_PARAMS)
#define STACK_DST    (STACK_INPUT + 8)
#define STACK_SRC    (STACK_DST + 8)
#define STACK_NBLKS  (STACK_SRC + 8)
#define STACK_POCTX  (STACK_NBLKS + 8)
#define STACK_POSRC  (STACK_POCTX + 8)

#define STACK_G0_H3  STACK_Y0_Y15
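
/* Frame layout implied by the offsets above, from low to high
   addresses relative to the aligned %r15:
     0 .. STACK_FRAME_STD-1      ABI register save area / old-SP spill
     STACK_INPUT .. STACK_POSRC  six 8-byte parameter spill slots
     STACK_CTR                   four 16-byte counter vectors
     STACK_Y0_Y15                sixteen 16-byte vector spill slots
     STACK_F8 .. STACK_F15       callee-saved %f8-%f15
   STACK_G0_H3 aliases the vector spill area, apparently for the
   poly1305-combined code paths inherited from libgcrypt.  */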
/* vector registers */
/* X maps to %v0-%v15 and Y to %v16-%v31, as required by the
   consecutive vlm/vstm register ranges used below.  */
#define X0 %v0
#define X1 %v1
#define X2 %v2
#define X3 %v3
#define X4 %v4
#define X5 %v5
#define X6 %v6
#define X7 %v7
#define X8 %v8
#define X9 %v9
#define X10 %v10
#define X11 %v11
#define X12 %v12
#define X13 %v13
#define X14 %v14
#define X15 %v15
#define Y0 %v16
#define Y1 %v17
#define Y2 %v18
#define Y3 %v19
#define Y4 %v20
#define Y5 %v21
#define Y6 %v22
#define Y7 %v23
#define Y8 %v24
#define Y9 %v25
#define Y10 %v26
#define Y11 %v27
#define Y12 %v28
#define Y13 %v29
#define Y14 %v30
#define Y15 %v31
/**********************************************************************
  helper macros
 **********************************************************************/
#define START_STACK(last_r) \
	lgr %r0, %r15; \
	lghi %r1, ~15; \
	stmg %r6, last_r, 6 * 8(%r15); \
	aghi %r0, -STACK_MAX; \
	ngr %r0, %r1; \
	lgr %r1, %r15; \
	cfi_def_cfa_register(1); \
	lgr %r15, %r0; \
	stg %r1, 0(%r15); \
	cfi_cfa_on_stack(0, 0); \
	std %f8, STACK_F8(%r15); \
	std %f9, STACK_F9(%r15); \
	std %f10, STACK_F10(%r15); \
	std %f11, STACK_F11(%r15); \
	std %f12, STACK_F12(%r15); \
	std %f13, STACK_F13(%r15); \
	std %f14, STACK_F14(%r15); \
	std %f15, STACK_F15(%r15);
#define END_STACK(last_r) \
	lg %r1, 0(%r15); \
	ld %f8, STACK_F8(%r15); \
	ld %f9, STACK_F9(%r15); \
	ld %f10, STACK_F10(%r15); \
	ld %f11, STACK_F11(%r15); \
	ld %f12, STACK_F12(%r15); \
	ld %f13, STACK_F13(%r15); \
	ld %f14, STACK_F14(%r15); \
	ld %f15, STACK_F15(%r15); \
	lmg %r6, last_r, 6 * 8(%r1); \
	lgr %r15, %r1; \
	cfi_def_cfa_register(DW_REGNO_SP);
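
/* START_STACK aligns the stack pointer down to 16 bytes (the vector
   spill slots require it), keeps the caller's %r15 both in %r1 and in
   the spill slot at 0(%r15), and moves the CFA to %r1 during the
   switch so the unwind information stays correct at every
   instruction.  END_STACK undoes this by reloading the old %r15 from
   the spill slot.  */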
#define PLUS(dst,src) \
	vaf dst, dst, src;

#define XOR(dst,src) \
	vx dst, dst, src;

#define ROTATE(v1,c) \
	verllf v1, v1, (c)(0);

#define WORD_ROTATE(v1,s) \
	vsldb v1, v1, v1, ((s) * 4);
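
/* PLUS/XOR/ROTATE are the three ChaCha20 primitives on four 32-bit
   lanes at once: vaf adds word elements, vx is a full-vector xor, and
   verllf rotates each word element left by the immediate c.
   WORD_ROTATE uses vsldb (vector shift left double by byte) to rotate
   whole 32-bit words within a vector by s positions.  */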
/* Apply OPER to the eight register groups A..H with index I.  The
   A0..H3 aliases covering all 32 vector registers come from the
   original libgcrypt source.  */
#define DST_8(OPER, I, J) \
	OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \
	OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J);
/**********************************************************************
  round macros
 **********************************************************************/
/**********************************************************************
  8-way chacha20 ("vertical")
 **********************************************************************/
#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
			      x8,x9,x10,x11,x12,x13,x14,x15,\
			      y0,y1,y2,y3,y4,y5,y6,y7,\
			      y8,y9,y10,y11,y12,y13,y14,y15,\
			      op1,op2,op3,op4,op5,op6,op7,op8,\
			      op9,op10,op11,op12) \
	op1; \
	PLUS(x0, x1); PLUS(x4, x5); \
	PLUS(x8, x9); PLUS(x12, x13); \
	PLUS(y0, y1); PLUS(y4, y5); \
	PLUS(y8, y9); PLUS(y12, y13); \
	op2; \
	XOR(x3, x0); XOR(x7, x4); \
	XOR(x11, x8); XOR(x15, x12); \
	XOR(y3, y0); XOR(y7, y4); \
	XOR(y11, y8); XOR(y15, y12); \
	op3; \
	ROTATE(x3, 16); ROTATE(x7, 16); \
	ROTATE(x11, 16); ROTATE(x15, 16); \
	ROTATE(y3, 16); ROTATE(y7, 16); \
	ROTATE(y11, 16); ROTATE(y15, 16); \
	op4; \
	PLUS(x2, x3); PLUS(x6, x7); \
	PLUS(x10, x11); PLUS(x14, x15); \
	PLUS(y2, y3); PLUS(y6, y7); \
	PLUS(y10, y11); PLUS(y14, y15); \
	op5; \
	XOR(x1, x2); XOR(x5, x6); \
	XOR(x9, x10); XOR(x13, x14); \
	XOR(y1, y2); XOR(y5, y6); \
	XOR(y9, y10); XOR(y13, y14); \
	op6; \
	ROTATE(x1, 12); ROTATE(x5, 12); \
	ROTATE(x9, 12); ROTATE(x13, 12); \
	ROTATE(y1, 12); ROTATE(y5, 12); \
	ROTATE(y9, 12); ROTATE(y13, 12); \
	op7; \
	PLUS(x0, x1); PLUS(x4, x5); \
	PLUS(x8, x9); PLUS(x12, x13); \
	PLUS(y0, y1); PLUS(y4, y5); \
	PLUS(y8, y9); PLUS(y12, y13); \
	op8; \
	XOR(x3, x0); XOR(x7, x4); \
	XOR(x11, x8); XOR(x15, x12); \
	XOR(y3, y0); XOR(y7, y4); \
	XOR(y11, y8); XOR(y15, y12); \
	op9; \
	ROTATE(x3, 8); ROTATE(x7, 8); \
	ROTATE(x11, 8); ROTATE(x15, 8); \
	ROTATE(y3, 8); ROTATE(y7, 8); \
	ROTATE(y11, 8); ROTATE(y15, 8); \
	op10; \
	PLUS(x2, x3); PLUS(x6, x7); \
	PLUS(x10, x11); PLUS(x14, x15); \
	PLUS(y2, y3); PLUS(y6, y7); \
	PLUS(y10, y11); PLUS(y14, y15); \
	op11; \
	XOR(x1, x2); XOR(x5, x6); \
	XOR(x9, x10); XOR(x13, x14); \
	XOR(y1, y2); XOR(y5, y6); \
	XOR(y9, y10); XOR(y13, y14); \
	op12; \
	ROTATE(x1, 7); ROTATE(x5, 7); \
	ROTATE(x9, 7); ROTATE(x13, 7); \
	ROTATE(y1, 7); ROTATE(y5, 7); \
	ROTATE(y9, 7); ROTATE(y13, 7);
#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\
			 y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \
	QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
			      x8,x9,x10,x11,x12,x13,x14,x15,\
			      y0,y1,y2,y3,y4,y5,y6,y7,\
			      y8,y9,y10,y11,y12,y13,y14,y15,\
			      ,,,,,,,,,,,)
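
/* Reference quarter round for comparison (RFC 8439); each
   QUARTERROUND4_V8 invocation performs this on four column/diagonal
   groups across two independent four-block register sets, i.e. on
   eight blocks in parallel:
     a += b; d ^= a; d <<<= 16;
     c += d; b ^= c; b <<<= 12;
     a += b; d ^= a; d <<<= 8;
     c += d; b ^= c; b <<<= 7;
   The _POLY variant additionally accepts twelve instruction slots
   (op1..op12) so a caller can interleave poly1305 work between the
   vector groups; QUARTERROUND4_V8 leaves them all empty.  */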
#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \
	vmrhf tmp0, v0, v1; \
	vmrhf tmp1, v2, v3; \
	vmrlf tmp2, v0, v1; \
	vmrlf v3, v2, v3; \
	vmrhf tmpa, va, vb; \
	vmrhf tmpb, vc, vd; \
	vmrlf tmpc, va, vb; \
	vmrlf vd, vc, vd; \
	vpdi v0, tmp0, tmp1, 0; \
	vpdi v1, tmp0, tmp1, 5; \
	vpdi v2, tmp2, v3, 0; \
	vpdi v3, tmp2, v3, 5; \
	vpdi va, tmpa, tmpb, 0; \
	vpdi vb, tmpa, tmpb, 5; \
	vpdi vc, tmpc, vd, 0; \
	vpdi vd, tmpc, vd, 5;
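
/* Transposes two 4x4 word matrices held in (v0..v3) and (va..vd):
   vmrhf/vmrlf interleave the high/low word pairs of two rows, then
   vpdi with masks 0 and 5 selects the (high,high) and (low,low)
   doubleword halves, leaving each register holding one column of the
   original matrix.  This converts the "vertical" per-lane layout back
   into contiguous 64-byte keystream blocks for output.  */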
.globl __chacha20_s390x_vx_blocks8
ENTRY (__chacha20_s390x_vx_blocks8)
	/* input:
	 *	%r2: input
	 *	%r3: dst
	 *	%r4: src
	 *	%r5: nblks (multiple of 8)
	 */
	START_STACK(%r8);
	lgr NBLKS, %r5;

	larl %r7, .Lconsts;

	/* Load counter.  The 32-bit counter word sits in the high half
	   of the big-endian doubleword, so rotate it into the low half
	   for the 64-bit arithmetic below.  */
	lg %r8, (12 * 4)(INPUT);
	rllg %r8, %r8, 32;
	/* Process eight chacha20 blocks per loop.  */
.balign 4
.Lloop8:
	vlm Y0, Y3, 0(INPUT);

	slgfi NBLKS, 8;
	lghi ROUND, (20 / 2);

	/* Construct counter vectors X12/X13 & Y12/Y13.  */
	vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7);
	vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7);

	/* Store counters for blocks 0-7.  */
	vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
	vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);

	/* Update and store counter.  */
	agfi %r8, 8;
	rllg %r5, %r8, 32;
	stg %r5, (12 * 4)(INPUT);
.balign 4
.Lround2_8:
	QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13,
			 X2, X6, X10, X14, X3, X7, X11, X15,
			 Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
			 Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15);
	QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12,
			 X2, X7, X8, X13, X3, X4, X9, X14,
			 Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
			 Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14);
	brctg ROUND, .Lround2_8;
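
/* The first QUARTERROUND4_V8 call above is the column round (words
   0,4,8,12 / 1,5,9,13 / ...), the second the diagonal round (words
   0,5,10,15 / 1,6,11,12 / ...).  ROUND was initialized to 20/2
   because each trip through .Lround2_8 performs one double round,
   giving the full 20 ChaCha20 rounds after ten iterations.  */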
	/* Store blocks 4-7.  */
	vstm Y0, Y15, STACK_Y0_Y15(%r15);

	/* Load counters for blocks 0-3.  */
	vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);

	lghi ROUND, 1;
	j .Lfirst_output_4blks_8;

.balign 4
.Lsecond_output_4blks_8:
	/* Load blocks 4-7.  */
	vlm X0, X15, STACK_Y0_Y15(%r15);

	/* Load counters for blocks 4-7.  */
	vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);

	lghi ROUND, 0;

	/* Output four chacha20 blocks per loop.  */
.balign 4
.Lfirst_output_4blks_8:
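/* ROUND doubles as a pass flag here: it is 1 while blocks 0-3 are
   being written, so the clgije below comes back for blocks 4-7, and 0
   on the second pass, after which control falls through to the
   outer-loop test.  */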
	vlm Y12, Y15, 0(INPUT);

	vl Y15, (.Lbswap32 - .Lconsts)(%r7);
	TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
			Y9, Y10, Y11, Y12, Y13, Y14);
	TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
			Y9, Y10, Y11, Y12, Y13, Y14);
	vlm Y0, Y14, 0(SRC);

	/* Byte-swap the keystream words into little-endian stream
	   order (Y15 still holds the .Lbswap32 mask).  */
	vperm X0, X0, X0, Y15;
	vperm X1, X1, X1, Y15;
	vperm X2, X2, X2, Y15;
	vperm X3, X3, X3, Y15;
	vperm X4, X4, X4, Y15;
	vperm X5, X5, X5, Y15;
	vperm X6, X6, X6, Y15;
	vperm X7, X7, X7, Y15;
	vperm X8, X8, X8, Y15;
	vperm X9, X9, X9, Y15;
	vperm X10, X10, X10, Y15;
	vperm X11, X11, X11, Y15;
	vperm X12, X12, X12, Y15;
	vperm X13, X13, X13, Y15;
	vperm X14, X14, X14, Y15;
	vperm X15, X15, X15, Y15;
	vl Y15, (15 * 16)(SRC);

	/* XOR the source with the keystream and write it out.  */
	vx Y0, Y0, X0;
	vx Y1, Y1, X1;
	vx Y2, Y2, X2;
	vx Y3, Y3, X3;
	vx Y4, Y4, X4;
	vx Y5, Y5, X5;
	vx Y6, Y6, X6;
	vx Y7, Y7, X7;
	vx Y8, Y8, X8;
	vx Y9, Y9, X9;
	vx Y10, Y10, X10;
	vx Y11, Y11, X11;
	vx Y12, Y12, X12;
	vx Y13, Y13, X13;
	vx Y14, Y14, X14;
	vx Y15, Y15, X15;

	vstm Y0, Y15, 0(DST);
	aghi SRC, 256;
	aghi DST, 256;

	clgije ROUND, 1, .Lsecond_output_4blks_8;

	clgijhe NBLKS, 8, .Lloop8;

	END_STACK(%r8);
	xgr %r2, %r2;
	br %r14;
END (__chacha20_s390x_vx_blocks8)
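
/* A plausible C-level view of this entry point, inferred from the
   register comments above (the authoritative declaration lives in the
   accompanying C sources, not here):

     void __chacha20_s390x_vx_blocks8 (uint32_t state[16], uint8_t *dst,
				       const uint8_t *src, size_t nblks);

   The counter stored at state[12] is advanced by nblks before the
   routine returns; the xgr also clears %r2, so the original may
   instead return an unsigned zero.  */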
#endif /* HAVE_S390_VX_ASM_SUPPORT */