/* Optimized AArch64 implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

/* Based on D. J. Bernstein reference implementation at
   http://cr.yp.to/chacha.html:

   chacha-regs.c version 20080118
   D. J. Bernstein
   Public domain.  */

#include <sysdep.h>

/* Only LE is supported.  */
#ifdef __AARCH64EL__

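/* Materialize the address of a nearby symbol PC-relatively: adrp yields the
   symbol's 4 KiB-aligned page address and the :lo12: relocation on the add
   supplies the offset within that page.  */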
#define GET_DATA_POINTER(reg, name) \
        adrp reg, name ; \
        add reg, reg, :lo12:name

/* 'ret' instruction replacement for straight-line speculation mitigation.  */
#define ret_spec_stop \
        ret; dsb sy; isb;
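/* Note: the dsb sy; isb pair after the ret is never reached architecturally;
   its only purpose is to stop the core from speculatively executing the
   instructions that follow the return (straight-line speculation).  */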

        .cpu generic+simd

        .text

/* register macros */
#define INPUT     x0
#define DST       x1
#define SRC       x2
#define NBLKS     x3
#define ROUND     x4
#define INPUT_CTR x5
#define INPUT_POS x6
#define CTR       x7

/* vector registers */
#define X0 v16
#define X4 v17
#define X8 v18
#define X12 v19

#define X1 v20
#define X5 v21

#define X9 v22
#define X13 v23
#define X2 v24
#define X6 v25

#define X3 v26
#define X7 v27
#define X11 v28
#define X15 v29

#define X10 v30
#define X14 v31

#define VCTR    v0
#define VTMP0   v1
#define VTMP1   v2
#define VTMP2   v3
#define VTMP3   v4
#define X12_TMP v5
#define X13_TMP v6
#define ROT8    v7

/**********************************************************************
  helper macros
 **********************************************************************/

#define _(...) __VA_ARGS__

#define vpunpckldq(s1, s2, dst) \
        zip1 dst.4s, s2.4s, s1.4s;

#define vpunpckhdq(s1, s2, dst) \
        zip2 dst.4s, s2.4s, s1.4s;

#define vpunpcklqdq(s1, s2, dst) \
        zip1 dst.2d, s2.2d, s1.2d;

#define vpunpckhqdq(s1, s2, dst) \
        zip2 dst.2d, s2.2d, s1.2d;
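/* These helpers use x86-style punpck* names but are implemented with the
   AArch64 zip1/zip2 interleaves; note that the source operands are passed
   to zip1/zip2 in swapped order.  */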

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
        vpunpckhdq(x1, x0, t2); \
        vpunpckldq(x1, x0, x0); \
        \
        vpunpckldq(x3, x2, t1); \
        vpunpckhdq(x3, x2, x2); \
        \
        vpunpckhqdq(t1, x0, x1); \
        vpunpcklqdq(t1, x0, x0); \
        \
        vpunpckhqdq(x2, t2, x3); \
        vpunpcklqdq(x2, t2, x2);
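/* A sketch of the effect, treating {x0,x1,x2,x3} as rows of a 4x4 matrix of
   32-bit elements (lowest lane first):

     x0 = {a b c d}        x0 = {a e i m}
     x1 = {e f g h}  --->  x1 = {b f j n}
     x2 = {i j k l}        x2 = {c g k o}
     x3 = {m n o p}        x3 = {d h l p}

   t1 and t2 are clobbered as scratch; t3 is accepted but not used.  */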

/**********************************************************************
  4-way chacha20
 **********************************************************************/

#define XOR(d,s1,s2) \
        eor d.16b, s2.16b, s1.16b;

#define PLUS(ds,s) \
        add ds.4s, ds.4s, s.4s;

#define ROTATE4(dst1,dst2,dst3,dst4,c,src1,src2,src3,src4) \
        shl dst1.4s, src1.4s, #(c); \
        shl dst2.4s, src2.4s, #(c); \
        shl dst3.4s, src3.4s, #(c); \
        shl dst4.4s, src4.4s, #(c); \
        sri dst1.4s, src1.4s, #(32 - (c)); \
        sri dst2.4s, src2.4s, #(32 - (c)); \
        sri dst3.4s, src3.4s, #(32 - (c)); \
        sri dst4.4s, src4.4s, #(32 - (c));

#define ROTATE4_8(dst1,dst2,dst3,dst4,src1,src2,src3,src4) \
        tbl dst1.16b, {src1.16b}, ROT8.16b; \
        tbl dst2.16b, {src2.16b}, ROT8.16b; \
        tbl dst3.16b, {src3.16b}, ROT8.16b; \
        tbl dst4.16b, {src4.16b}, ROT8.16b;

#define ROTATE4_16(dst1,dst2,dst3,dst4,src1,src2,src3,src4) \
        rev32 dst1.8h, src1.8h; \
        rev32 dst2.8h, src2.8h; \
        rev32 dst3.8h, src3.8h; \
        rev32 dst4.8h, src4.8h;

#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4,ign,tmp1,tmp2,tmp3,tmp4) \
        PLUS(a1,b1); PLUS(a2,b2); \
        PLUS(a3,b3); PLUS(a4,b4); \
        XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
        XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); \
        ROTATE4_16(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4); \
        PLUS(c1,d1); PLUS(c2,d2); \
        PLUS(c3,d3); PLUS(c4,d4); \
        XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
        XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); \
        ROTATE4(b1, b2, b3, b4, 12, tmp1, tmp2, tmp3, tmp4) \
        PLUS(a1,b1); PLUS(a2,b2); \
        PLUS(a3,b3); PLUS(a4,b4); \
        XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
        XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); \
        ROTATE4_8(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4) \
        PLUS(c1,d1); PLUS(c2,d2); \
        PLUS(c3,d3); PLUS(c4,d4); \
        XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
        XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); \
        ROTATE4(b1, b2, b3, b4, 7, tmp1, tmp2, tmp3, tmp4) \

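/* For reference, one scalar ChaCha20 quarter round on words (a, b, c, d) is:

     a += b; d ^= a; d = rotl32(d, 16);
     c += d; b ^= c; b = rotl32(b, 12);
     a += b; d ^= a; d = rotl32(d,  8);
     c += d; b ^= c; b = rotl32(b,  7);

   QUARTERROUND4 applies this to four (a,b,c,d) quadruples at once, and each
   vector register holds that word for four independent blocks (one block per
   lane), so one invocation does 16 scalar quarter rounds' worth of work.
   The rotate by 16 is a rev32 on halfwords, the rotate by 8 is a tbl byte
   shuffle through ROT8, and the 12/7-bit rotates use shl+sri pairs.  */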
        .align 4
L(__chacha20_blocks4_data_inc_counter):
        .long 0,1,2,3

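/* Byte-select table for tbl: in each little-endian 32-bit lane, moving source
   bytes {3,0,1,2} into destination bytes {0,1,2,3} rotates the lane left by
   8 bits.  */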
        .align 4
L(__chacha20_blocks4_data_rot8):
        .byte 3,0,1,2
        .byte 7,4,5,6
        .byte 11,8,9,10
        .byte 15,12,13,14

        .hidden __chacha20_neon_blocks4
ENTRY (__chacha20_neon_blocks4)
        /* input:
         *      x0: input
         *      x1: dst
         *      x2: src
         *      x3: nblks (multiple of 4)
         */

        GET_DATA_POINTER(CTR, L(__chacha20_blocks4_data_rot8))
        add INPUT_CTR, INPUT, #(12*4);
        ld1 {ROT8.16b}, [CTR];
        GET_DATA_POINTER(CTR, L(__chacha20_blocks4_data_inc_counter))
        mov INPUT_POS, INPUT;
        ld1 {VCTR.16b}, [CTR];

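/* Each iteration of loop4 produces four 64-byte blocks: the 16 state words
   at INPUT are broadcast into X0-X15 (one word per register, one block per
   lane), the per-lane counter offsets {0,1,2,3} are added, the 20 rounds are
   run as 10 column/diagonal double rounds, the original state is added back,
   and the four blocks are transposed into row order and stored to DST.  */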
L(loop4):
        /* Construct counter vectors X12 and X13.  */
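        /* The low counter word is broadcast into X12 and {0,1,2,3} is added
           per lane; cmhi then flags any lane that wrapped around, and
           subtracting that all-ones flag bumps the corresponding lane of the
           high word in X13.  The stored counter itself is advanced by 4 with
           a single 64-bit ldr/add/str on CTR.  */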

        ld1 {X15.16b}, [INPUT_CTR];
        mov ROUND, #20;
        ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];

        dup X12.4s, X15.s[0];
        dup X13.4s, X15.s[1];
        ldr CTR, [INPUT_CTR];
        add X12.4s, X12.4s, VCTR.4s;
        dup X0.4s, VTMP1.s[0];
        dup X1.4s, VTMP1.s[1];
        dup X2.4s, VTMP1.s[2];
        dup X3.4s, VTMP1.s[3];
        dup X14.4s, X15.s[2];
        cmhi VTMP0.4s, VCTR.4s, X12.4s;
        dup X15.4s, X15.s[3];
        add CTR, CTR, #4; /* Update counter */
        dup X4.4s, VTMP2.s[0];
        dup X5.4s, VTMP2.s[1];
        dup X6.4s, VTMP2.s[2];
        dup X7.4s, VTMP2.s[3];
        sub X13.4s, X13.4s, VTMP0.4s;
        dup X8.4s, VTMP3.s[0];
        dup X9.4s, VTMP3.s[1];
        dup X10.4s, VTMP3.s[2];
        dup X11.4s, VTMP3.s[3];
        mov X12_TMP.16b, X12.16b;
        mov X13_TMP.16b, X13.16b;
        str CTR, [INPUT_CTR];

L(round2):
        subs ROUND, ROUND, #2;
        QUARTERROUND4(X0, X4,  X8, X12,   X1, X5,  X9, X13,
                      X2, X6, X10, X14,   X3, X7, X11, X15,
                      tmp:=,VTMP0,VTMP1,VTMP2,VTMP3)
        QUARTERROUND4(X0, X5, X10, X15,   X1, X6, X11, X12,
                      X2, X7,  X8, X13,   X3, X4,  X9, X14,
                      tmp:=,VTMP0,VTMP1,VTMP2,VTMP3)
        b.ne L(round2);

        ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;

        PLUS(X12, X12_TMP);        /* INPUT + 12 * 4 + counter */
        PLUS(X13, X13_TMP);        /* INPUT + 13 * 4 + counter */

        dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
        dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
        dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
        dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
        PLUS(X0, VTMP2);
        PLUS(X1, VTMP3);
        PLUS(X2, X12_TMP);
        PLUS(X3, X13_TMP);

        dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
        dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
        dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
        dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
        ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
        mov INPUT_POS, INPUT;
        PLUS(X4, VTMP2);
        PLUS(X5, VTMP3);
        PLUS(X6, X12_TMP);
        PLUS(X7, X13_TMP);

        dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
        dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
        dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
        dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
        dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
        dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
        PLUS(X8, VTMP2);
        PLUS(X9, VTMP3);
        PLUS(X10, X12_TMP);
        PLUS(X11, X13_TMP);
        PLUS(X14, VTMP0);
        PLUS(X15, VTMP1);

        transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
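        /* After the transposes each block is contiguous: block 0 in
           X0,X4,X8,X12, block 1 in X1,X5,X9,X13, block 2 in X2,X6,X10,X14
           and block 3 in X3,X7,X11,X15.  The stores below emit them in that
           order; the register numbering at the top of the file appears to be
           chosen so that each store group maps to consecutively numbered v
           registers, as multi-register st1 requires.  */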

        subs NBLKS, NBLKS, #4;

        st1 {X0.16b,X4.16b,X8.16b,X12.16b}, [DST], #64;
        st1 {X1.16b,X5.16b}, [DST], #32;
        st1 {X9.16b,X13.16b,X2.16b,X6.16b}, [DST], #64;
        st1 {X10.16b,X14.16b}, [DST], #32;
        st1 {X3.16b,X7.16b,X11.16b,X15.16b}, [DST], #64;

        b.ne L(loop4);

        ret_spec_stop
END (__chacha20_neon_blocks4)

#endif