#! /usr/bin/env perl
# Author: Min Zhou <zhoumin@loongson.cn>
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $code;

# Here is the scalar register layout for LoongArch.
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$r$_",(23..31));

# The saved floating-point registers in the LP64D ABI. In LoongArch
# with vector extension, the low 64 bits of a vector register alias with
# the corresponding FPR. So we must save and restore the corresponding
# FPR if we'll write into a vector register. The ABI only requires
# saving and restoring the FPR (i.e. 64 bits of the corresponding vector
# register), not the entire vector register.
my ($fs0,$fs1,$fs2,$fs3,$fs4,$fs5,$fs6,$fs7)=map("\$f$_",(24..31));

# Here is the 128-bit vector register layout for LSX extension.
my ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,
	$vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19,
	$vr20,$vr21,$vr22,$vr23,$vr24,$vr25,$vr26,$vr27,$vr28,
	$vr29,$vr30,$vr31)=map("\$vr$_",(0..31));

# Here is the 256-bit vector register layout for LASX extension.
my ($xr0,$xr1,$xr2,$xr3,$xr4,$xr5,$xr6,$xr7,$xr8,$xr9,$xr10,
	$xr11,$xr12,$xr13,$xr14,$xr15,$xr16,$xr17,$xr18,$xr19,
	$xr20,$xr21,$xr22,$xr23,$xr24,$xr25,$xr26,$xr27,$xr28,
	$xr29,$xr30,$xr31)=map("\$xr$_",(0..31));

my $output;
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";

# Input parameter block
my ($out, $inp, $len, $key, $counter) = ($a0, $a1, $a2, $a3, $a4);
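# For reference, the generated routine provides the ChaCha20_ctr32() entry
# point used by OpenSSL's other chacha-*.pl modules; the prototype assumed
# here is:
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);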

$code .= <<EOF;
#include "loongarch_arch.h"

.text

.extern OPENSSL_loongarch_hwcap_P

.align 6
.Lsigma:
.ascii "expand 32-byte k"
.Linc8x:
.long 0,1,2,3,4,5,6,7
.Linc4x:
.long 0,1,2,3
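# .Lsigma is the ChaCha constant "expand 32-byte k"; .Linc8x and .Linc4x are
# the per-lane offsets added to the replicated block counter in the 8x (LASX)
# and 4x (LSX) code paths below.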

.globl ChaCha20_ctr32
.type ChaCha20_ctr32 function

.align 6
ChaCha20_ctr32:
	# $a0 = arg #1 (out pointer)
	# $a1 = arg #2 (inp pointer)
	# $a2 = arg #3 (len)
	# $a3 = arg #4 (key array)
	# $a4 = arg #5 (counter array)

	beqz $len,.Lno_data
	ori $t3,$zero,64
	la.pcrel $t0,OPENSSL_loongarch_hwcap_P
	ld.w $t0,$t0,0

	bleu $len,$t3,.LChaCha20_1x # goto 1x when len <= 64
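	# Dispatch on the CPU features reported in OPENSSL_loongarch_hwcap_P:
	# prefer the 256-bit LASX path, then the 128-bit LSX path, and fall
	# back to the scalar path otherwise.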

	andi $t0,$t0,LOONGARCH_HWCAP_LASX | LOONGARCH_HWCAP_LSX
	beqz $t0,.LChaCha20_1x

	addi.d $sp,$sp,-64
	fst.d $fs0,$sp,0
	fst.d $fs1,$sp,8
	fst.d $fs2,$sp,16
	fst.d $fs3,$sp,24
	fst.d $fs4,$sp,32
	fst.d $fs5,$sp,40
	fst.d $fs6,$sp,48
	fst.d $fs7,$sp,56

	andi $t1,$t0,LOONGARCH_HWCAP_LASX
	bnez $t1,.LChaCha20_8x

	b .LChaCha20_4x

EOF

########################################################################
# Scalar code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly
my @x = ($t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7,
	$s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7);

sub ROUND {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
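	# Each map above advances every index within its group of four, so
	# ROUND(0, 4, 8, 12) expands to the four column quarter-rounds and
	# ROUND(0, 5, 10, 15) to the four diagonal quarter-rounds, interleaved
	# in pairs below.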

$code .= <<EOF;
	add.w @x[$a0],@x[$a0],@x[$b0]
	xor @x[$d0],@x[$d0],@x[$a0]
	rotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits
	add.w @x[$a1],@x[$a1],@x[$b1]
	xor @x[$d1],@x[$d1],@x[$a1]
	rotri.w @x[$d1],@x[$d1],16

	add.w @x[$c0],@x[$c0],@x[$d0]
	xor @x[$b0],@x[$b0],@x[$c0]
	rotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits
	add.w @x[$c1],@x[$c1],@x[$d1]
	xor @x[$b1],@x[$b1],@x[$c1]
	rotri.w @x[$b1],@x[$b1],20

	add.w @x[$a0],@x[$a0],@x[$b0]
	xor @x[$d0],@x[$d0],@x[$a0]
	rotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits
	add.w @x[$a1],@x[$a1],@x[$b1]
	xor @x[$d1],@x[$d1],@x[$a1]
	rotri.w @x[$d1],@x[$d1],24

	add.w @x[$c0],@x[$c0],@x[$d0]
	xor @x[$b0],@x[$b0],@x[$c0]
	rotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits
	add.w @x[$c1],@x[$c1],@x[$d1]
	xor @x[$b1],@x[$b1],@x[$c1]
	rotri.w @x[$b1],@x[$b1],25

	add.w @x[$a2],@x[$a2],@x[$b2]
	xor @x[$d2],@x[$d2],@x[$a2]
	rotri.w @x[$d2],@x[$d2],16
	add.w @x[$a3],@x[$a3],@x[$b3]
	xor @x[$d3],@x[$d3],@x[$a3]
	rotri.w @x[$d3],@x[$d3],16

	add.w @x[$c2],@x[$c2],@x[$d2]
	xor @x[$b2],@x[$b2],@x[$c2]
	rotri.w @x[$b2],@x[$b2],20
	add.w @x[$c3],@x[$c3],@x[$d3]
	xor @x[$b3],@x[$b3],@x[$c3]
	rotri.w @x[$b3],@x[$b3],20

	add.w @x[$a2],@x[$a2],@x[$b2]
	xor @x[$d2],@x[$d2],@x[$a2]
	rotri.w @x[$d2],@x[$d2],24
	add.w @x[$a3],@x[$a3],@x[$b3]
	xor @x[$d3],@x[$d3],@x[$a3]
	rotri.w @x[$d3],@x[$d3],24

	add.w @x[$c2],@x[$c2],@x[$d2]
	xor @x[$b2],@x[$b2],@x[$c2]
	rotri.w @x[$b2],@x[$b2],25
	add.w @x[$c3],@x[$c3],@x[$d3]
	xor @x[$b3],@x[$b3],@x[$c3]
	rotri.w @x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_1x:
	addi.d $sp,$sp,-256
	st.d $s0,$sp,0
	st.d $s1,$sp,8
	st.d $s2,$sp,16
	st.d $s3,$sp,24
	st.d $s4,$sp,32
	st.d $s5,$sp,40
	st.d $s6,$sp,48
	st.d $s7,$sp,56
	st.d $s8,$sp,64
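	# Bytes 72..135 of this frame are used later as scratch space for one
	# 64-byte keystream block when a partial tail is handled (.Ltail_1x).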

	# Save the initial block counter in $s8
	ld.w $s8,$counter,0
	b .Loop_outer_1x

.align 5
.Loop_outer_1x:
	# Load constants
	la.local $t8,.Lsigma
	ld.w @x[0],$t8,0 # 'expa'
	ld.w @x[1],$t8,4 # 'nd 3'
	ld.w @x[2],$t8,8 # '2-by'
	ld.w @x[3],$t8,12 # 'te k'

	# Load key
	ld.w @x[4],$key,4*0
	ld.w @x[5],$key,4*1
	ld.w @x[6],$key,4*2
	ld.w @x[7],$key,4*3
	ld.w @x[8],$key,4*4
	ld.w @x[9],$key,4*5
	ld.w @x[10],$key,4*6
	ld.w @x[11],$key,4*7

	# Load block counter
	move @x[12],$s8

	# Load nonce
	ld.w @x[13],$counter,4*1
	ld.w @x[14],$counter,4*2
	ld.w @x[15],$counter,4*3

	# Update states in \@x[*] for 20 rounds
	ori $t8,$zero,10
	b .Loop_1x

.align 5
.Loop_1x:
EOF

&ROUND (0, 4, 8, 12);
&ROUND (0, 5, 10, 15);
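# Two ROUND calls form one double round (a column round followed by a diagonal
# round); .Loop_1x runs it 10 times for the full 20 ChaCha rounds.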

$code .= <<EOF;
	addi.w $t8,$t8,-1
	bnez $t8,.Loop_1x

	# Get the final states by adding the initial states
	la.local $t8,.Lsigma
	ld.w $a7,$t8,4*0
	ld.w $a6,$t8,4*1
	ld.w $a5,$t8,4*2
	add.w @x[0],@x[0],$a7
	add.w @x[1],@x[1],$a6
	add.w @x[2],@x[2],$a5
	ld.w $a7,$t8,4*3
	add.w @x[3],@x[3],$a7

	ld.w $t8,$key,4*0
	ld.w $a7,$key,4*1
	ld.w $a6,$key,4*2
	ld.w $a5,$key,4*3
	add.w @x[4],@x[4],$t8
	add.w @x[5],@x[5],$a7
	add.w @x[6],@x[6],$a6
	add.w @x[7],@x[7],$a5

	ld.w $t8,$key,4*4
	ld.w $a7,$key,4*5
	ld.w $a6,$key,4*6
	ld.w $a5,$key,4*7
	add.w @x[8],@x[8],$t8
	add.w @x[9],@x[9],$a7
	add.w @x[10],@x[10],$a6
	add.w @x[11],@x[11],$a5

	add.w @x[12],@x[12],$s8

	ld.w $t8,$counter,4*1
	ld.w $a7,$counter,4*2
	ld.w $a6,$counter,4*3
	add.w @x[13],@x[13],$t8
	add.w @x[14],@x[14],$a7
	add.w @x[15],@x[15],$a6

	ori $t8,$zero,64
	bltu $len,$t8,.Ltail_1x

	# Get the encrypted message by xoring the states with the plaintext
	ld.w $t8,$inp,4*0
	ld.w $a7,$inp,4*1
	ld.w $a6,$inp,4*2
	ld.w $a5,$inp,4*3
	xor $t8,$t8,@x[0]
	xor $a7,$a7,@x[1]
	xor $a6,$a6,@x[2]
	xor $a5,$a5,@x[3]
	st.w $t8,$out,4*0
	st.w $a7,$out,4*1
	st.w $a6,$out,4*2
	st.w $a5,$out,4*3

	ld.w $t8,$inp,4*4
	ld.w $a7,$inp,4*5
	ld.w $a6,$inp,4*6
	ld.w $a5,$inp,4*7
	xor $t8,$t8,@x[4]
	xor $a7,$a7,@x[5]
	xor $a6,$a6,@x[6]
	xor $a5,$a5,@x[7]
	st.w $t8,$out,4*4
	st.w $a7,$out,4*5
	st.w $a6,$out,4*6
	st.w $a5,$out,4*7

	ld.w $t8,$inp,4*8
	ld.w $a7,$inp,4*9
	ld.w $a6,$inp,4*10
	ld.w $a5,$inp,4*11
	xor $t8,$t8,@x[8]
	xor $a7,$a7,@x[9]
	xor $a6,$a6,@x[10]
	xor $a5,$a5,@x[11]
	st.w $t8,$out,4*8
	st.w $a7,$out,4*9
	st.w $a6,$out,4*10
	st.w $a5,$out,4*11

	ld.w $t8,$inp,4*12
	ld.w $a7,$inp,4*13
	ld.w $a6,$inp,4*14
	ld.w $a5,$inp,4*15
	xor $t8,$t8,@x[12]
	xor $a7,$a7,@x[13]
	xor $a6,$a6,@x[14]
	xor $a5,$a5,@x[15]
	st.w $t8,$out,4*12
	st.w $a7,$out,4*13
	st.w $a6,$out,4*14
	st.w $a5,$out,4*15

	addi.d $len,$len,-64
	beqz $len,.Ldone_1x
	addi.d $inp,$inp,64
	addi.d $out,$out,64
	addi.w $s8,$s8,1
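	# Only the 32-bit counter word is incremented (ctr32 convention); a
	# carry out of it is not propagated into the nonce.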
	b .Loop_outer_1x

.align 4
.Ltail_1x:
	# Handle the tail for 1x (1 <= tail_len <= 63)
	addi.d $a7,$sp,72
	st.w @x[0],$a7,4*0
	st.w @x[1],$a7,4*1
	st.w @x[2],$a7,4*2
	st.w @x[3],$a7,4*3
	st.w @x[4],$a7,4*4
	st.w @x[5],$a7,4*5
	st.w @x[6],$a7,4*6
	st.w @x[7],$a7,4*7
	st.w @x[8],$a7,4*8
	st.w @x[9],$a7,4*9
	st.w @x[10],$a7,4*10
	st.w @x[11],$a7,4*11
	st.w @x[12],$a7,4*12
	st.w @x[13],$a7,4*13
	st.w @x[14],$a7,4*14
	st.w @x[15],$a7,4*15

	move $t8,$zero

.Loop_tail_1x:
	# Xor input with states byte by byte
	ldx.bu $a6,$inp,$t8
	ldx.bu $a5,$a7,$t8
	xor $a6,$a6,$a5
	stx.b $a6,$out,$t8
	addi.w $t8,$t8,1
	addi.d $len,$len,-1
	bnez $len,.Loop_tail_1x
	b .Ldone_1x

.Ldone_1x:
	ld.d $s0,$sp,0
	ld.d $s1,$sp,8
	ld.d $s2,$sp,16
	ld.d $s3,$sp,24
	ld.d $s4,$sp,32
	ld.d $s5,$sp,40
	ld.d $s6,$sp,48
	ld.d $s7,$sp,56
	ld.d $s8,$sp,64
	addi.d $sp,$sp,256

	b .Lend

EOF
}

########################################################################
# 128-bit LSX code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly.
my @x = ($vr0, $vr1, $vr2, $vr3, $vr4, $vr5, $vr6, $vr7,
	$vr8, $vr9, $vr10, $vr11, $vr12, $vr13, $vr14, $vr15);

# Save the initial states in array @y[*]
my @y = ($vr16, $vr17, $vr18, $vr19, $vr20, $vr21, $vr22, $vr23,
	$vr24, $vr25, $vr26, $vr27, $vr28, $vr29, $vr30, $vr31);

sub ROUND_4x {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

$code .= <<EOF;
	vadd.w @x[$a0],@x[$a0],@x[$b0]
	vxor.v @x[$d0],@x[$d0],@x[$a0]
	vrotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits
	vadd.w @x[$a1],@x[$a1],@x[$b1]
	vxor.v @x[$d1],@x[$d1],@x[$a1]
	vrotri.w @x[$d1],@x[$d1],16

	vadd.w @x[$c0],@x[$c0],@x[$d0]
	vxor.v @x[$b0],@x[$b0],@x[$c0]
	vrotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits
	vadd.w @x[$c1],@x[$c1],@x[$d1]
	vxor.v @x[$b1],@x[$b1],@x[$c1]
	vrotri.w @x[$b1],@x[$b1],20

	vadd.w @x[$a0],@x[$a0],@x[$b0]
	vxor.v @x[$d0],@x[$d0],@x[$a0]
	vrotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits
	vadd.w @x[$a1],@x[$a1],@x[$b1]
	vxor.v @x[$d1],@x[$d1],@x[$a1]
	vrotri.w @x[$d1],@x[$d1],24

	vadd.w @x[$c0],@x[$c0],@x[$d0]
	vxor.v @x[$b0],@x[$b0],@x[$c0]
	vrotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits
	vadd.w @x[$c1],@x[$c1],@x[$d1]
	vxor.v @x[$b1],@x[$b1],@x[$c1]
	vrotri.w @x[$b1],@x[$b1],25

	vadd.w @x[$a2],@x[$a2],@x[$b2]
	vxor.v @x[$d2],@x[$d2],@x[$a2]
	vrotri.w @x[$d2],@x[$d2],16
	vadd.w @x[$a3],@x[$a3],@x[$b3]
	vxor.v @x[$d3],@x[$d3],@x[$a3]
	vrotri.w @x[$d3],@x[$d3],16

	vadd.w @x[$c2],@x[$c2],@x[$d2]
	vxor.v @x[$b2],@x[$b2],@x[$c2]
	vrotri.w @x[$b2],@x[$b2],20
	vadd.w @x[$c3],@x[$c3],@x[$d3]
	vxor.v @x[$b3],@x[$b3],@x[$c3]
	vrotri.w @x[$b3],@x[$b3],20

	vadd.w @x[$a2],@x[$a2],@x[$b2]
	vxor.v @x[$d2],@x[$d2],@x[$a2]
	vrotri.w @x[$d2],@x[$d2],24
	vadd.w @x[$a3],@x[$a3],@x[$b3]
	vxor.v @x[$d3],@x[$d3],@x[$a3]
	vrotri.w @x[$d3],@x[$d3],24

	vadd.w @x[$c2],@x[$c2],@x[$d2]
	vxor.v @x[$b2],@x[$b2],@x[$c2]
	vrotri.w @x[$b2],@x[$b2],25
	vadd.w @x[$c3],@x[$c3],@x[$d3]
	vxor.v @x[$b3],@x[$b3],@x[$c3]
	vrotri.w @x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_4x:
	addi.d $sp,$sp,-128

	# Save the initial block counter in $t4
	ld.w $t4,$counter,0
	b .Loop_outer_4x

.align 5
.Loop_outer_4x:
	# Load constant
	la.local $t8,.Lsigma
	vldrepl.w @x[0],$t8,4*0 # 'expa'
	vldrepl.w @x[1],$t8,4*1 # 'nd 3'
	vldrepl.w @x[2],$t8,4*2 # '2-by'
	vldrepl.w @x[3],$t8,4*3 # 'te k'

	# Load key
	vldrepl.w @x[4],$key,4*0
	vldrepl.w @x[5],$key,4*1
	vldrepl.w @x[6],$key,4*2
	vldrepl.w @x[7],$key,4*3
	vldrepl.w @x[8],$key,4*4
	vldrepl.w @x[9],$key,4*5
	vldrepl.w @x[10],$key,4*6
	vldrepl.w @x[11],$key,4*7

	# Load block counter
	vreplgr2vr.w @x[12],$t4

	# Load nonce
	vldrepl.w @x[13],$counter,4*1
	vldrepl.w @x[14],$counter,4*2
	vldrepl.w @x[15],$counter,4*3

	# Get the correct block counter for each block
	la.local $t8,.Linc4x
	vld @y[0],$t8,0
	vadd.w @x[12],@x[12],@y[0]

	# Copy the initial states from \@x[*] to \@y[*]
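	# (vori.b with a zero immediate is simply a vector register move; the
	# 8x path below uses xvori.b the same way)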
	vori.b @y[0],@x[0],0
	vori.b @y[1],@x[1],0
	vori.b @y[2],@x[2],0
	vori.b @y[3],@x[3],0
	vori.b @y[4],@x[4],0
	vori.b @y[5],@x[5],0
	vori.b @y[6],@x[6],0
	vori.b @y[7],@x[7],0
	vori.b @y[8],@x[8],0
	vori.b @y[9],@x[9],0
	vori.b @y[10],@x[10],0
	vori.b @y[11],@x[11],0
	vori.b @y[12],@x[12],0
	vori.b @y[13],@x[13],0
	vori.b @y[14],@x[14],0
	vori.b @y[15],@x[15],0

	# Update states in \@x[*] for 20 rounds
	ori $t8,$zero,10
	b .Loop_4x

.align 5
.Loop_4x:
EOF

&ROUND_4x (0, 4, 8, 12);
&ROUND_4x (0, 5, 10, 15);

$code .= <<EOF;
	addi.w $t8,$t8,-1
	bnez $t8,.Loop_4x

	# Get the final states by adding the initial states
	vadd.w @x[0],@x[0],@y[0]
	vadd.w @x[1],@x[1],@y[1]
	vadd.w @x[2],@x[2],@y[2]
	vadd.w @x[3],@x[3],@y[3]
	vadd.w @x[4],@x[4],@y[4]
	vadd.w @x[5],@x[5],@y[5]
	vadd.w @x[6],@x[6],@y[6]
	vadd.w @x[7],@x[7],@y[7]
	vadd.w @x[8],@x[8],@y[8]
	vadd.w @x[9],@x[9],@y[9]
	vadd.w @x[10],@x[10],@y[10]
	vadd.w @x[11],@x[11],@y[11]
	vadd.w @x[12],@x[12],@y[12]
	vadd.w @x[13],@x[13],@y[13]
	vadd.w @x[14],@x[14],@y[14]
	vadd.w @x[15],@x[15],@y[15]

	# Get the transpose of \@x[*] and save them in \@x[*]
	vilvl.w @y[0],@x[1],@x[0]
	vilvh.w @y[1],@x[1],@x[0]
	vilvl.w @y[2],@x[3],@x[2]
	vilvh.w @y[3],@x[3],@x[2]
	vilvl.w @y[4],@x[5],@x[4]
	vilvh.w @y[5],@x[5],@x[4]
	vilvl.w @y[6],@x[7],@x[6]
	vilvh.w @y[7],@x[7],@x[6]
	vilvl.w @y[8],@x[9],@x[8]
	vilvh.w @y[9],@x[9],@x[8]
	vilvl.w @y[10],@x[11],@x[10]
	vilvh.w @y[11],@x[11],@x[10]
	vilvl.w @y[12],@x[13],@x[12]
	vilvh.w @y[13],@x[13],@x[12]
	vilvl.w @y[14],@x[15],@x[14]
	vilvh.w @y[15],@x[15],@x[14]

	vilvl.d @x[0],@y[2],@y[0]
	vilvh.d @x[1],@y[2],@y[0]
	vilvl.d @x[2],@y[3],@y[1]
	vilvh.d @x[3],@y[3],@y[1]
	vilvl.d @x[4],@y[6],@y[4]
	vilvh.d @x[5],@y[6],@y[4]
	vilvl.d @x[6],@y[7],@y[5]
	vilvh.d @x[7],@y[7],@y[5]
	vilvl.d @x[8],@y[10],@y[8]
	vilvh.d @x[9],@y[10],@y[8]
	vilvl.d @x[10],@y[11],@y[9]
	vilvh.d @x[11],@y[11],@y[9]
	vilvl.d @x[12],@y[14],@y[12]
	vilvh.d @x[13],@y[14],@y[12]
	vilvl.d @x[14],@y[15],@y[13]
	vilvh.d @x[15],@y[15],@y[13]
EOF

# Adjust the order of elements in @x[*] for ease of use.
@x = (@x[0],@x[4],@x[8],@x[12],@x[1],@x[5],@x[9],@x[13],
	@x[2],@x[6],@x[10],@x[14],@x[3],@x[7],@x[11],@x[15]);
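# After the transpose and this reordering, @x[0..3], @x[4..7], @x[8..11] and
# @x[12..15] each hold one complete 64-byte keystream block as contiguous bytes.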

$code .= <<EOF;
	ori $t8,$zero,64*4
	bltu $len,$t8,.Ltail_4x

	# Get the encrypted message by xoring the states with the plaintext
	vld @y[0],$inp,16*0
	vld @y[1],$inp,16*1
	vld @y[2],$inp,16*2
	vld @y[3],$inp,16*3
	vxor.v @y[0],@y[0],@x[0]
	vxor.v @y[1],@y[1],@x[1]
	vxor.v @y[2],@y[2],@x[2]
	vxor.v @y[3],@y[3],@x[3]
	vst @y[0],$out,16*0
	vst @y[1],$out,16*1
	vst @y[2],$out,16*2
	vst @y[3],$out,16*3

	vld @y[0],$inp,16*4
	vld @y[1],$inp,16*5
	vld @y[2],$inp,16*6
	vld @y[3],$inp,16*7
	vxor.v @y[0],@y[0],@x[4]
	vxor.v @y[1],@y[1],@x[5]
	vxor.v @y[2],@y[2],@x[6]
	vxor.v @y[3],@y[3],@x[7]
	vst @y[0],$out,16*4
	vst @y[1],$out,16*5
	vst @y[2],$out,16*6
	vst @y[3],$out,16*7

	vld @y[0],$inp,16*8
	vld @y[1],$inp,16*9
	vld @y[2],$inp,16*10
	vld @y[3],$inp,16*11
	vxor.v @y[0],@y[0],@x[8]
	vxor.v @y[1],@y[1],@x[9]
	vxor.v @y[2],@y[2],@x[10]
	vxor.v @y[3],@y[3],@x[11]
	vst @y[0],$out,16*8
	vst @y[1],$out,16*9
	vst @y[2],$out,16*10
	vst @y[3],$out,16*11

	vld @y[0],$inp,16*12
	vld @y[1],$inp,16*13
	vld @y[2],$inp,16*14
	vld @y[3],$inp,16*15
	vxor.v @y[0],@y[0],@x[12]
	vxor.v @y[1],@y[1],@x[13]
	vxor.v @y[2],@y[2],@x[14]
	vxor.v @y[3],@y[3],@x[15]
	vst @y[0],$out,16*12
	vst @y[1],$out,16*13
	vst @y[2],$out,16*14
	vst @y[3],$out,16*15

	addi.d $len,$len,-64*4
	beqz $len,.Ldone_4x
	addi.d $inp,$inp,64*4
	addi.d $out,$out,64*4
	addi.w $t4,$t4,4
	b .Loop_outer_4x

.Ltail_4x:
	# Handle the tail for 4x (1 <= tail_len <= 255)
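	# Whole 64-byte blocks in the tail are xored with vld/vst below; the
	# keystream for the final partial block is spilled to the stack and
	# xored byte by byte in .Loop_tail_4x.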
	ori $t8,$zero,192
	bgeu $len,$t8,.L192_or_more4x
	ori $t8,$zero,128
	bgeu $len,$t8,.L128_or_more4x
	ori $t8,$zero,64
	bgeu $len,$t8,.L64_or_more4x

	vst @x[0],$sp,16*0
	vst @x[1],$sp,16*1
	vst @x[2],$sp,16*2
	vst @x[3],$sp,16*3
	move $t8,$zero
	b .Loop_tail_4x

.align 5
.L64_or_more4x:
	vld @y[0],$inp,16*0
	vld @y[1],$inp,16*1
	vld @y[2],$inp,16*2
	vld @y[3],$inp,16*3
	vxor.v @y[0],@y[0],@x[0]
	vxor.v @y[1],@y[1],@x[1]
	vxor.v @y[2],@y[2],@x[2]
	vxor.v @y[3],@y[3],@x[3]
	vst @y[0],$out,16*0
	vst @y[1],$out,16*1
	vst @y[2],$out,16*2
	vst @y[3],$out,16*3

	addi.d $len,$len,-64
	beqz $len,.Ldone_4x
	addi.d $inp,$inp,64
	addi.d $out,$out,64
	vst @x[4],$sp,16*0
	vst @x[5],$sp,16*1
	vst @x[6],$sp,16*2
	vst @x[7],$sp,16*3
	move $t8,$zero
	b .Loop_tail_4x

.align 5
.L128_or_more4x:
	vld @y[0],$inp,16*0
	vld @y[1],$inp,16*1
	vld @y[2],$inp,16*2
	vld @y[3],$inp,16*3
	vxor.v @y[0],@y[0],@x[0]
	vxor.v @y[1],@y[1],@x[1]
	vxor.v @y[2],@y[2],@x[2]
	vxor.v @y[3],@y[3],@x[3]
	vst @y[0],$out,16*0
	vst @y[1],$out,16*1
	vst @y[2],$out,16*2
	vst @y[3],$out,16*3

	vld @y[0],$inp,16*4
	vld @y[1],$inp,16*5
	vld @y[2],$inp,16*6
	vld @y[3],$inp,16*7
	vxor.v @y[0],@y[0],@x[4]
	vxor.v @y[1],@y[1],@x[5]
	vxor.v @y[2],@y[2],@x[6]
	vxor.v @y[3],@y[3],@x[7]
	vst @y[0],$out,16*4
	vst @y[1],$out,16*5
	vst @y[2],$out,16*6
	vst @y[3],$out,16*7

	addi.d $len,$len,-128
	beqz $len,.Ldone_4x
	addi.d $inp,$inp,128
	addi.d $out,$out,128
	vst @x[8],$sp,16*0
	vst @x[9],$sp,16*1
	vst @x[10],$sp,16*2
	vst @x[11],$sp,16*3
	move $t8,$zero
	b .Loop_tail_4x

.align 5
.L192_or_more4x:
	vld @y[0],$inp,16*0
	vld @y[1],$inp,16*1
	vld @y[2],$inp,16*2
	vld @y[3],$inp,16*3
	vxor.v @y[0],@y[0],@x[0]
	vxor.v @y[1],@y[1],@x[1]
	vxor.v @y[2],@y[2],@x[2]
	vxor.v @y[3],@y[3],@x[3]
	vst @y[0],$out,16*0
	vst @y[1],$out,16*1
	vst @y[2],$out,16*2
	vst @y[3],$out,16*3

	vld @y[0],$inp,16*4
	vld @y[1],$inp,16*5
	vld @y[2],$inp,16*6
	vld @y[3],$inp,16*7
	vxor.v @y[0],@y[0],@x[4]
	vxor.v @y[1],@y[1],@x[5]
	vxor.v @y[2],@y[2],@x[6]
	vxor.v @y[3],@y[3],@x[7]
	vst @y[0],$out,16*4
	vst @y[1],$out,16*5
	vst @y[2],$out,16*6
	vst @y[3],$out,16*7

	vld @y[0],$inp,16*8
	vld @y[1],$inp,16*9
	vld @y[2],$inp,16*10
	vld @y[3],$inp,16*11
	vxor.v @y[0],@y[0],@x[8]
	vxor.v @y[1],@y[1],@x[9]
	vxor.v @y[2],@y[2],@x[10]
	vxor.v @y[3],@y[3],@x[11]
	vst @y[0],$out,16*8
	vst @y[1],$out,16*9
	vst @y[2],$out,16*10
	vst @y[3],$out,16*11

	addi.d $len,$len,-192
	beqz $len,.Ldone_4x
	addi.d $inp,$inp,192
	addi.d $out,$out,192
	vst @x[12],$sp,16*0
	vst @x[13],$sp,16*1
	vst @x[14],$sp,16*2
	vst @x[15],$sp,16*3
	move $t8,$zero
	b .Loop_tail_4x

.Loop_tail_4x:
	# Xor input with states byte by byte
	ldx.bu $t5,$inp,$t8
	ldx.bu $t6,$sp,$t8
	xor $t5,$t5,$t6
	stx.b $t5,$out,$t8
	addi.w $t8,$t8,1
	addi.d $len,$len,-1
	bnez $len,.Loop_tail_4x
	b .Ldone_4x

.Ldone_4x:
	addi.d $sp,$sp,128
	b .Lrestore_saved_fpr

EOF
}

########################################################################
# 256-bit LASX code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly.
my @x = ($xr0, $xr1, $xr2, $xr3, $xr4, $xr5, $xr6, $xr7,
	$xr8, $xr9, $xr10, $xr11, $xr12, $xr13, $xr14, $xr15);

# Save the initial states in array @y[*]
my @y = ($xr16, $xr17, $xr18, $xr19, $xr20, $xr21, $xr22, $xr23,
	$xr24, $xr25, $xr26, $xr27, $xr28, $xr29, $xr30, $xr31);

sub ROUND_8x {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

$code .= <<EOF;
	xvadd.w @x[$a0],@x[$a0],@x[$b0]
	xvxor.v @x[$d0],@x[$d0],@x[$a0]
	xvrotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits
	xvadd.w @x[$a1],@x[$a1],@x[$b1]
	xvxor.v @x[$d1],@x[$d1],@x[$a1]
	xvrotri.w @x[$d1],@x[$d1],16

	xvadd.w @x[$c0],@x[$c0],@x[$d0]
	xvxor.v @x[$b0],@x[$b0],@x[$c0]
	xvrotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits
	xvadd.w @x[$c1],@x[$c1],@x[$d1]
	xvxor.v @x[$b1],@x[$b1],@x[$c1]
	xvrotri.w @x[$b1],@x[$b1],20

	xvadd.w @x[$a0],@x[$a0],@x[$b0]
	xvxor.v @x[$d0],@x[$d0],@x[$a0]
	xvrotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits
	xvadd.w @x[$a1],@x[$a1],@x[$b1]
	xvxor.v @x[$d1],@x[$d1],@x[$a1]
	xvrotri.w @x[$d1],@x[$d1],24

	xvadd.w @x[$c0],@x[$c0],@x[$d0]
	xvxor.v @x[$b0],@x[$b0],@x[$c0]
	xvrotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits
	xvadd.w @x[$c1],@x[$c1],@x[$d1]
	xvxor.v @x[$b1],@x[$b1],@x[$c1]
	xvrotri.w @x[$b1],@x[$b1],25

	xvadd.w @x[$a2],@x[$a2],@x[$b2]
	xvxor.v @x[$d2],@x[$d2],@x[$a2]
	xvrotri.w @x[$d2],@x[$d2],16
	xvadd.w @x[$a3],@x[$a3],@x[$b3]
	xvxor.v @x[$d3],@x[$d3],@x[$a3]
	xvrotri.w @x[$d3],@x[$d3],16

	xvadd.w @x[$c2],@x[$c2],@x[$d2]
	xvxor.v @x[$b2],@x[$b2],@x[$c2]
	xvrotri.w @x[$b2],@x[$b2],20
	xvadd.w @x[$c3],@x[$c3],@x[$d3]
	xvxor.v @x[$b3],@x[$b3],@x[$c3]
	xvrotri.w @x[$b3],@x[$b3],20

	xvadd.w @x[$a2],@x[$a2],@x[$b2]
	xvxor.v @x[$d2],@x[$d2],@x[$a2]
	xvrotri.w @x[$d2],@x[$d2],24
	xvadd.w @x[$a3],@x[$a3],@x[$b3]
	xvxor.v @x[$d3],@x[$d3],@x[$a3]
	xvrotri.w @x[$d3],@x[$d3],24

	xvadd.w @x[$c2],@x[$c2],@x[$d2]
	xvxor.v @x[$b2],@x[$b2],@x[$c2]
	xvrotri.w @x[$b2],@x[$b2],25
	xvadd.w @x[$c3],@x[$c3],@x[$d3]
	xvxor.v @x[$b3],@x[$b3],@x[$c3]
	xvrotri.w @x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_8x:
	addi.d $sp,$sp,-128

	# Save the initial block counter in $t4
	ld.w $t4,$counter,0
	b .Loop_outer_8x

.align 5
.Loop_outer_8x:
	# Load constant
	la.local $t8,.Lsigma
	xvldrepl.w @x[0],$t8,4*0 # 'expa'
	xvldrepl.w @x[1],$t8,4*1 # 'nd 3'
	xvldrepl.w @x[2],$t8,4*2 # '2-by'
	xvldrepl.w @x[3],$t8,4*3 # 'te k'

	# Load key
	xvldrepl.w @x[4],$key,4*0
	xvldrepl.w @x[5],$key,4*1
	xvldrepl.w @x[6],$key,4*2
	xvldrepl.w @x[7],$key,4*3
	xvldrepl.w @x[8],$key,4*4
	xvldrepl.w @x[9],$key,4*5
	xvldrepl.w @x[10],$key,4*6
	xvldrepl.w @x[11],$key,4*7

	# Load block counter
	xvreplgr2vr.w @x[12],$t4

	# Load nonce
	xvldrepl.w @x[13],$counter,4*1
	xvldrepl.w @x[14],$counter,4*2
	xvldrepl.w @x[15],$counter,4*3

	# Get the correct block counter for each block
	la.local $t8,.Linc8x
	xvld @y[0],$t8,0
	xvadd.w @x[12],@x[12],@y[0]

	# Copy the initial states from \@x[*] to \@y[*]
	xvori.b @y[0],@x[0],0
	xvori.b @y[1],@x[1],0
	xvori.b @y[2],@x[2],0
	xvori.b @y[3],@x[3],0
	xvori.b @y[4],@x[4],0
	xvori.b @y[5],@x[5],0
	xvori.b @y[6],@x[6],0
	xvori.b @y[7],@x[7],0
	xvori.b @y[8],@x[8],0
	xvori.b @y[9],@x[9],0
	xvori.b @y[10],@x[10],0
	xvori.b @y[11],@x[11],0
	xvori.b @y[12],@x[12],0
	xvori.b @y[13],@x[13],0
	xvori.b @y[14],@x[14],0
	xvori.b @y[15],@x[15],0

	# Update states in \@x[*] for 20 rounds
	ori $t8,$zero,10
	b .Loop_8x

.align 5
.Loop_8x:
EOF

&ROUND_8x (0, 4, 8, 12);
&ROUND_8x (0, 5, 10, 15);

$code .= <<EOF;
	addi.w $t8,$t8,-1
	bnez $t8,.Loop_8x

	# Get the final states by adding the initial states
	xvadd.w @x[0],@x[0],@y[0]
	xvadd.w @x[1],@x[1],@y[1]
	xvadd.w @x[2],@x[2],@y[2]
	xvadd.w @x[3],@x[3],@y[3]
	xvadd.w @x[4],@x[4],@y[4]
	xvadd.w @x[5],@x[5],@y[5]
	xvadd.w @x[6],@x[6],@y[6]
	xvadd.w @x[7],@x[7],@y[7]
	xvadd.w @x[8],@x[8],@y[8]
	xvadd.w @x[9],@x[9],@y[9]
	xvadd.w @x[10],@x[10],@y[10]
	xvadd.w @x[11],@x[11],@y[11]
	xvadd.w @x[12],@x[12],@y[12]
	xvadd.w @x[13],@x[13],@y[13]
	xvadd.w @x[14],@x[14],@y[14]
	xvadd.w @x[15],@x[15],@y[15]

	# Get the transpose of \@x[*] and save them in \@y[*]
	xvilvl.w @y[0],@x[1],@x[0]
	xvilvh.w @y[1],@x[1],@x[0]
	xvilvl.w @y[2],@x[3],@x[2]
	xvilvh.w @y[3],@x[3],@x[2]
	xvilvl.w @y[4],@x[5],@x[4]
	xvilvh.w @y[5],@x[5],@x[4]
	xvilvl.w @y[6],@x[7],@x[6]
	xvilvh.w @y[7],@x[7],@x[6]
	xvilvl.w @y[8],@x[9],@x[8]
	xvilvh.w @y[9],@x[9],@x[8]
	xvilvl.w @y[10],@x[11],@x[10]
	xvilvh.w @y[11],@x[11],@x[10]
	xvilvl.w @y[12],@x[13],@x[12]
	xvilvh.w @y[13],@x[13],@x[12]
	xvilvl.w @y[14],@x[15],@x[14]
	xvilvh.w @y[15],@x[15],@x[14]

	xvilvl.d @x[0],@y[2],@y[0]
	xvilvh.d @x[1],@y[2],@y[0]
	xvilvl.d @x[2],@y[3],@y[1]
	xvilvh.d @x[3],@y[3],@y[1]
	xvilvl.d @x[4],@y[6],@y[4]
	xvilvh.d @x[5],@y[6],@y[4]
	xvilvl.d @x[6],@y[7],@y[5]
	xvilvh.d @x[7],@y[7],@y[5]
	xvilvl.d @x[8],@y[10],@y[8]
	xvilvh.d @x[9],@y[10],@y[8]
	xvilvl.d @x[10],@y[11],@y[9]
	xvilvh.d @x[11],@y[11],@y[9]
	xvilvl.d @x[12],@y[14],@y[12]
	xvilvh.d @x[13],@y[14],@y[12]
	xvilvl.d @x[14],@y[15],@y[13]
	xvilvh.d @x[15],@y[15],@y[13]

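	# The interleaves above transpose each 128-bit lane separately; the
	# xvpermi.q sequence below merges the low lanes (immediate 0x20) or the
	# high lanes (0x31) of two registers so that whole blocks land in \@y[*].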
	xvori.b @y[0],@x[4],0
	xvpermi.q @y[0],@x[0],0x20
	xvori.b @y[1],@x[5],0
	xvpermi.q @y[1],@x[1],0x20
	xvori.b @y[2],@x[6],0
	xvpermi.q @y[2],@x[2],0x20
	xvori.b @y[3],@x[7],0
	xvpermi.q @y[3],@x[3],0x20
	xvori.b @y[4],@x[4],0
	xvpermi.q @y[4],@x[0],0x31
	xvori.b @y[5],@x[5],0
	xvpermi.q @y[5],@x[1],0x31
	xvori.b @y[6],@x[6],0
	xvpermi.q @y[6],@x[2],0x31
	xvori.b @y[7],@x[7],0
	xvpermi.q @y[7],@x[3],0x31
	xvori.b @y[8],@x[12],0
	xvpermi.q @y[8],@x[8],0x20
	xvori.b @y[9],@x[13],0
	xvpermi.q @y[9],@x[9],0x20
	xvori.b @y[10],@x[14],0
	xvpermi.q @y[10],@x[10],0x20
	xvori.b @y[11],@x[15],0
	xvpermi.q @y[11],@x[11],0x20
	xvori.b @y[12],@x[12],0
	xvpermi.q @y[12],@x[8],0x31
	xvori.b @y[13],@x[13],0
	xvpermi.q @y[13],@x[9],0x31
	xvori.b @y[14],@x[14],0
	xvpermi.q @y[14],@x[10],0x31
	xvori.b @y[15],@x[15],0
	xvpermi.q @y[15],@x[11],0x31

EOF

# Adjust the order of elements in @y[*] for ease of use.
@y = (@y[0],@y[8],@y[1],@y[9],@y[2],@y[10],@y[3],@y[11],
	@y[4],@y[12],@y[5],@y[13],@y[6],@y[14],@y[7],@y[15]);
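# After this reordering each pair @y[2*i],@y[2*i+1] holds the i-th 64-byte
# keystream block as contiguous bytes.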

$code .= <<EOF;
	ori $t8,$zero,64*8
	bltu $len,$t8,.Ltail_8x

	# Get the encrypted message by xoring the states with the plaintext
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	xvld @x[0],$inp,32*4
	xvld @x[1],$inp,32*5
	xvld @x[2],$inp,32*6
	xvld @x[3],$inp,32*7
	xvxor.v @x[0],@x[0],@y[4]
	xvxor.v @x[1],@x[1],@y[5]
	xvxor.v @x[2],@x[2],@y[6]
	xvxor.v @x[3],@x[3],@y[7]
	xvst @x[0],$out,32*4
	xvst @x[1],$out,32*5
	xvst @x[2],$out,32*6
	xvst @x[3],$out,32*7

	xvld @x[0],$inp,32*8
	xvld @x[1],$inp,32*9
	xvld @x[2],$inp,32*10
	xvld @x[3],$inp,32*11
	xvxor.v @x[0],@x[0],@y[8]
	xvxor.v @x[1],@x[1],@y[9]
	xvxor.v @x[2],@x[2],@y[10]
	xvxor.v @x[3],@x[3],@y[11]
	xvst @x[0],$out,32*8
	xvst @x[1],$out,32*9
	xvst @x[2],$out,32*10
	xvst @x[3],$out,32*11

	xvld @x[0],$inp,32*12
	xvld @x[1],$inp,32*13
	xvld @x[2],$inp,32*14
	xvld @x[3],$inp,32*15
	xvxor.v @x[0],@x[0],@y[12]
	xvxor.v @x[1],@x[1],@y[13]
	xvxor.v @x[2],@x[2],@y[14]
	xvxor.v @x[3],@x[3],@y[15]
	xvst @x[0],$out,32*12
	xvst @x[1],$out,32*13
	xvst @x[2],$out,32*14
	xvst @x[3],$out,32*15

	addi.d $len,$len,-64*8
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,64*8
	addi.d $out,$out,64*8
	addi.w $t4,$t4,8
	b .Loop_outer_8x

.Ltail_8x:
	# Handle the tail for 8x (1 <= tail_len <= 511)
	ori $t8,$zero,448
	bgeu $len,$t8,.L448_or_more8x
	ori $t8,$zero,384
	bgeu $len,$t8,.L384_or_more8x
	ori $t8,$zero,320
	bgeu $len,$t8,.L320_or_more8x
	ori $t8,$zero,256
	bgeu $len,$t8,.L256_or_more8x
	ori $t8,$zero,192
	bgeu $len,$t8,.L192_or_more8x
	ori $t8,$zero,128
	bgeu $len,$t8,.L128_or_more8x
	ori $t8,$zero,64
	bgeu $len,$t8,.L64_or_more8x

	xvst @y[0],$sp,32*0
	xvst @y[1],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L64_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1

	addi.d $len,$len,-64
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,64
	addi.d $out,$out,64
	xvst @y[2],$sp,32*0
	xvst @y[3],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L128_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	addi.d $len,$len,-128
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,128
	addi.d $out,$out,128
	xvst @y[4],$sp,32*0
	xvst @y[5],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L192_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	xvld @x[0],$inp,32*4
	xvld @x[1],$inp,32*5
	xvxor.v @x[0],@x[0],@y[4]
	xvxor.v @x[1],@x[1],@y[5]
	xvst @x[0],$out,32*4
	xvst @x[1],$out,32*5

	addi.d $len,$len,-192
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,192
	addi.d $out,$out,192
	xvst @y[6],$sp,32*0
	xvst @y[7],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L256_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	xvld @x[0],$inp,32*4
	xvld @x[1],$inp,32*5
	xvld @x[2],$inp,32*6
	xvld @x[3],$inp,32*7
	xvxor.v @x[0],@x[0],@y[4]
	xvxor.v @x[1],@x[1],@y[5]
	xvxor.v @x[2],@x[2],@y[6]
	xvxor.v @x[3],@x[3],@y[7]
	xvst @x[0],$out,32*4
	xvst @x[1],$out,32*5
	xvst @x[2],$out,32*6
	xvst @x[3],$out,32*7

	addi.d $len,$len,-256
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,256
	addi.d $out,$out,256
	xvst @y[8],$sp,32*0
	xvst @y[9],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L320_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	xvld @x[0],$inp,32*4
	xvld @x[1],$inp,32*5
	xvld @x[2],$inp,32*6
	xvld @x[3],$inp,32*7
	xvxor.v @x[0],@x[0],@y[4]
	xvxor.v @x[1],@x[1],@y[5]
	xvxor.v @x[2],@x[2],@y[6]
	xvxor.v @x[3],@x[3],@y[7]
	xvst @x[0],$out,32*4
	xvst @x[1],$out,32*5
	xvst @x[2],$out,32*6
	xvst @x[3],$out,32*7

	xvld @x[0],$inp,32*8
	xvld @x[1],$inp,32*9
	xvxor.v @x[0],@x[0],@y[8]
	xvxor.v @x[1],@x[1],@y[9]
	xvst @x[0],$out,32*8
	xvst @x[1],$out,32*9

	addi.d $len,$len,-320
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,320
	addi.d $out,$out,320
	xvst @y[10],$sp,32*0
	xvst @y[11],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L384_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	xvld @x[0],$inp,32*4
	xvld @x[1],$inp,32*5
	xvld @x[2],$inp,32*6
	xvld @x[3],$inp,32*7
	xvxor.v @x[0],@x[0],@y[4]
	xvxor.v @x[1],@x[1],@y[5]
	xvxor.v @x[2],@x[2],@y[6]
	xvxor.v @x[3],@x[3],@y[7]
	xvst @x[0],$out,32*4
	xvst @x[1],$out,32*5
	xvst @x[2],$out,32*6
	xvst @x[3],$out,32*7

	xvld @x[0],$inp,32*8
	xvld @x[1],$inp,32*9
	xvld @x[2],$inp,32*10
	xvld @x[3],$inp,32*11
	xvxor.v @x[0],@x[0],@y[8]
	xvxor.v @x[1],@x[1],@y[9]
	xvxor.v @x[2],@x[2],@y[10]
	xvxor.v @x[3],@x[3],@y[11]
	xvst @x[0],$out,32*8
	xvst @x[1],$out,32*9
	xvst @x[2],$out,32*10
	xvst @x[3],$out,32*11

	addi.d $len,$len,-384
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,384
	addi.d $out,$out,384
	xvst @y[12],$sp,32*0
	xvst @y[13],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.align 5
.L448_or_more8x:
	xvld @x[0],$inp,32*0
	xvld @x[1],$inp,32*1
	xvld @x[2],$inp,32*2
	xvld @x[3],$inp,32*3
	xvxor.v @x[0],@x[0],@y[0]
	xvxor.v @x[1],@x[1],@y[1]
	xvxor.v @x[2],@x[2],@y[2]
	xvxor.v @x[3],@x[3],@y[3]
	xvst @x[0],$out,32*0
	xvst @x[1],$out,32*1
	xvst @x[2],$out,32*2
	xvst @x[3],$out,32*3

	xvld @x[0],$inp,32*4
	xvld @x[1],$inp,32*5
	xvld @x[2],$inp,32*6
	xvld @x[3],$inp,32*7
	xvxor.v @x[0],@x[0],@y[4]
	xvxor.v @x[1],@x[1],@y[5]
	xvxor.v @x[2],@x[2],@y[6]
	xvxor.v @x[3],@x[3],@y[7]
	xvst @x[0],$out,32*4
	xvst @x[1],$out,32*5
	xvst @x[2],$out,32*6
	xvst @x[3],$out,32*7

	xvld @x[0],$inp,32*8
	xvld @x[1],$inp,32*9
	xvld @x[2],$inp,32*10
	xvld @x[3],$inp,32*11
	xvxor.v @x[0],@x[0],@y[8]
	xvxor.v @x[1],@x[1],@y[9]
	xvxor.v @x[2],@x[2],@y[10]
	xvxor.v @x[3],@x[3],@y[11]
	xvst @x[0],$out,32*8
	xvst @x[1],$out,32*9
	xvst @x[2],$out,32*10
	xvst @x[3],$out,32*11

	xvld @x[0],$inp,32*12
	xvld @x[1],$inp,32*13
	xvxor.v @x[0],@x[0],@y[12]
	xvxor.v @x[1],@x[1],@y[13]
	xvst @x[0],$out,32*12
	xvst @x[1],$out,32*13

	addi.d $len,$len,-448
	beqz $len,.Ldone_8x
	addi.d $inp,$inp,448
	addi.d $out,$out,448
	xvst @y[14],$sp,32*0
	xvst @y[15],$sp,32*1
	move $t8,$zero
	b .Loop_tail_8x

.Loop_tail_8x:
	# Xor input with states byte by byte
	ldx.bu $t5,$inp,$t8
	ldx.bu $t6,$sp,$t8
	xor $t5,$t5,$t6
	stx.b $t5,$out,$t8
	addi.w $t8,$t8,1
	addi.d $len,$len,-1
	bnez $len,.Loop_tail_8x
	b .Ldone_8x

.Ldone_8x:
	addi.d $sp,$sp,128
	b .Lrestore_saved_fpr

EOF
}

$code .= <<EOF;
.Lrestore_saved_fpr:
	fld.d $fs0,$sp,0
	fld.d $fs1,$sp,8
	fld.d $fs2,$sp,16
	fld.d $fs3,$sp,24
	fld.d $fs4,$sp,32
	fld.d $fs5,$sp,40
	fld.d $fs6,$sp,48
	fld.d $fs7,$sp,56
	addi.d $sp,$sp,64
.Lno_data:
.Lend:
	jr $ra
.size ChaCha20_ctr32,.-ChaCha20_ctr32
EOF

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;