6aa36e8e 1#! /usr/bin/env perl
1212818e 2# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
49d3b641 4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for x86_64.
18#
19# March 2015
20#
21# Initial release.
22#
23# December 2016
24#
25# Add AVX512F+VL+BW code path.
26#
27# November 2017
28#
29# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
30# executed even on Knights Landing. The trigger for the modification was
31# the observation that AVX512 code paths can negatively affect overall
32# Skylake-X system performance. Since we are likely to suppress the
33# AVX512F capability flag [at least on Skylake-X], the conversion serves
34# as a kind of "investment protection". Note that the next *lake processor,
35# Cannon Lake, has an AVX512IFMA code path to execute...
36#
37# Numbers are cycles per processed byte with poly1305_blocks alone,
38# measured with rdtsc at fixed clock frequency.
39#
64d92d74 40# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
41# P4 4.46/+120% -
42# Core 2 2.41/+90% -
43# Westmere 1.88/+120% -
a98c648e 44# Sandy Bridge 1.39/+140% 1.10
4b8736a2 45# Haswell 1.14/+175% 1.11 0.65
64d92d74 46# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
a98c648e 47# Silvermont 2.83/+95% -
4dfe4310 48# Knights L 3.60/? 1.65 1.10 0.41(***)
ace05265 49# Goldmont 1.70/+180% -
50# VIA Nano 1.82/+150% -
51# Sledgehammer 1.38/+160% -
4b8736a2 52# Bulldozer 2.30/+130% 0.97
54f8f9a1 53# Ryzen 1.15/+200% 1.08 1.18
54#
55# (*)	improvement coefficients relative to clang are more modest,
56#	~50% on most processors; in both cases we are comparing to
57#	__int128 code;
58# (**)	an SSE2 implementation was attempted, but among non-AVX processors
59#	it was faster than integer-only code only on older Intel P4 and
60#	Core processors, by 30-50% (less so the newer the processor), and
61#	slower on contemporary ones, for example almost 2x slower on Atom;
62#	as the former are naturally disappearing, SSE2 is deemed unnecessary;
63# (***)	strangely enough, performance seems to vary from core to core;
64#	the listed result is the best case;
65
66$flavour = shift;
67$output = shift;
68if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
69
70$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
71
72$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
73( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
74( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
75die "can't locate x86_64-xlate.pl";
76
77if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
78 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
fd910ef9 79 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
80}
81
82if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1ea01427 83 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
84 $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
85 $avx += 2 if ($1==2.11 && $2>=8);
86}
87
88if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
89 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
90 $avx = ($1>=10) + ($1>=12);
91}
92
93if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
94 $avx = ($2>=3.0) + ($2>3.0);
95}
96
cfe1d992 97open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
98*STDOUT=*OUT;
99
100my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
101my ($mac,$nonce)=($inp,$len); # *_emit arguments
102my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
103my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
104
105sub poly1305_iteration {
106# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
107# output: $h0-$h2 *= $r0-$r1
108$code.=<<___;
109 mulq $h0 # h0*r1
110 mov %rax,$d2
111 mov $r0,%rax
112 mov %rdx,$d3
113
114 mulq $h0 # h0*r0
115 mov %rax,$h0 # future $h0
116 mov $r0,%rax
117 mov %rdx,$d1
118
119 mulq $h1 # h1*r0
120 add %rax,$d2
121 mov $s1,%rax
122 adc %rdx,$d3
123
124 mulq $h1 # h1*s1
125 mov $h2,$h1 # borrow $h1
126 add %rax,$h0
127 adc %rdx,$d1
128
129 imulq $s1,$h1 # h2*s1
130 add $h1,$d2
131 mov $d1,$h1
132 adc \$0,$d3
133
134 imulq $r0,$h2 # h2*r0
135 add $d2,$h1
136 mov \$-4,%rax # mask value
137 adc $h2,$d3
138
139 and $d3,%rax # last reduction step
140 mov $d3,$h2
141 shr \$2,$d3
142 and \$3,$h2
143 add $d3,%rax
144 add %rax,$h0
145 adc \$0,$h1
4b8736a2 146 adc \$0,$h2
147___
148}
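
# A minimal reference sketch (never called; the helper name and the use of
# Math::BigInt are ours, not part of the generated code) of what the
# poly1305_iteration step above computes: h = h*r, partially reduced
# mod 2^130-5 by folding the bits above 2^130 back in multiplied by 5,
# which is what the "and \$-4 / shr \$2 / add" sequence does to the top word.
sub poly1305_iteration_ref {
	my ($h, $r) = @_;			# Math::BigInt, h < 2^131, r clamped
	use Math::BigInt;
	my $d  = $h->copy->bmul($r);		# full product
	my $hi = $d->copy->brsft(130);		# bits 130 and up
	my $lo = $d->band(Math::BigInt->new(2)->bpow(130)->bsub(1));
	return $lo->badd($hi->bmul(5));		# 2^130 == 5 (mod 2^130-5)
}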
149
150########################################################################
151# The layout of the opaque area is as follows.
152#
153# unsigned __int64 h[3]; # current hash value base 2^64
154# unsigned __int64 r[2]; # key value base 2^64
155
156$code.=<<___;
157.text
158
159.extern OPENSSL_ia32cap_P
160
161.globl poly1305_init
3992e8c0 162.hidden poly1305_init
4ef29667 163.globl poly1305_blocks
3992e8c0 164.hidden poly1305_blocks
4ef29667 165.globl poly1305_emit
166.hidden poly1305_emit
167
a85dbf11 168.type poly1305_init,\@function,3
169.align 32
170poly1305_init:
171 xor %rax,%rax
172 mov %rax,0($ctx) # initialize hash value
173 mov %rax,8($ctx)
174 mov %rax,16($ctx)
175
176 cmp \$0,$inp
177 je .Lno_key
178
179 lea poly1305_blocks(%rip),%r10
180 lea poly1305_emit(%rip),%r11
181___
182$code.=<<___ if ($avx);
183 mov OPENSSL_ia32cap_P+4(%rip),%r9
184 lea poly1305_blocks_avx(%rip),%rax
185 lea poly1305_emit_avx(%rip),%rcx
186 bt \$`60-32`,%r9 # AVX?
187 cmovc %rax,%r10
188 cmovc %rcx,%r11
189___
190$code.=<<___ if ($avx>1);
191 lea poly1305_blocks_avx2(%rip),%rax
192 bt \$`5+32`,%r9 # AVX2?
193 cmovc %rax,%r10
194___
195$code.=<<___ if ($avx>3);
196 mov \$`(1<<31|1<<21|1<<16)`,%rax
197 shr \$32,%r9
198 and %rax,%r9
199 cmp %rax,%r9
200 je .Linit_base2_44
201___
202$code.=<<___;
203 mov \$0x0ffffffc0fffffff,%rax
204 mov \$0x0ffffffc0ffffffc,%rcx
205 and 0($inp),%rax
206 and 8($inp),%rcx
207 mov %rax,24($ctx)
208 mov %rcx,32($ctx)
209___
210$code.=<<___ if ($flavour !~ /elf32/);
211 mov %r10,0(%rdx)
212 mov %r11,8(%rdx)
213___
214$code.=<<___ if ($flavour =~ /elf32/);
215 mov %r10d,0(%rdx)
216 mov %r11d,4(%rdx)
217___
218$code.=<<___;
219 mov \$1,%eax
220.Lno_key:
221 ret
222.size poly1305_init,.-poly1305_init
223
224.type poly1305_blocks,\@function,4
225.align 32
226poly1305_blocks:
1c47e883 227.cfi_startproc
a85dbf11 228.Lblocks:
229 shr \$4,$len
230 jz .Lno_data # too short
231
232 push %rbx
1c47e883 233.cfi_push %rbx
a98c648e 234 push %rbp
1c47e883 235.cfi_push %rbp
a98c648e 236 push %r12
1c47e883 237.cfi_push %r12
a98c648e 238 push %r13
1c47e883 239.cfi_push %r13
a98c648e 240 push %r14
1c47e883 241.cfi_push %r14
a98c648e 242 push %r15
1c47e883 243.cfi_push %r15
244.Lblocks_body:
245
246 mov $len,%r15 # reassign $len
247
248 mov 24($ctx),$r0 # load r
249 mov 32($ctx),$s1
250
251 mov 0($ctx),$h0 # load hash value
252 mov 8($ctx),$h1
253 mov 16($ctx),$h2
254
255 mov $s1,$r1
256 shr \$2,$s1
257 mov $r1,%rax
258 add $r1,$s1 # s1 = r1 + (r1 >> 2)
259 jmp .Loop
260
261.align 32
262.Loop:
263 add 0($inp),$h0 # accumulate input
264 adc 8($inp),$h1
265 lea 16($inp),$inp
266 adc $padbit,$h2
267___
268 &poly1305_iteration();
269$code.=<<___;
270 mov $r1,%rax
271 dec %r15 # len-=16
272 jnz .Loop
273
274 mov $h0,0($ctx) # store hash value
275 mov $h1,8($ctx)
276 mov $h2,16($ctx)
277
278 mov 0(%rsp),%r15
1c47e883 279.cfi_restore %r15
a98c648e 280 mov 8(%rsp),%r14
1c47e883 281.cfi_restore %r14
a98c648e 282 mov 16(%rsp),%r13
1c47e883 283.cfi_restore %r13
a98c648e 284 mov 24(%rsp),%r12
1c47e883 285.cfi_restore %r12
a98c648e 286 mov 32(%rsp),%rbp
1c47e883 287.cfi_restore %rbp
a98c648e 288 mov 40(%rsp),%rbx
1c47e883 289.cfi_restore %rbx
a98c648e 290 lea 48(%rsp),%rsp
1c47e883 291.cfi_adjust_cfa_offset -48
292.Lno_data:
293.Lblocks_epilogue:
294 ret
1c47e883 295.cfi_endproc
296.size poly1305_blocks,.-poly1305_blocks
297
298.type poly1305_emit,\@function,3
299.align 32
300poly1305_emit:
a85dbf11 301.Lemit:
302 mov 0($ctx),%r8 # load hash value
303 mov 8($ctx),%r9
304 mov 16($ctx),%r10
305
306 mov %r8,%rax
307 add \$5,%r8 # compare to modulus
308 mov %r9,%rcx
309 adc \$0,%r9
310 adc \$0,%r10
46f4e1be 311 shr \$2,%r10 # did 130-bit value overflow?
312 cmovnz %r8,%rax
313 cmovnz %r9,%rcx
314
315 add 0($nonce),%rax # accumulate nonce
316 adc 8($nonce),%rcx
317 mov %rax,0($mac) # write result
318 mov %rcx,8($mac)
319
320 ret
321.size poly1305_emit,.-poly1305_emit
322___
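
# A reference sketch (never called; the name and Math::BigInt usage are ours)
# of what poly1305_emit above computes. The "add \$5 / shr \$2" trick is a
# branch-free comparison of the 130-bit hash against the modulus 2^130-5:
# if h+5 overflows 130 bits, then h >= 2^130-5 and the reduced value is
# selected before the 128-bit nonce is added.
sub poly1305_emit_ref {
	my ($h0,$h1,$h2,$n0,$n1) = @_;		# hash limbs base 2^64, nonce words
	use Math::BigInt;
	my $h = Math::BigInt->new($h2)->blsft(64)->badd($h1)->blsft(64)->badd($h0);
	$h->bsub(Math::BigInt->new(2)->bpow(130)->bsub(5))
		if $h->copy->badd(5)->brsft(130)->is_pos();
	$h->badd(Math::BigInt->new($n1)->blsft(64)->badd($n0));
	return $h->bmod(Math::BigInt->new(2)->bpow(128));	# written as the MAC
}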
323if ($avx) {
324
325########################################################################
326# The layout of the opaque area is as follows.
327#
328# unsigned __int32 h[5]; # current hash value base 2^26
329# unsigned __int32 is_base2_26;
330# unsigned __int64 r[2]; # key value base 2^64
331# unsigned __int64 pad;
332# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
333#
334# where r^n are the base 2^26 digits of the powers of the multiplier key.
335# There are 5 digits, but the last four are interleaved with their multiples
336# of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
337
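# Illustrative sketch only (never called; the helper name is ours) of how one
# power of r is split into five 26-bit limbs and stored interleaved with its
# multiples of 5, giving the 9-element r0, r1, 5*r1, ..., r4, 5*r4 layout
# described above.
sub limbs26_ref {
	my $r = shift;				# Math::BigInt, < 2^130
	use Math::BigInt;
	my @l = map { $r->copy->brsft(26*$_)->band(0x3ffffff)->numify } (0..4);
	return ($l[0], map { ($l[$_], 5*$l[$_]) } (1..4));
}
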
338my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
339 map("%xmm$_",(0..15));
340
341$code.=<<___;
342.type __poly1305_block,\@abi-omnipotent
343.align 32
344__poly1305_block:
345___
346 &poly1305_iteration();
347$code.=<<___;
348 ret
349.size __poly1305_block,.-__poly1305_block
350
351.type __poly1305_init_avx,\@abi-omnipotent
352.align 32
353__poly1305_init_avx:
354 mov $r0,$h0
355 mov $r1,$h1
356 xor $h2,$h2
357
358 lea 48+64($ctx),$ctx # size optimization
359
360 mov $r1,%rax
361 call __poly1305_block # r^2
362
363 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
364 mov \$0x3ffffff,%edx
365 mov $h0,$d1
366 and $h0#d,%eax
367 mov $r0,$d2
368 and $r0#d,%edx
369 mov %eax,`16*0+0-64`($ctx)
370 shr \$26,$d1
371 mov %edx,`16*0+4-64`($ctx)
372 shr \$26,$d2
373
374 mov \$0x3ffffff,%eax
375 mov \$0x3ffffff,%edx
376 and $d1#d,%eax
377 and $d2#d,%edx
378 mov %eax,`16*1+0-64`($ctx)
379 lea (%rax,%rax,4),%eax # *5
380 mov %edx,`16*1+4-64`($ctx)
381 lea (%rdx,%rdx,4),%edx # *5
382 mov %eax,`16*2+0-64`($ctx)
383 shr \$26,$d1
384 mov %edx,`16*2+4-64`($ctx)
385 shr \$26,$d2
386
387 mov $h1,%rax
388 mov $r1,%rdx
389 shl \$12,%rax
390 shl \$12,%rdx
391 or $d1,%rax
392 or $d2,%rdx
393 and \$0x3ffffff,%eax
394 and \$0x3ffffff,%edx
395 mov %eax,`16*3+0-64`($ctx)
396 lea (%rax,%rax,4),%eax # *5
397 mov %edx,`16*3+4-64`($ctx)
398 lea (%rdx,%rdx,4),%edx # *5
399 mov %eax,`16*4+0-64`($ctx)
400 mov $h1,$d1
401 mov %edx,`16*4+4-64`($ctx)
402 mov $r1,$d2
403
404 mov \$0x3ffffff,%eax
405 mov \$0x3ffffff,%edx
406 shr \$14,$d1
407 shr \$14,$d2
408 and $d1#d,%eax
409 and $d2#d,%edx
410 mov %eax,`16*5+0-64`($ctx)
411 lea (%rax,%rax,4),%eax # *5
412 mov %edx,`16*5+4-64`($ctx)
413 lea (%rdx,%rdx,4),%edx # *5
414 mov %eax,`16*6+0-64`($ctx)
415 shr \$26,$d1
416 mov %edx,`16*6+4-64`($ctx)
417 shr \$26,$d2
418
419 mov $h2,%rax
420 shl \$24,%rax
421 or %rax,$d1
422 mov $d1#d,`16*7+0-64`($ctx)
423 lea ($d1,$d1,4),$d1 # *5
424 mov $d2#d,`16*7+4-64`($ctx)
425 lea ($d2,$d2,4),$d2 # *5
426 mov $d1#d,`16*8+0-64`($ctx)
427 mov $d2#d,`16*8+4-64`($ctx)
428
429 mov $r1,%rax
430 call __poly1305_block # r^3
431
432 mov \$0x3ffffff,%eax # save r^3 base 2^26
433 mov $h0,$d1
434 and $h0#d,%eax
435 shr \$26,$d1
436 mov %eax,`16*0+12-64`($ctx)
437
438 mov \$0x3ffffff,%edx
439 and $d1#d,%edx
440 mov %edx,`16*1+12-64`($ctx)
441 lea (%rdx,%rdx,4),%edx # *5
442 shr \$26,$d1
443 mov %edx,`16*2+12-64`($ctx)
444
445 mov $h1,%rax
446 shl \$12,%rax
447 or $d1,%rax
448 and \$0x3ffffff,%eax
449 mov %eax,`16*3+12-64`($ctx)
450 lea (%rax,%rax,4),%eax # *5
451 mov $h1,$d1
452 mov %eax,`16*4+12-64`($ctx)
453
454 mov \$0x3ffffff,%edx
455 shr \$14,$d1
456 and $d1#d,%edx
457 mov %edx,`16*5+12-64`($ctx)
458 lea (%rdx,%rdx,4),%edx # *5
459 shr \$26,$d1
460 mov %edx,`16*6+12-64`($ctx)
461
462 mov $h2,%rax
463 shl \$24,%rax
464 or %rax,$d1
465 mov $d1#d,`16*7+12-64`($ctx)
466 lea ($d1,$d1,4),$d1 # *5
467 mov $d1#d,`16*8+12-64`($ctx)
468
469 mov $r1,%rax
470 call __poly1305_block # r^4
471
472 mov \$0x3ffffff,%eax # save r^4 base 2^26
473 mov $h0,$d1
474 and $h0#d,%eax
475 shr \$26,$d1
476 mov %eax,`16*0+8-64`($ctx)
477
478 mov \$0x3ffffff,%edx
479 and $d1#d,%edx
480 mov %edx,`16*1+8-64`($ctx)
481 lea (%rdx,%rdx,4),%edx # *5
482 shr \$26,$d1
483 mov %edx,`16*2+8-64`($ctx)
484
485 mov $h1,%rax
486 shl \$12,%rax
487 or $d1,%rax
488 and \$0x3ffffff,%eax
489 mov %eax,`16*3+8-64`($ctx)
490 lea (%rax,%rax,4),%eax # *5
491 mov $h1,$d1
492 mov %eax,`16*4+8-64`($ctx)
493
494 mov \$0x3ffffff,%edx
495 shr \$14,$d1
496 and $d1#d,%edx
497 mov %edx,`16*5+8-64`($ctx)
498 lea (%rdx,%rdx,4),%edx # *5
499 shr \$26,$d1
500 mov %edx,`16*6+8-64`($ctx)
501
502 mov $h2,%rax
503 shl \$24,%rax
504 or %rax,$d1
505 mov $d1#d,`16*7+8-64`($ctx)
506 lea ($d1,$d1,4),$d1 # *5
507 mov $d1#d,`16*8+8-64`($ctx)
508
509 lea -48-64($ctx),$ctx # size [de-]optimization
510 ret
511.size __poly1305_init_avx,.-__poly1305_init_avx
512
513.type poly1305_blocks_avx,\@function,4
514.align 32
515poly1305_blocks_avx:
1c47e883 516.cfi_startproc
517 mov 20($ctx),%r8d # is_base2_26
518 cmp \$128,$len
519 jae .Lblocks_avx
520 test %r8d,%r8d
a85dbf11 521 jz .Lblocks
522
523.Lblocks_avx:
524 and \$-16,$len
525 jz .Lno_data_avx
526
527 vzeroupper
528
529 test %r8d,%r8d
530 jz .Lbase2_64_avx
531
532 test \$31,$len
533 jz .Leven_avx
534
535 push %rbx
1c47e883 536.cfi_push %rbx
a98c648e 537 push %rbp
1c47e883 538.cfi_push %rbp
a98c648e 539 push %r12
1c47e883 540.cfi_push %r12
a98c648e 541 push %r13
1c47e883 542.cfi_push %r13
a98c648e 543 push %r14
1c47e883 544.cfi_push %r14
a98c648e 545 push %r15
1c47e883 546.cfi_push %r15
547.Lblocks_avx_body:
548
549 mov $len,%r15 # reassign $len
550
551 mov 0($ctx),$d1 # load hash value
552 mov 8($ctx),$d2
553 mov 16($ctx),$h2#d
554
555 mov 24($ctx),$r0 # load r
556 mov 32($ctx),$s1
557
558 ################################# base 2^26 -> base 2^64
559 mov $d1#d,$h0#d
28411657 560 and \$`-1*(1<<31)`,$d1
561 mov $d2,$r1 # borrow $r1
562 mov $d2#d,$h1#d
28411657 563 and \$`-1*(1<<31)`,$d2
564
565 shr \$6,$d1
566 shl \$52,$r1
567 add $d1,$h0
568 shr \$12,$h1
569 shr \$18,$d2
570 add $r1,$h0
571 adc $d2,$h1
572
573 mov $h2,$d1
574 shl \$40,$d1
575 shr \$24,$h2
576 add $d1,$h1
577 adc \$0,$h2 # can be partially reduced...
578
579 mov \$-4,$d2 # ... so reduce
580 mov $h2,$d1
581 and $h2,$d2
582 shr \$2,$d1
583 and \$3,$h2
584 add $d2,$d1 # =*5
585 add $d1,$h0
586 adc \$0,$h1
4b8736a2 587 adc \$0,$h2
588
589 mov $s1,$r1
590 mov $s1,%rax
591 shr \$2,$s1
592 add $r1,$s1 # s1 = r1 + (r1 >> 2)
593
594 add 0($inp),$h0 # accumulate input
595 adc 8($inp),$h1
596 lea 16($inp),$inp
597 adc $padbit,$h2
598
599 call __poly1305_block
600
601 test $padbit,$padbit # if $padbit is zero,
602 jz .Lstore_base2_64_avx # store hash in base 2^64 format
603
604 ################################# base 2^64 -> base 2^26
605 mov $h0,%rax
606 mov $h0,%rdx
607 shr \$52,$h0
608 mov $h1,$r0
609 mov $h1,$r1
610 shr \$26,%rdx
611 and \$0x3ffffff,%rax # h[0]
612 shl \$12,$r0
613 and \$0x3ffffff,%rdx # h[1]
614 shr \$14,$h1
615 or $r0,$h0
616 shl \$24,$h2
617 and \$0x3ffffff,$h0 # h[2]
618 shr \$40,$r1
619 and \$0x3ffffff,$h1 # h[3]
620 or $r1,$h2 # h[4]
621
622 sub \$16,%r15
623 jz .Lstore_base2_26_avx
624
625 vmovd %rax#d,$H0
626 vmovd %rdx#d,$H1
627 vmovd $h0#d,$H2
628 vmovd $h1#d,$H3
629 vmovd $h2#d,$H4
630 jmp .Lproceed_avx
631
632.align 32
633.Lstore_base2_64_avx:
634 mov $h0,0($ctx)
635 mov $h1,8($ctx)
636 mov $h2,16($ctx) # note that is_base2_26 is zeroed
637 jmp .Ldone_avx
638
639.align 16
640.Lstore_base2_26_avx:
641 mov %rax#d,0($ctx) # store hash value base 2^26
642 mov %rdx#d,4($ctx)
643 mov $h0#d,8($ctx)
644 mov $h1#d,12($ctx)
645 mov $h2#d,16($ctx)
646.align 16
647.Ldone_avx:
648 mov 0(%rsp),%r15
1c47e883 649.cfi_restore %r15
a98c648e 650 mov 8(%rsp),%r14
1c47e883 651.cfi_restore %r14
a98c648e 652 mov 16(%rsp),%r13
1c47e883 653.cfi_restore %r13
a98c648e 654 mov 24(%rsp),%r12
1c47e883 655.cfi_restore %r12
a98c648e 656 mov 32(%rsp),%rbp
1c47e883 657.cfi_restore %rbp
a98c648e 658 mov 40(%rsp),%rbx
1c47e883 659.cfi_restore %rbx
a98c648e 660 lea 48(%rsp),%rsp
1c47e883 661.cfi_adjust_cfa_offset -48
662.Lno_data_avx:
663.Lblocks_avx_epilogue:
664 ret
1c47e883 665.cfi_endproc
666
667.align 32
668.Lbase2_64_avx:
1c47e883 669.cfi_startproc
a98c648e 670 push %rbx
1c47e883 671.cfi_push %rbx
a98c648e 672 push %rbp
1c47e883 673.cfi_push %rbp
a98c648e 674 push %r12
1c47e883 675.cfi_push %r12
a98c648e 676 push %r13
1c47e883 677.cfi_push %r13
a98c648e 678 push %r14
1c47e883 679.cfi_push %r14
a98c648e 680 push %r15
1c47e883 681.cfi_push %r15
682.Lbase2_64_avx_body:
683
684 mov $len,%r15 # reassign $len
685
686 mov 24($ctx),$r0 # load r
687 mov 32($ctx),$s1
688
689 mov 0($ctx),$h0 # load hash value
690 mov 8($ctx),$h1
691 mov 16($ctx),$h2#d
692
693 mov $s1,$r1
694 mov $s1,%rax
695 shr \$2,$s1
696 add $r1,$s1 # s1 = r1 + (r1 >> 2)
697
698 test \$31,$len
699 jz .Linit_avx
700
701 add 0($inp),$h0 # accumulate input
702 adc 8($inp),$h1
703 lea 16($inp),$inp
704 adc $padbit,$h2
705 sub \$16,%r15
706
707 call __poly1305_block
708
709.Linit_avx:
710 ################################# base 2^64 -> base 2^26
711 mov $h0,%rax
712 mov $h0,%rdx
713 shr \$52,$h0
714 mov $h1,$d1
715 mov $h1,$d2
716 shr \$26,%rdx
717 and \$0x3ffffff,%rax # h[0]
718 shl \$12,$d1
719 and \$0x3ffffff,%rdx # h[1]
720 shr \$14,$h1
721 or $d1,$h0
722 shl \$24,$h2
723 and \$0x3ffffff,$h0 # h[2]
724 shr \$40,$d2
725 and \$0x3ffffff,$h1 # h[3]
726 or $d2,$h2 # h[4]
727
728 vmovd %rax#d,$H0
729 vmovd %rdx#d,$H1
730 vmovd $h0#d,$H2
731 vmovd $h1#d,$H3
732 vmovd $h2#d,$H4
733 movl \$1,20($ctx) # set is_base2_26
734
735 call __poly1305_init_avx
736
737.Lproceed_avx:
738 mov %r15,$len
739
740 mov 0(%rsp),%r15
1c47e883 741.cfi_restore %r15
a98c648e 742 mov 8(%rsp),%r14
1c47e883 743.cfi_restore %r14
a98c648e 744 mov 16(%rsp),%r13
1c47e883 745.cfi_restore %r13
a98c648e 746 mov 24(%rsp),%r12
1c47e883 747.cfi_restore %r12
a98c648e 748 mov 32(%rsp),%rbp
1c47e883 749.cfi_restore %rbp
a98c648e 750 mov 40(%rsp),%rbx
1c47e883 751.cfi_restore %rbx
752 lea 48(%rsp),%rax
753 lea 48(%rsp),%rsp
1c47e883 754.cfi_adjust_cfa_offset -48
755.Lbase2_64_avx_epilogue:
756 jmp .Ldo_avx
1c47e883 757.cfi_endproc
758
759.align 32
760.Leven_avx:
1c47e883 761.cfi_startproc
762 vmovd 4*0($ctx),$H0 # load hash value
763 vmovd 4*1($ctx),$H1
764 vmovd 4*2($ctx),$H2
765 vmovd 4*3($ctx),$H3
766 vmovd 4*4($ctx),$H4
767
768.Ldo_avx:
769___
770$code.=<<___ if (!$win64);
771 lea -0x58(%rsp),%r11
1c47e883 772.cfi_def_cfa %r11,0x60
773 sub \$0x178,%rsp
774___
775$code.=<<___ if ($win64);
776 lea -0xf8(%rsp),%r11
777 sub \$0x218,%rsp
778 vmovdqa %xmm6,0x50(%r11)
779 vmovdqa %xmm7,0x60(%r11)
780 vmovdqa %xmm8,0x70(%r11)
781 vmovdqa %xmm9,0x80(%r11)
782 vmovdqa %xmm10,0x90(%r11)
783 vmovdqa %xmm11,0xa0(%r11)
784 vmovdqa %xmm12,0xb0(%r11)
785 vmovdqa %xmm13,0xc0(%r11)
786 vmovdqa %xmm14,0xd0(%r11)
787 vmovdqa %xmm15,0xe0(%r11)
788.Ldo_avx_body:
789___
790$code.=<<___;
791 sub \$64,$len
792 lea -32($inp),%rax
793 cmovc %rax,$inp
794
795 vmovdqu `16*3`($ctx),$D4 # preload r0^2
796 lea `16*3+64`($ctx),$ctx # size optimization
797 lea .Lconst(%rip),%rcx
798
799 ################################################################
800 # load input
801 vmovdqu 16*2($inp),$T0
802 vmovdqu 16*3($inp),$T1
803 vmovdqa 64(%rcx),$MASK # .Lmask26
804
805 vpsrldq \$6,$T0,$T2 # splat input
806 vpsrldq \$6,$T1,$T3
807 vpunpckhqdq $T1,$T0,$T4 # 4
808 vpunpcklqdq $T1,$T0,$T0 # 0:1
809 vpunpcklqdq $T3,$T2,$T3 # 2:3
810
811 vpsrlq \$40,$T4,$T4 # 4
812 vpsrlq \$26,$T0,$T1
813 vpand $MASK,$T0,$T0 # 0
814 vpsrlq \$4,$T3,$T2
815 vpand $MASK,$T1,$T1 # 1
816 vpsrlq \$30,$T3,$T3
817 vpand $MASK,$T2,$T2 # 2
818 vpand $MASK,$T3,$T3 # 3
819 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
820
821 jbe .Lskip_loop_avx
822
823 # expand and copy pre-calculated table to stack
824 vmovdqu `16*1-64`($ctx),$D1
825 vmovdqu `16*2-64`($ctx),$D2
826 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
827 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
828 vmovdqa $D3,-0x90(%r11)
829 vmovdqa $D0,0x00(%rsp)
830 vpshufd \$0xEE,$D1,$D4
831 vmovdqu `16*3-64`($ctx),$D0
832 vpshufd \$0x44,$D1,$D1
833 vmovdqa $D4,-0x80(%r11)
834 vmovdqa $D1,0x10(%rsp)
835 vpshufd \$0xEE,$D2,$D3
836 vmovdqu `16*4-64`($ctx),$D1
837 vpshufd \$0x44,$D2,$D2
838 vmovdqa $D3,-0x70(%r11)
839 vmovdqa $D2,0x20(%rsp)
840 vpshufd \$0xEE,$D0,$D4
841 vmovdqu `16*5-64`($ctx),$D2
842 vpshufd \$0x44,$D0,$D0
843 vmovdqa $D4,-0x60(%r11)
844 vmovdqa $D0,0x30(%rsp)
845 vpshufd \$0xEE,$D1,$D3
846 vmovdqu `16*6-64`($ctx),$D0
847 vpshufd \$0x44,$D1,$D1
848 vmovdqa $D3,-0x50(%r11)
849 vmovdqa $D1,0x40(%rsp)
850 vpshufd \$0xEE,$D2,$D4
851 vmovdqu `16*7-64`($ctx),$D1
852 vpshufd \$0x44,$D2,$D2
853 vmovdqa $D4,-0x40(%r11)
854 vmovdqa $D2,0x50(%rsp)
855 vpshufd \$0xEE,$D0,$D3
856 vmovdqu `16*8-64`($ctx),$D2
857 vpshufd \$0x44,$D0,$D0
858 vmovdqa $D3,-0x30(%r11)
859 vmovdqa $D0,0x60(%rsp)
860 vpshufd \$0xEE,$D1,$D4
861 vpshufd \$0x44,$D1,$D1
862 vmovdqa $D4,-0x20(%r11)
863 vmovdqa $D1,0x70(%rsp)
864 vpshufd \$0xEE,$D2,$D3
865 vmovdqa 0x00(%rsp),$D4 # preload r0^2
866 vpshufd \$0x44,$D2,$D2
867 vmovdqa $D3,-0x10(%r11)
868 vmovdqa $D2,0x80(%rsp)
869
870 jmp .Loop_avx
871
872.align 32
873.Loop_avx:
874 ################################################################
875 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
876 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
877 # \___________________/
878 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
879 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
880 # \___________________/ \____________________/
881 #
882	# Note that we start with inp[2:3]*r^2. This is because it
883	# doesn't depend on the reduction in the previous iteration.
884 ################################################################
885 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
886 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
887 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
888 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
889 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
890 #
891 # though note that $Tx and $Hx are "reversed" in this section,
892 # and $D4 is preloaded with r0^2...
893
894 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
895 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
896 vmovdqa $H2,0x20(%r11) # offload hash
897 vpmuludq $T2,$D4,$D2 # d3 = h2*r0
898 vmovdqa 0x10(%rsp),$H2 # r1^2
899 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
900 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
901
902 vmovdqa $H0,0x00(%r11) #
903 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
904 vmovdqa $H1,0x10(%r11) #
905 vpmuludq $T3,$H2,$H1 # h3*r1
906 vpaddq $H0,$D0,$D0 # d0 += h4*s1
907 vpaddq $H1,$D4,$D4 # d4 += h3*r1
908 vmovdqa $H3,0x30(%r11) #
909 vpmuludq $T2,$H2,$H0 # h2*r1
910 vpmuludq $T1,$H2,$H1 # h1*r1
911 vpaddq $H0,$D3,$D3 # d3 += h2*r1
912 vmovdqa 0x30(%rsp),$H3 # r2^2
913 vpaddq $H1,$D2,$D2 # d2 += h1*r1
914 vmovdqa $H4,0x40(%r11) #
915 vpmuludq $T0,$H2,$H2 # h0*r1
916 vpmuludq $T2,$H3,$H0 # h2*r2
917 vpaddq $H2,$D1,$D1 # d1 += h0*r1
918
919 vmovdqa 0x40(%rsp),$H4 # s2^2
920 vpaddq $H0,$D4,$D4 # d4 += h2*r2
921 vpmuludq $T1,$H3,$H1 # h1*r2
922 vpmuludq $T0,$H3,$H3 # h0*r2
923 vpaddq $H1,$D3,$D3 # d3 += h1*r2
924 vmovdqa 0x50(%rsp),$H2 # r3^2
925 vpaddq $H3,$D2,$D2 # d2 += h0*r2
926 vpmuludq $T4,$H4,$H0 # h4*s2
927 vpmuludq $T3,$H4,$H4 # h3*s2
928 vpaddq $H0,$D1,$D1 # d1 += h4*s2
929 vmovdqa 0x60(%rsp),$H3 # s3^2
930 vpaddq $H4,$D0,$D0 # d0 += h3*s2
931
932 vmovdqa 0x80(%rsp),$H4 # s4^2
933 vpmuludq $T1,$H2,$H1 # h1*r3
934 vpmuludq $T0,$H2,$H2 # h0*r3
935 vpaddq $H1,$D4,$D4 # d4 += h1*r3
936 vpaddq $H2,$D3,$D3 # d3 += h0*r3
937 vpmuludq $T4,$H3,$H0 # h4*s3
938 vpmuludq $T3,$H3,$H1 # h3*s3
939 vpaddq $H0,$D2,$D2 # d2 += h4*s3
940 vmovdqu 16*0($inp),$H0 # load input
941 vpaddq $H1,$D1,$D1 # d1 += h3*s3
942 vpmuludq $T2,$H3,$H3 # h2*s3
943 vpmuludq $T2,$H4,$T2 # h2*s4
944 vpaddq $H3,$D0,$D0 # d0 += h2*s3
945
946 vmovdqu 16*1($inp),$H1 #
947 vpaddq $T2,$D1,$D1 # d1 += h2*s4
948 vpmuludq $T3,$H4,$T3 # h3*s4
949 vpmuludq $T4,$H4,$T4 # h4*s4
950 vpsrldq \$6,$H0,$H2 # splat input
951 vpaddq $T3,$D2,$D2 # d2 += h3*s4
952 vpaddq $T4,$D3,$D3 # d3 += h4*s4
953 vpsrldq \$6,$H1,$H3 #
954 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
955 vpmuludq $T1,$H4,$T0 # h1*s4
956 vpunpckhqdq $H1,$H0,$H4 # 4
957 vpaddq $T4,$D4,$D4 # d4 += h0*r4
958 vmovdqa -0x90(%r11),$T4 # r0^4
959 vpaddq $T0,$D0,$D0 # d0 += h1*s4
960
961 vpunpcklqdq $H1,$H0,$H0 # 0:1
962 vpunpcklqdq $H3,$H2,$H3 # 2:3
963
964 #vpsrlq \$40,$H4,$H4 # 4
965 vpsrldq \$`40/8`,$H4,$H4 # 4
966 vpsrlq \$26,$H0,$H1
967 vpand $MASK,$H0,$H0 # 0
968 vpsrlq \$4,$H3,$H2
969 vpand $MASK,$H1,$H1 # 1
970 vpand 0(%rcx),$H4,$H4 # .Lmask24
971 vpsrlq \$30,$H3,$H3
972 vpand $MASK,$H2,$H2 # 2
973 vpand $MASK,$H3,$H3 # 3
974 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
975
976 vpaddq 0x00(%r11),$H0,$H0 # add hash value
977 vpaddq 0x10(%r11),$H1,$H1
978 vpaddq 0x20(%r11),$H2,$H2
979 vpaddq 0x30(%r11),$H3,$H3
980 vpaddq 0x40(%r11),$H4,$H4
981
982 lea 16*2($inp),%rax
983 lea 16*4($inp),$inp
984 sub \$64,$len
985 cmovc %rax,$inp
986
987 ################################################################
988 # Now we accumulate (inp[0:1]+hash)*r^4
989 ################################################################
990 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
991 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
992 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
993 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
994 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
995
996 vpmuludq $H0,$T4,$T0 # h0*r0
997 vpmuludq $H1,$T4,$T1 # h1*r0
998 vpaddq $T0,$D0,$D0
999 vpaddq $T1,$D1,$D1
1000 vmovdqa -0x80(%r11),$T2 # r1^4
1001 vpmuludq $H2,$T4,$T0 # h2*r0
1002 vpmuludq $H3,$T4,$T1 # h3*r0
1003 vpaddq $T0,$D2,$D2
1004 vpaddq $T1,$D3,$D3
1005 vpmuludq $H4,$T4,$T4 # h4*r0
1006 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1007 vpaddq $T4,$D4,$D4
1008
1009 vpaddq $T0,$D0,$D0 # d0 += h4*s1
1010 vpmuludq $H2,$T2,$T1 # h2*r1
1011 vpmuludq $H3,$T2,$T0 # h3*r1
1012 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1013 vmovdqa -0x60(%r11),$T3 # r2^4
1014 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1015 vpmuludq $H1,$T2,$T1 # h1*r1
1016 vpmuludq $H0,$T2,$T2 # h0*r1
1017 vpaddq $T1,$D2,$D2 # d2 += h1*r1
1018 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1019
1020 vmovdqa -0x50(%r11),$T4 # s2^4
1021 vpmuludq $H2,$T3,$T0 # h2*r2
1022 vpmuludq $H1,$T3,$T1 # h1*r2
1023 vpaddq $T0,$D4,$D4 # d4 += h2*r2
1024 vpaddq $T1,$D3,$D3 # d3 += h1*r2
1025 vmovdqa -0x40(%r11),$T2 # r3^4
1026 vpmuludq $H0,$T3,$T3 # h0*r2
1027 vpmuludq $H4,$T4,$T0 # h4*s2
1028 vpaddq $T3,$D2,$D2 # d2 += h0*r2
1029 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1030 vmovdqa -0x30(%r11),$T3 # s3^4
1031 vpmuludq $H3,$T4,$T4 # h3*s2
1032 vpmuludq $H1,$T2,$T1 # h1*r3
1033 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1034
1035 vmovdqa -0x10(%r11),$T4 # s4^4
1036 vpaddq $T1,$D4,$D4 # d4 += h1*r3
1037 vpmuludq $H0,$T2,$T2 # h0*r3
1038 vpmuludq $H4,$T3,$T0 # h4*s3
1039 vpaddq $T2,$D3,$D3 # d3 += h0*r3
1040 vpaddq $T0,$D2,$D2 # d2 += h4*s3
1041 vmovdqu 16*2($inp),$T0 # load input
1042 vpmuludq $H3,$T3,$T2 # h3*s3
1043 vpmuludq $H2,$T3,$T3 # h2*s3
1044 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1045 vmovdqu 16*3($inp),$T1 #
1046 vpaddq $T3,$D0,$D0 # d0 += h2*s3
1047
1048 vpmuludq $H2,$T4,$H2 # h2*s4
1049 vpmuludq $H3,$T4,$H3 # h3*s4
1050 vpsrldq \$6,$T0,$T2 # splat input
1051 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1052 vpmuludq $H4,$T4,$H4 # h4*s4
1053 vpsrldq \$6,$T1,$T3 #
1054 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1055 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1056 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1057 vpmuludq $H1,$T4,$H0
1058 vpunpckhqdq $T1,$T0,$T4 # 4
1059 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1060 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1061
1062 vpunpcklqdq $T1,$T0,$T0 # 0:1
1063 vpunpcklqdq $T3,$T2,$T3 # 2:3
1064
1065 #vpsrlq \$40,$T4,$T4 # 4
1066 vpsrldq \$`40/8`,$T4,$T4 # 4
1067 vpsrlq \$26,$T0,$T1
1068 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1069 vpand $MASK,$T0,$T0 # 0
1070 vpsrlq \$4,$T3,$T2
1071 vpand $MASK,$T1,$T1 # 1
1072 vpand 0(%rcx),$T4,$T4 # .Lmask24
1073 vpsrlq \$30,$T3,$T3
1074 vpand $MASK,$T2,$T2 # 2
1075 vpand $MASK,$T3,$T3 # 3
1076 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1077
1078 ################################################################
1079 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1080 # and P. Schwabe
1081
1082 vpsrlq \$26,$H3,$D3
1083 vpand $MASK,$H3,$H3
1084 vpaddq $D3,$H4,$H4 # h3 -> h4
1085
1086 vpsrlq \$26,$H0,$D0
1087 vpand $MASK,$H0,$H0
1088 vpaddq $D0,$D1,$H1 # h0 -> h1
1089
1090 vpsrlq \$26,$H4,$D0
1091 vpand $MASK,$H4,$H4
1092
1093 vpsrlq \$26,$H1,$D1
1094 vpand $MASK,$H1,$H1
1095 vpaddq $D1,$H2,$H2 # h1 -> h2
1096
1097 vpaddq $D0,$H0,$H0
1098 vpsllq \$2,$D0,$D0
1099 vpaddq $D0,$H0,$H0 # h4 -> h0
1100
1101 vpsrlq \$26,$H2,$D2
1102 vpand $MASK,$H2,$H2
1103 vpaddq $D2,$H3,$H3 # h2 -> h3
1104
1105 vpsrlq \$26,$H0,$D0
1106 vpand $MASK,$H0,$H0
1107 vpaddq $D0,$H1,$H1 # h0 -> h1
1108
1109 vpsrlq \$26,$H3,$D3
1110 vpand $MASK,$H3,$H3
1111 vpaddq $D3,$H4,$H4 # h3 -> h4
1112
1113 ja .Loop_avx
1114
1115.Lskip_loop_avx:
1116 ################################################################
1117 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1118
1119 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1120 add \$32,$len
1121 jnz .Long_tail_avx
1122
1123 vpaddq $H2,$T2,$T2
1124 vpaddq $H0,$T0,$T0
1125 vpaddq $H1,$T1,$T1
1126 vpaddq $H3,$T3,$T3
1127 vpaddq $H4,$T4,$T4
1128
1129.Long_tail_avx:
1130 vmovdqa $H2,0x20(%r11)
1131 vmovdqa $H0,0x00(%r11)
1132 vmovdqa $H1,0x10(%r11)
1133 vmovdqa $H3,0x30(%r11)
1134 vmovdqa $H4,0x40(%r11)
1135
1136 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1137 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1138 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1139 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1140 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1141
1142 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1143 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1144 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1145 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1146 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1147 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1148
1149 vpmuludq $T3,$H2,$H0 # h3*r1
1150 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1151 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1152 vpmuludq $T2,$H2,$H1 # h2*r1
1153 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1154 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1155 vpmuludq $T1,$H2,$H0 # h1*r1
1156 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1157 vpmuludq $T0,$H2,$H2 # h0*r1
1158 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1159 vpmuludq $T4,$H3,$H3 # h4*s1
1160 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1161
1162 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1163 vpmuludq $T2,$H4,$H1 # h2*r2
1164 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1165 vpmuludq $T1,$H4,$H0 # h1*r2
1166 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1167 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1168 vpmuludq $T0,$H4,$H4 # h0*r2
1169 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1170 vpmuludq $T4,$H2,$H1 # h4*s2
1171 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1172 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1173 vpmuludq $T3,$H2,$H2 # h3*s2
1174 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1175
1176 vpmuludq $T1,$H3,$H0 # h1*r3
1177 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1178 vpmuludq $T0,$H3,$H3 # h0*r3
1179 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1180 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1181 vpmuludq $T4,$H4,$H1 # h4*s3
1182 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1183 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1184 vpmuludq $T3,$H4,$H0 # h3*s3
1185 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1186 vpmuludq $T2,$H4,$H4 # h2*s3
1187 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1188
1189 vpmuludq $T0,$H2,$H2 # h0*r4
1190 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1191 vpmuludq $T4,$H3,$H1 # h4*s4
1192 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1193 vpmuludq $T3,$H3,$H0 # h3*s4
1194 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1195 vpmuludq $T2,$H3,$H1 # h2*s4
1196 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1197 vpmuludq $T1,$H3,$H3 # h1*s4
1198 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1199
1200 jz .Lshort_tail_avx
1201
1202 vmovdqu 16*0($inp),$H0 # load input
1203 vmovdqu 16*1($inp),$H1
1204
1205 vpsrldq \$6,$H0,$H2 # splat input
1206 vpsrldq \$6,$H1,$H3
1207 vpunpckhqdq $H1,$H0,$H4 # 4
1208 vpunpcklqdq $H1,$H0,$H0 # 0:1
1209 vpunpcklqdq $H3,$H2,$H3 # 2:3
1210
1211 vpsrlq \$40,$H4,$H4 # 4
1212 vpsrlq \$26,$H0,$H1
1213 vpand $MASK,$H0,$H0 # 0
1214 vpsrlq \$4,$H3,$H2
1215 vpand $MASK,$H1,$H1 # 1
1216 vpsrlq \$30,$H3,$H3
1217 vpand $MASK,$H2,$H2 # 2
1218 vpand $MASK,$H3,$H3 # 3
1219 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1220
1221 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1222 vpaddq 0x00(%r11),$H0,$H0
1223 vpaddq 0x10(%r11),$H1,$H1
1224 vpaddq 0x20(%r11),$H2,$H2
1225 vpaddq 0x30(%r11),$H3,$H3
1226 vpaddq 0x40(%r11),$H4,$H4
1227
1228 ################################################################
1229 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1230
1231 vpmuludq $H0,$T4,$T0 # h0*r0
1232 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1233 vpmuludq $H1,$T4,$T1 # h1*r0
1234 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1235 vpmuludq $H2,$T4,$T0 # h2*r0
1236 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1237 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1238 vpmuludq $H3,$T4,$T1 # h3*r0
1239 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1240 vpmuludq $H4,$T4,$T4 # h4*r0
1241 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1242
1243 vpmuludq $H3,$T2,$T0 # h3*r1
1244 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1245 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1246 vpmuludq $H2,$T2,$T1 # h2*r1
1247 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1248 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1249 vpmuludq $H1,$T2,$T0 # h1*r1
1250 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1251 vpmuludq $H0,$T2,$T2 # h0*r1
1252 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1253 vpmuludq $H4,$T3,$T3 # h4*s1
1254 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1255
1256 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1257 vpmuludq $H2,$T4,$T1 # h2*r2
1258 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1259 vpmuludq $H1,$T4,$T0 # h1*r2
1260 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1261 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1262 vpmuludq $H0,$T4,$T4 # h0*r2
1263 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1264 vpmuludq $H4,$T2,$T1 # h4*s2
1265 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1266 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1267 vpmuludq $H3,$T2,$T2 # h3*s2
1268 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1269
1270 vpmuludq $H1,$T3,$T0 # h1*r3
1271 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1272 vpmuludq $H0,$T3,$T3 # h0*r3
1273 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1274 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1275 vpmuludq $H4,$T4,$T1 # h4*s3
1276 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1277 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1278 vpmuludq $H3,$T4,$T0 # h3*s3
1279 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1280 vpmuludq $H2,$T4,$T4 # h2*s3
1281 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1282
1283 vpmuludq $H0,$T2,$T2 # h0*r4
1284 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1285 vpmuludq $H4,$T3,$T1 # h4*s4
1286 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1287 vpmuludq $H3,$T3,$T0 # h3*s4
1288 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1289 vpmuludq $H2,$T3,$T1 # h2*s4
1290 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1291 vpmuludq $H1,$T3,$T3 # h1*s4
1292 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1293
1294.Lshort_tail_avx:
1295 ################################################################
1296 # horizontal addition
1297
1298 vpsrldq \$8,$D4,$T4
1299 vpsrldq \$8,$D3,$T3
1300 vpsrldq \$8,$D1,$T1
1301 vpsrldq \$8,$D0,$T0
1302 vpsrldq \$8,$D2,$T2
1303 vpaddq $T3,$D3,$D3
1304 vpaddq $T4,$D4,$D4
1305 vpaddq $T0,$D0,$D0
1306 vpaddq $T1,$D1,$D1
1307 vpaddq $T2,$D2,$D2
1308
1309 ################################################################
1310 # lazy reduction
1311
1312 vpsrlq \$26,$D3,$H3
1313 vpand $MASK,$D3,$D3
1314 vpaddq $H3,$D4,$D4 # h3 -> h4
1315
1316 vpsrlq \$26,$D0,$H0
1317 vpand $MASK,$D0,$D0
1318 vpaddq $H0,$D1,$D1 # h0 -> h1
1319
1320 vpsrlq \$26,$D4,$H4
1321 vpand $MASK,$D4,$D4
1322
1323 vpsrlq \$26,$D1,$H1
1324 vpand $MASK,$D1,$D1
1325 vpaddq $H1,$D2,$D2 # h1 -> h2
1326
1327 vpaddq $H4,$D0,$D0
1328 vpsllq \$2,$H4,$H4
1329 vpaddq $H4,$D0,$D0 # h4 -> h0
1330
1331 vpsrlq \$26,$D2,$H2
1332 vpand $MASK,$D2,$D2
1333 vpaddq $H2,$D3,$D3 # h2 -> h3
1334
1335 vpsrlq \$26,$D0,$H0
1336 vpand $MASK,$D0,$D0
1337 vpaddq $H0,$D1,$D1 # h0 -> h1
1338
1339 vpsrlq \$26,$D3,$H3
1340 vpand $MASK,$D3,$D3
1341 vpaddq $H3,$D4,$D4 # h3 -> h4
1342
1343 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1344 vmovd $D1,`4*1-48-64`($ctx)
1345 vmovd $D2,`4*2-48-64`($ctx)
1346 vmovd $D3,`4*3-48-64`($ctx)
1347 vmovd $D4,`4*4-48-64`($ctx)
1348___
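
# Scalar reference sketch (never called; the helper name is ours, and it
# assumes a 64-bit perl) of the lazy reduction chain used by the vector code
# above, as discussed in "NEON crypto" by D.J. Bernstein and P. Schwabe:
# carries between 26-bit limbs are propagated in an order that keeps the
# dependency chains short, and the carry out of the top limb re-enters the
# bottom one multiplied by 5 (since 2^130 == 5 mod 2^130-5).
sub lazy_reduce_ref {
	my @d = @_;				# five unreduced limb accumulators
	my ($m, $c) = (0x3ffffff, 0);
	$c = $d[3] >> 26; $d[3] &= $m; $d[4] += $c;		# h3 -> h4
	$c = $d[0] >> 26; $d[0] &= $m; $d[1] += $c;		# h0 -> h1
	$c = $d[4] >> 26; $d[4] &= $m; $d[0] += $c * 5;		# h4 -> h0
	$c = $d[1] >> 26; $d[1] &= $m; $d[2] += $c;		# h1 -> h2
	$c = $d[2] >> 26; $d[2] &= $m; $d[3] += $c;		# h2 -> h3
	$c = $d[0] >> 26; $d[0] &= $m; $d[1] += $c;		# h0 -> h1
	$c = $d[3] >> 26; $d[3] &= $m; $d[4] += $c;		# h3 -> h4
	return @d;
}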
1349$code.=<<___ if ($win64);
1350 vmovdqa 0x50(%r11),%xmm6
1351 vmovdqa 0x60(%r11),%xmm7
1352 vmovdqa 0x70(%r11),%xmm8
1353 vmovdqa 0x80(%r11),%xmm9
1354 vmovdqa 0x90(%r11),%xmm10
1355 vmovdqa 0xa0(%r11),%xmm11
1356 vmovdqa 0xb0(%r11),%xmm12
1357 vmovdqa 0xc0(%r11),%xmm13
1358 vmovdqa 0xd0(%r11),%xmm14
1359 vmovdqa 0xe0(%r11),%xmm15
1360 lea 0xf8(%r11),%rsp
1361.Ldo_avx_epilogue:
1362___
1363$code.=<<___ if (!$win64);
1364 lea 0x58(%r11),%rsp
1c47e883 1365.cfi_def_cfa %rsp,8
1366___
1367$code.=<<___;
1368 vzeroupper
1369 ret
1c47e883 1370.cfi_endproc
1371.size poly1305_blocks_avx,.-poly1305_blocks_avx
1372
1373.type poly1305_emit_avx,\@function,3
1374.align 32
1375poly1305_emit_avx:
1376 cmpl \$0,20($ctx) # is_base2_26?
a85dbf11 1377 je .Lemit
1378
1379 mov 0($ctx),%eax # load hash value base 2^26
1380 mov 4($ctx),%ecx
1381 mov 8($ctx),%r8d
1382 mov 12($ctx),%r11d
1383 mov 16($ctx),%r10d
1384
1385 shl \$26,%rcx # base 2^26 -> base 2^64
1386 mov %r8,%r9
1387 shl \$52,%r8
1388 add %rcx,%rax
1389 shr \$12,%r9
1390 add %rax,%r8 # h0
1391 adc \$0,%r9
1392
1393 shl \$14,%r11
1394 mov %r10,%rax
1395 shr \$24,%r10
1396 add %r11,%r9
1397 shl \$40,%rax
1398 add %rax,%r9 # h1
1399 adc \$0,%r10 # h2
1400
1401 mov %r10,%rax # could be partially reduced, so reduce
1402 mov %r10,%rcx
1403 and \$3,%r10
1404 shr \$2,%rax
1405 and \$-4,%rcx
1406 add %rcx,%rax
1407 add %rax,%r8
1408 adc \$0,%r9
4b8736a2 1409 adc \$0,%r10
1410
1411 mov %r8,%rax
1412 add \$5,%r8 # compare to modulus
1413 mov %r9,%rcx
1414 adc \$0,%r9
1415 adc \$0,%r10
46f4e1be 1416 shr \$2,%r10 # did 130-bit value overflow?
1417 cmovnz %r8,%rax
1418 cmovnz %r9,%rcx
1419
1420 add 0($nonce),%rax # accumulate nonce
1421 adc 8($nonce),%rcx
1422 mov %rax,0($mac) # write result
1423 mov %rcx,8($mac)
1424
1425 ret
1426.size poly1305_emit_avx,.-poly1305_emit_avx
1427___
1428
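# Sketch (never called; the helper name is ours) of the base 2^26 -> base 2^64
# repacking that poly1305_emit_avx performs above, assuming fully carried
# 26-bit limbs and a 64-bit perl; the real code additionally propagates
# carries with adc because the stored limbs may be only partially reduced.
sub base26_to_base64_ref {
	my @l = @_;				# five 26-bit limbs, least significant first
	my $h0 = $l[0] | $l[1]<<26 | ($l[2] & 0xfff)<<52;
	my $h1 = $l[2]>>12 | $l[3]<<14 | ($l[4] & 0xffffff)<<40;
	my $h2 = $l[4]>>24;			# top two bits of the 130-bit value
	return ($h0, $h1, $h2);
}
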
1429if ($avx>1) {
1430my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1431 map("%ymm$_",(0..15));
1432my $S4=$MASK;
1433
1434$code.=<<___;
1435.type poly1305_blocks_avx2,\@function,4
1436.align 32
1437poly1305_blocks_avx2:
1c47e883 1438.cfi_startproc
1439 mov 20($ctx),%r8d # is_base2_26
1440 cmp \$128,$len
1441 jae .Lblocks_avx2
1442 test %r8d,%r8d
a85dbf11 1443 jz .Lblocks
1444
1445.Lblocks_avx2:
1446 and \$-16,$len
1447 jz .Lno_data_avx2
1448
1449 vzeroupper
1450
1451 test %r8d,%r8d
1452 jz .Lbase2_64_avx2
1453
1454 test \$63,$len
1455 jz .Leven_avx2
1456
1457 push %rbx
1c47e883 1458.cfi_push %rbx
a98c648e 1459 push %rbp
1c47e883 1460.cfi_push %rbp
a98c648e 1461 push %r12
1c47e883 1462.cfi_push %r12
a98c648e 1463 push %r13
1c47e883 1464.cfi_push %r13
a98c648e 1465 push %r14
1c47e883 1466.cfi_push %r14
a98c648e 1467 push %r15
1c47e883 1468.cfi_push %r15
1469.Lblocks_avx2_body:
1470
1471 mov $len,%r15 # reassign $len
1472
1473 mov 0($ctx),$d1 # load hash value
1474 mov 8($ctx),$d2
1475 mov 16($ctx),$h2#d
1476
1477 mov 24($ctx),$r0 # load r
1478 mov 32($ctx),$s1
1479
1480 ################################# base 2^26 -> base 2^64
1481 mov $d1#d,$h0#d
28411657 1482 and \$`-1*(1<<31)`,$d1
1483 mov $d2,$r1 # borrow $r1
1484 mov $d2#d,$h1#d
28411657 1485 and \$`-1*(1<<31)`,$d2
1486
1487 shr \$6,$d1
1488 shl \$52,$r1
1489 add $d1,$h0
1490 shr \$12,$h1
1491 shr \$18,$d2
1492 add $r1,$h0
1493 adc $d2,$h1
1494
1495 mov $h2,$d1
1496 shl \$40,$d1
1497 shr \$24,$h2
1498 add $d1,$h1
1499 adc \$0,$h2 # can be partially reduced...
1500
1501 mov \$-4,$d2 # ... so reduce
1502 mov $h2,$d1
1503 and $h2,$d2
1504 shr \$2,$d1
1505 and \$3,$h2
1506 add $d2,$d1 # =*5
1507 add $d1,$h0
1508 adc \$0,$h1
4b8736a2 1509 adc \$0,$h2
1510
1511 mov $s1,$r1
1512 mov $s1,%rax
1513 shr \$2,$s1
1514 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1515
1516.Lbase2_26_pre_avx2:
1517 add 0($inp),$h0 # accumulate input
1518 adc 8($inp),$h1
1519 lea 16($inp),$inp
1520 adc $padbit,$h2
1521 sub \$16,%r15
1522
1523 call __poly1305_block
1524 mov $r1,%rax
1525
1526 test \$63,%r15
1527 jnz .Lbase2_26_pre_avx2
1528
1529 test $padbit,$padbit # if $padbit is zero,
1530 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1531
1532 ################################# base 2^64 -> base 2^26
1533 mov $h0,%rax
1534 mov $h0,%rdx
1535 shr \$52,$h0
1536 mov $h1,$r0
1537 mov $h1,$r1
1538 shr \$26,%rdx
1539 and \$0x3ffffff,%rax # h[0]
1540 shl \$12,$r0
1541 and \$0x3ffffff,%rdx # h[1]
1542 shr \$14,$h1
1543 or $r0,$h0
1544 shl \$24,$h2
1545 and \$0x3ffffff,$h0 # h[2]
1546 shr \$40,$r1
1547 and \$0x3ffffff,$h1 # h[3]
1548 or $r1,$h2 # h[4]
1549
1550 test %r15,%r15
1551 jz .Lstore_base2_26_avx2
1552
1553 vmovd %rax#d,%x#$H0
1554 vmovd %rdx#d,%x#$H1
1555 vmovd $h0#d,%x#$H2
1556 vmovd $h1#d,%x#$H3
1557 vmovd $h2#d,%x#$H4
1558 jmp .Lproceed_avx2
1559
1560.align 32
1561.Lstore_base2_64_avx2:
1562 mov $h0,0($ctx)
1563 mov $h1,8($ctx)
1564 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1565 jmp .Ldone_avx2
1566
1567.align 16
1568.Lstore_base2_26_avx2:
1569 mov %rax#d,0($ctx) # store hash value base 2^26
1570 mov %rdx#d,4($ctx)
1571 mov $h0#d,8($ctx)
1572 mov $h1#d,12($ctx)
1573 mov $h2#d,16($ctx)
1574.align 16
1575.Ldone_avx2:
1576 mov 0(%rsp),%r15
1c47e883 1577.cfi_restore %r15
a98c648e 1578 mov 8(%rsp),%r14
1c47e883 1579.cfi_restore %r14
a98c648e 1580 mov 16(%rsp),%r13
1c47e883 1581.cfi_restore %r13
a98c648e 1582 mov 24(%rsp),%r12
1c47e883 1583.cfi_restore %r12
a98c648e 1584 mov 32(%rsp),%rbp
1c47e883 1585.cfi_restore %rbp
a98c648e 1586 mov 40(%rsp),%rbx
1c47e883 1587.cfi_restore %rbx
a98c648e 1588 lea 48(%rsp),%rsp
1c47e883 1589.cfi_adjust_cfa_offset -48
1590.Lno_data_avx2:
1591.Lblocks_avx2_epilogue:
1592 ret
1c47e883 1593.cfi_endproc
1594
1595.align 32
1596.Lbase2_64_avx2:
1c47e883 1597.cfi_startproc
a98c648e 1598 push %rbx
1c47e883 1599.cfi_push %rbx
a98c648e 1600 push %rbp
1c47e883 1601.cfi_push %rbp
a98c648e 1602 push %r12
1c47e883 1603.cfi_push %r12
a98c648e 1604 push %r13
1c47e883 1605.cfi_push %r13
a98c648e 1606 push %r14
1c47e883 1607.cfi_push %r14
a98c648e 1608 push %r15
1c47e883 1609.cfi_push %r15
1610.Lbase2_64_avx2_body:
1611
1612 mov $len,%r15 # reassign $len
1613
1614 mov 24($ctx),$r0 # load r
1615 mov 32($ctx),$s1
1616
1617 mov 0($ctx),$h0 # load hash value
1618 mov 8($ctx),$h1
1619 mov 16($ctx),$h2#d
1620
1621 mov $s1,$r1
1622 mov $s1,%rax
1623 shr \$2,$s1
1624 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1625
1626 test \$63,$len
1627 jz .Linit_avx2
1628
1629.Lbase2_64_pre_avx2:
1630 add 0($inp),$h0 # accumulate input
1631 adc 8($inp),$h1
1632 lea 16($inp),$inp
1633 adc $padbit,$h2
1634 sub \$16,%r15
1635
1636 call __poly1305_block
1637 mov $r1,%rax
1638
1639 test \$63,%r15
1640 jnz .Lbase2_64_pre_avx2
1641
1642.Linit_avx2:
1643 ################################# base 2^64 -> base 2^26
1644 mov $h0,%rax
1645 mov $h0,%rdx
1646 shr \$52,$h0
1647 mov $h1,$d1
1648 mov $h1,$d2
1649 shr \$26,%rdx
1650 and \$0x3ffffff,%rax # h[0]
1651 shl \$12,$d1
1652 and \$0x3ffffff,%rdx # h[1]
1653 shr \$14,$h1
1654 or $d1,$h0
1655 shl \$24,$h2
1656 and \$0x3ffffff,$h0 # h[2]
1657 shr \$40,$d2
1658 and \$0x3ffffff,$h1 # h[3]
1659 or $d2,$h2 # h[4]
1660
1661 vmovd %rax#d,%x#$H0
1662 vmovd %rdx#d,%x#$H1
1663 vmovd $h0#d,%x#$H2
1664 vmovd $h1#d,%x#$H3
1665 vmovd $h2#d,%x#$H4
1666 movl \$1,20($ctx) # set is_base2_26
1667
1668 call __poly1305_init_avx
1669
1670.Lproceed_avx2:
1671 mov %r15,$len # restore $len
1672 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1673 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1674
1675 mov 0(%rsp),%r15
1c47e883 1676.cfi_restore %r15
a98c648e 1677 mov 8(%rsp),%r14
1c47e883 1678.cfi_restore %r14
a98c648e 1679 mov 16(%rsp),%r13
1c47e883 1680.cfi_restore %r13
a98c648e 1681 mov 24(%rsp),%r12
1c47e883 1682.cfi_restore %r12
a98c648e 1683 mov 32(%rsp),%rbp
1c47e883 1684.cfi_restore %rbp
a98c648e 1685 mov 40(%rsp),%rbx
1c47e883 1686.cfi_restore %rbx
1687 lea 48(%rsp),%rax
1688 lea 48(%rsp),%rsp
1c47e883 1689.cfi_adjust_cfa_offset -48
1690.Lbase2_64_avx2_epilogue:
1691 jmp .Ldo_avx2
1c47e883 1692.cfi_endproc
1693
1694.align 32
1695.Leven_avx2:
1c47e883 1696.cfi_startproc
abb8c44f 1697 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1698 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1699 vmovd 4*1($ctx),%x#$H1
1700 vmovd 4*2($ctx),%x#$H2
1701 vmovd 4*3($ctx),%x#$H3
1702 vmovd 4*4($ctx),%x#$H4
1703
1704.Ldo_avx2:
1705___
1706$code.=<<___ if ($avx>2);
1707 cmp \$512,$len
1708 jb .Lskip_avx512
1709 and %r11d,%r10d
1710 test \$`1<<16`,%r10d # check for AVX512F
1711 jnz .Lblocks_avx512
1712.Lskip_avx512:
1713___
1714$code.=<<___ if (!$win64);
1715 lea -8(%rsp),%r11
1c47e883 1716.cfi_def_cfa %r11,16
1717 sub \$0x128,%rsp
1718___
1719$code.=<<___ if ($win64);
1720 lea -0xf8(%rsp),%r11
1721 sub \$0x1c8,%rsp
1722 vmovdqa %xmm6,0x50(%r11)
1723 vmovdqa %xmm7,0x60(%r11)
1724 vmovdqa %xmm8,0x70(%r11)
1725 vmovdqa %xmm9,0x80(%r11)
1726 vmovdqa %xmm10,0x90(%r11)
1727 vmovdqa %xmm11,0xa0(%r11)
1728 vmovdqa %xmm12,0xb0(%r11)
1729 vmovdqa %xmm13,0xc0(%r11)
1730 vmovdqa %xmm14,0xd0(%r11)
1731 vmovdqa %xmm15,0xe0(%r11)
1732.Ldo_avx2_body:
1733___
1734$code.=<<___;
a98c648e 1735 lea .Lconst(%rip),%rcx
1736 lea 48+64($ctx),$ctx # size optimization
1737 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1738
1739 # expand and copy pre-calculated table to stack
1740 vmovdqu `16*0-64`($ctx),%x#$T2
1741 and \$-512,%rsp
1742 vmovdqu `16*1-64`($ctx),%x#$T3
1743 vmovdqu `16*2-64`($ctx),%x#$T4
1744 vmovdqu `16*3-64`($ctx),%x#$D0
1745 vmovdqu `16*4-64`($ctx),%x#$D1
1746 vmovdqu `16*5-64`($ctx),%x#$D2
73e8a5c8 1747 lea 0x90(%rsp),%rax # size optimization
a98c648e 1748 vmovdqu `16*6-64`($ctx),%x#$D3
73e8a5c8 1749 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
a98c648e 1750 vmovdqu `16*7-64`($ctx),%x#$D4
73e8a5c8 1751 vpermd $T3,$T0,$T3
a98c648e 1752 vmovdqu `16*8-64`($ctx),%x#$MASK
73e8a5c8 1753 vpermd $T4,$T0,$T4
a98c648e 1754 vmovdqa $T2,0x00(%rsp)
1755 vpermd $D0,$T0,$D0
1756 vmovdqa $T3,0x20-0x90(%rax)
1757 vpermd $D1,$T0,$D1
1758 vmovdqa $T4,0x40-0x90(%rax)
1759 vpermd $D2,$T0,$D2
1760 vmovdqa $D0,0x60-0x90(%rax)
1761 vpermd $D3,$T0,$D3
1762 vmovdqa $D1,0x80-0x90(%rax)
1763 vpermd $D4,$T0,$D4
1764 vmovdqa $D2,0xa0-0x90(%rax)
1765 vpermd $MASK,$T0,$MASK
1766 vmovdqa $D3,0xc0-0x90(%rax)
1767 vmovdqa $D4,0xe0-0x90(%rax)
1768 vmovdqa $MASK,0x100-0x90(%rax)
1769 vmovdqa 64(%rcx),$MASK # .Lmask26
1770
1771 ################################################################
1772 # load input
1773 vmovdqu 16*0($inp),%x#$T0
1774 vmovdqu 16*1($inp),%x#$T1
1775 vinserti128 \$1,16*2($inp),$T0,$T0
1776 vinserti128 \$1,16*3($inp),$T1,$T1
1777 lea 16*4($inp),$inp
1778
1779 vpsrldq \$6,$T0,$T2 # splat input
1780 vpsrldq \$6,$T1,$T3
1781 vpunpckhqdq $T1,$T0,$T4 # 4
1782 vpunpcklqdq $T3,$T2,$T2 # 2:3
1783 vpunpcklqdq $T1,$T0,$T0 # 0:1
1784
1785 vpsrlq \$30,$T2,$T3
1786 vpsrlq \$4,$T2,$T2
1787 vpsrlq \$26,$T0,$T1
1788 vpsrlq \$40,$T4,$T4 # 4
1789 vpand $MASK,$T2,$T2 # 2
1790 vpand $MASK,$T0,$T0 # 0
1791 vpand $MASK,$T1,$T1 # 1
1792 vpand $MASK,$T3,$T3 # 3
1793 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1794
1795 vpaddq $H2,$T2,$H2 # accumulate input
1796 sub \$64,$len
1797 jz .Ltail_avx2
1798 jmp .Loop_avx2
1799
1800.align 32
1801.Loop_avx2:
1802 ################################################################
1803 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1804 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1805 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1806 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1807 # \________/\__________/
1808 ################################################################
1809 #vpaddq $H2,$T2,$H2 # accumulate input
1810 vpaddq $H0,$T0,$H0
1811 vmovdqa `32*0`(%rsp),$T0 # r0^4
1812 vpaddq $H1,$T1,$H1
1813 vmovdqa `32*1`(%rsp),$T1 # r1^4
1814 vpaddq $H3,$T3,$H3
1815 vmovdqa `32*3`(%rsp),$T2 # r2^4
1816 vpaddq $H4,$T4,$H4
1817 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1818 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1819
1820 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1821 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1822 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1823 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1824 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1825 #
1826	# however, as h2 is "chronologically" the first one available, pull the
1827	# corresponding operations up, so it's
1828 #
1829 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1830 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1831 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1832 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1833 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1834
1835 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1836 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1837 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1838 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1839 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1840
1841 vpmuludq $H0,$T1,$T4 # h0*r1
1842 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1843 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1844 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1845 vpmuludq $H3,$T1,$T4 # h3*r1
1846 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1847 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1848 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1849 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1850
1851 vpmuludq $H0,$T0,$T4 # h0*r0
1852 vpmuludq $H1,$T0,$H2 # h1*r0
1853 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1854 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1855 vpmuludq $H3,$T0,$T4 # h3*r0
1856 vpmuludq $H4,$T0,$H2 # h4*r0
1857 vmovdqu 16*0($inp),%x#$T0 # load input
1858 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1859 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1860 vinserti128 \$1,16*2($inp),$T0,$T0
1861
1862 vpmuludq $H3,$T1,$T4 # h3*s2
1863 vpmuludq $H4,$T1,$H2 # h4*s2
1864 vmovdqu 16*1($inp),%x#$T1
1865 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1866 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1867 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1868 vpmuludq $H1,$T2,$T4 # h1*r2
1869 vpmuludq $H0,$T2,$T2 # h0*r2
1870 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1871 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1872 vinserti128 \$1,16*3($inp),$T1,$T1
1873 lea 16*4($inp),$inp
1874
1875 vpmuludq $H1,$H2,$T4 # h1*r3
1876 vpmuludq $H0,$H2,$H2 # h0*r3
1877 vpsrldq \$6,$T0,$T2 # splat input
1878 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1879 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1880 vpmuludq $H3,$T3,$T4 # h3*s3
1881 vpmuludq $H4,$T3,$H2 # h4*s3
1882 vpsrldq \$6,$T1,$T3
1883 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1884 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1885 vpunpckhqdq $T1,$T0,$T4 # 4
1886
1887 vpmuludq $H3,$S4,$H3 # h3*s4
1888 vpmuludq $H4,$S4,$H4 # h4*s4
1889 vpunpcklqdq $T1,$T0,$T0 # 0:1
1890 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
1891 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
1892 vpunpcklqdq $T3,$T2,$T3 # 2:3
1893 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1894 vpmuludq $H1,$S4,$H0 # h1*s4
1895 vmovdqa 64(%rcx),$MASK # .Lmask26
1896 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1897 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1898
1899 ################################################################
1900 # lazy reduction (interleaved with tail of input splat)
1901
1902 vpsrlq \$26,$H3,$D3
1903 vpand $MASK,$H3,$H3
1904 vpaddq $D3,$H4,$H4 # h3 -> h4
1905
1906 vpsrlq \$26,$H0,$D0
1907 vpand $MASK,$H0,$H0
1908 vpaddq $D0,$D1,$H1 # h0 -> h1
1909
1910 vpsrlq \$26,$H4,$D4
1911 vpand $MASK,$H4,$H4
1912
1913 vpsrlq \$4,$T3,$T2
1914
1915 vpsrlq \$26,$H1,$D1
1916 vpand $MASK,$H1,$H1
1917 vpaddq $D1,$H2,$H2 # h1 -> h2
1918
1919 vpaddq $D4,$H0,$H0
1920 vpsllq \$2,$D4,$D4
1921 vpaddq $D4,$H0,$H0 # h4 -> h0
1922
1923 vpand $MASK,$T2,$T2 # 2
1924 vpsrlq \$26,$T0,$T1
1925
1926 vpsrlq \$26,$H2,$D2
1927 vpand $MASK,$H2,$H2
1928 vpaddq $D2,$H3,$H3 # h2 -> h3
1929
1930 vpaddq $T2,$H2,$H2 # modulo-scheduled
1931 vpsrlq \$30,$T3,$T3
1932
1933 vpsrlq \$26,$H0,$D0
1934 vpand $MASK,$H0,$H0
1935 vpaddq $D0,$H1,$H1 # h0 -> h1
1936
1937 vpsrlq \$40,$T4,$T4 # 4
1938
1939 vpsrlq \$26,$H3,$D3
1940 vpand $MASK,$H3,$H3
1941 vpaddq $D3,$H4,$H4 # h3 -> h4
1942
1943 vpand $MASK,$T0,$T0 # 0
1944 vpand $MASK,$T1,$T1 # 1
1945 vpand $MASK,$T3,$T3 # 3
1946 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1947
1948 sub \$64,$len
1949 jnz .Loop_avx2
1950
1951 .byte 0x66,0x90
1952.Ltail_avx2:
1953 ################################################################
1954	# while the multiplications above were by r^4 in all lanes, in the last
1955	# iteration we multiply the least significant lane by r^4 and the most
1956	# significant one by r, so this is a copy of the above except that the
1957	# references to the precomputed table are displaced by 4...
1958
1959 #vpaddq $H2,$T2,$H2 # accumulate input
1960 vpaddq $H0,$T0,$H0
1961 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1962 vpaddq $H1,$T1,$H1
1963 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1964 vpaddq $H3,$T3,$H3
1965 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1966 vpaddq $H4,$T4,$H4
1967 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1968 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1969
1970 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1971 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1972 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1973 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1974 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1975
1976 vpmuludq $H0,$T1,$T4 # h0*r1
1977 vpmuludq $H1,$T1,$H2 # h1*r1
1978 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1979 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1980 vpmuludq $H3,$T1,$T4 # h3*r1
1981 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
1982 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1983 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1984
1985 vpmuludq $H0,$T0,$T4 # h0*r0
1986 vpmuludq $H1,$T0,$H2 # h1*r0
1987 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1988 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
1989 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1990 vpmuludq $H3,$T0,$T4 # h3*r0
1991 vpmuludq $H4,$T0,$H2 # h4*r0
1992 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1993 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1994
1995 vpmuludq $H3,$T1,$T4 # h3*s2
1996 vpmuludq $H4,$T1,$H2 # h4*s2
1997 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1998 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1999 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2000 vpmuludq $H1,$T2,$T4 # h1*r2
2001 vpmuludq $H0,$T2,$T2 # h0*r2
2002 vpaddq $T4,$D3,$D3 # d3 += h1*r2
2003 vpaddq $T2,$D2,$D2 # d2 += h0*r2
2004
2005 vpmuludq $H1,$H2,$T4 # h1*r3
2006 vpmuludq $H0,$H2,$H2 # h0*r3
2007 vpaddq $T4,$D4,$D4 # d4 += h1*r3
2008 vpaddq $H2,$D3,$D3 # d3 += h0*r3
2009 vpmuludq $H3,$T3,$T4 # h3*s3
2010 vpmuludq $H4,$T3,$H2 # h4*s3
2011 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2012 vpaddq $H2,$D2,$D2 # d2 += h4*s3
2013
2014 vpmuludq $H3,$S4,$H3 # h3*s4
2015 vpmuludq $H4,$S4,$H4 # h4*s4
2016 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
2017 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
2018 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2019 vpmuludq $H1,$S4,$H0 # h1*s4
2020 vmovdqa 64(%rcx),$MASK # .Lmask26
2021 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2022 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2023
2024 ################################################################
2025 # horizontal addition
2026
2027 vpsrldq \$8,$D1,$T1
2028 vpsrldq \$8,$H2,$T2
2029 vpsrldq \$8,$H3,$T3
2030 vpsrldq \$8,$H4,$T4
2031 vpsrldq \$8,$H0,$T0
2032 vpaddq $T1,$D1,$D1
2033 vpaddq $T2,$H2,$H2
2034 vpaddq $T3,$H3,$H3
2035 vpaddq $T4,$H4,$H4
2036 vpaddq $T0,$H0,$H0
2037
2038 vpermq \$0x2,$H3,$T3
2039 vpermq \$0x2,$H4,$T4
2040 vpermq \$0x2,$H0,$T0
2041 vpermq \$0x2,$D1,$T1
2042 vpermq \$0x2,$H2,$T2
2043 vpaddq $T3,$H3,$H3
2044 vpaddq $T4,$H4,$H4
2045 vpaddq $T0,$H0,$H0
2046 vpaddq $T1,$D1,$D1
2047 vpaddq $T2,$H2,$H2
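	# (the two rounds above fold all four 64-bit lanes of each register
	#  into lane 0: vpsrldq adds the high qword of each 128-bit half to
	#  the low one, and the vpermq step then adds the upper 128-bit half
	#  to the lower)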
2048
2049 ################################################################
2050 # lazy reduction
2051
2052 vpsrlq \$26,$H3,$D3
2053 vpand $MASK,$H3,$H3
2054 vpaddq $D3,$H4,$H4 # h3 -> h4
2055
2056 vpsrlq \$26,$H0,$D0
2057 vpand $MASK,$H0,$H0
2058 vpaddq $D0,$D1,$H1 # h0 -> h1
2059
2060 vpsrlq \$26,$H4,$D4
2061 vpand $MASK,$H4,$H4
2062
2063 vpsrlq \$26,$H1,$D1
2064 vpand $MASK,$H1,$H1
2065 vpaddq $D1,$H2,$H2 # h1 -> h2
2066
2067 vpaddq $D4,$H0,$H0
2068 vpsllq \$2,$D4,$D4
2069 vpaddq $D4,$H0,$H0 # h4 -> h0
2070
2071 vpsrlq \$26,$H2,$D2
2072 vpand $MASK,$H2,$H2
2073 vpaddq $D2,$H3,$H3 # h2 -> h3
2074
2075 vpsrlq \$26,$H0,$D0
2076 vpand $MASK,$H0,$H0
2077 vpaddq $D0,$H1,$H1 # h0 -> h1
2078
2079 vpsrlq \$26,$H3,$D3
2080 vpand $MASK,$H3,$H3
2081 vpaddq $D3,$H4,$H4 # h3 -> h4
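	# (note on the "h4 -> h0" step above: a carry c out of the top limb
	#  stands for c*2^130, and 2^130 == 5 mod 2^130-5, so it is folded
	#  into h0 as c + 4*c, i.e. the add followed by the shift-by-2 add)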
2082
2083 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2084 vmovd %x#$H1,`4*1-48-64`($ctx)
2085 vmovd %x#$H2,`4*2-48-64`($ctx)
2086 vmovd %x#$H3,`4*3-48-64`($ctx)
2087 vmovd %x#$H4,`4*4-48-64`($ctx)
2088___
2089$code.=<<___ if ($win64);
2090 vmovdqa 0x50(%r11),%xmm6
2091 vmovdqa 0x60(%r11),%xmm7
2092 vmovdqa 0x70(%r11),%xmm8
2093 vmovdqa 0x80(%r11),%xmm9
2094 vmovdqa 0x90(%r11),%xmm10
2095 vmovdqa 0xa0(%r11),%xmm11
2096 vmovdqa 0xb0(%r11),%xmm12
2097 vmovdqa 0xc0(%r11),%xmm13
2098 vmovdqa 0xd0(%r11),%xmm14
2099 vmovdqa 0xe0(%r11),%xmm15
2100 lea 0xf8(%r11),%rsp
2101.Ldo_avx2_epilogue:
2102___
2103$code.=<<___ if (!$win64);
2104 lea 8(%r11),%rsp
1c47e883 2105.cfi_def_cfa %rsp,8
2106___
2107$code.=<<___;
2108 vzeroupper
2109 ret
1c47e883 2110.cfi_endproc
2111.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2112___
2113#######################################################################
2114if ($avx>2) {
2115# On entry we have input length divisible by 64. But since the inner loop
2116# processes 128 bytes per iteration, cases when the length is not divisible
2117# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2118# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2119# for this tail, we wouldn't even have to allocate a stack frame...
2120
2121my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2122my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
abb8c44f 2123my $PADBIT="%zmm30";
2124
2125map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2126map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2127map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2128map(s/%y/%z/,($MASK));
2129
2130$code.=<<___;
2131.type poly1305_blocks_avx512,\@function,4
2132.align 32
2133poly1305_blocks_avx512:
1c47e883 2134.cfi_startproc
abb8c44f 2135.Lblocks_avx512:
2136 mov \$15,%eax
2137 kmovw %eax,%k2
2138___
2139$code.=<<___ if (!$win64);
2140 lea -8(%rsp),%r11
1c47e883 2141.cfi_def_cfa %r11,16
2142 sub \$0x128,%rsp
2143___
2144$code.=<<___ if ($win64);
2145 lea -0xf8(%rsp),%r11
2146 sub \$0x1c8,%rsp
2147 vmovdqa %xmm6,0x50(%r11)
2148 vmovdqa %xmm7,0x60(%r11)
2149 vmovdqa %xmm8,0x70(%r11)
2150 vmovdqa %xmm9,0x80(%r11)
2151 vmovdqa %xmm10,0x90(%r11)
2152 vmovdqa %xmm11,0xa0(%r11)
2153 vmovdqa %xmm12,0xb0(%r11)
2154 vmovdqa %xmm13,0xc0(%r11)
2155 vmovdqa %xmm14,0xd0(%r11)
2156 vmovdqa %xmm15,0xe0(%r11)
2157.Ldo_avx512_body:
2158___
2159$code.=<<___;
abb8c44f 2160 lea .Lconst(%rip),%rcx
73e8a5c8 2161 lea 48+64($ctx),$ctx # size optimization
a8f302e5 2162 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2163
2164 # expand pre-calculated table
4dfe4310 2165 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
abb8c44f 2166 and \$-512,%rsp
4dfe4310 2167 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
a8f302e5 2168 mov \$0x20,%rax
2169 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2170 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2171 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2172 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2173 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2174 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2175 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2176 vpermd $D0,$T2,$R0 # 00003412 -> 14243444
a8f302e5 2177 vpbroadcastq 64(%rcx),$MASK # .Lmask26
2178 vpermd $D1,$T2,$R1
2179 vpermd $T0,$T2,$S1
2180 vpermd $D2,$T2,$R2
a8f302e5 2181 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
abb8c44f 2182 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
4dfe4310 2183 vpermd $T1,$T2,$S2
a8f302e5 2184 vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
abb8c44f 2185 vpsrlq \$32,$R1,$T1
4dfe4310 2186 vpermd $D3,$T2,$R3
a8f302e5 2187 vmovdqa64 $S1,0x40(%rsp){%k2}
2188 vpermd $T3,$T2,$S3
2189 vpermd $D4,$T2,$R4
a8f302e5 2190 vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
4dfe4310 2191 vpermd $T4,$T2,$S4
2192 vmovdqa64 $S2,0x80(%rsp){%k2}
2193 vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2194 vmovdqa64 $S3,0xc0(%rsp){%k2}
2195 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2196 vmovdqa64 $S4,0x100(%rsp){%k2}
2197
2198 ################################################################
2199 # calculate 5th through 8th powers of the key
2200 #
2201 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2202 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2203 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2204 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2205 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
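	# (r0'..r4' are the r^1..r^4 limbs isolated by the vpsrlq above, and
	#  the low halves of $R0..$S4 still hold the limbs of r^4, so each
	#  product lane yields a limb of r^5..r^8, the "05060708" mentioned
	#  after the reduction below)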
2206
2207 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2208 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2209 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2210 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2211 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2212 vpsrlq \$32,$R2,$T2
2213
2214 vpmuludq $T1,$S4,$M0
2215 vpmuludq $T1,$R0,$M1
2216 vpmuludq $T1,$R1,$M2
2217 vpmuludq $T1,$R2,$M3
2218 vpmuludq $T1,$R3,$M4
2219 vpsrlq \$32,$R3,$T3
2220 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2221 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2222 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2223 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2224 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2225
2226 vpmuludq $T2,$S3,$M0
2227 vpmuludq $T2,$S4,$M1
2228 vpmuludq $T2,$R1,$M3
2229 vpmuludq $T2,$R2,$M4
2230 vpmuludq $T2,$R0,$M2
2231 vpsrlq \$32,$R4,$T4
2232 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2233 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2234 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2235 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2236 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2237
2238 vpmuludq $T3,$S2,$M0
2239 vpmuludq $T3,$R0,$M3
2240 vpmuludq $T3,$R1,$M4
2241 vpmuludq $T3,$S3,$M1
2242 vpmuludq $T3,$S4,$M2
2243 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2244 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2245 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2246 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2247 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2248
2249 vpmuludq $T4,$S4,$M3
2250 vpmuludq $T4,$R0,$M4
2251 vpmuludq $T4,$S1,$M0
2252 vpmuludq $T4,$S2,$M1
2253 vpmuludq $T4,$S3,$M2
2254 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
2255 vpaddq $M4,$D4,$D4 # d4 += r2'*r0
2256 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
2257 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
2258 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
2259
2260 ################################################################
2261 # load input
2262 vmovdqu64 16*0($inp),%z#$T3
2263 vmovdqu64 16*4($inp),%z#$T4
2264 lea 16*8($inp),$inp
2265
2266 ################################################################
2267 # lazy reduction
2268
2269 vpsrlq \$26,$D3,$M3
2270 vpandq $MASK,$D3,$D3
2271 vpaddq $M3,$D4,$D4 # d3 -> d4
2272
2273 vpsrlq \$26,$D0,$M0
2274 vpandq $MASK,$D0,$D0
2275 vpaddq $M0,$D1,$D1 # d0 -> d1
2276
2277 vpsrlq \$26,$D4,$M4
2278 vpandq $MASK,$D4,$D4
2279
2280 vpsrlq \$26,$D1,$M1
2281 vpandq $MASK,$D1,$D1
2282 vpaddq $M1,$D2,$D2 # d1 -> d2
2283
2284 vpaddq $M4,$D0,$D0
2285 vpsllq \$2,$M4,$M4
2286 vpaddq $M4,$D0,$D0 # d4 -> d0
2287
2288 vpsrlq \$26,$D2,$M2
2289 vpandq $MASK,$D2,$D2
2290 vpaddq $M2,$D3,$D3 # d2 -> d3
2291
2292 vpsrlq \$26,$D0,$M0
2293 vpandq $MASK,$D0,$D0
2294 vpaddq $M0,$D1,$D1 # d0 -> d1
2295
2296 vpsrlq \$26,$D3,$M3
2297 vpandq $MASK,$D3,$D3
2298 vpaddq $M3,$D4,$D4 # d3 -> d4
2299
abb8c44f 2300 ################################################################
2301 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2302 # $D0-$D4, ...
abb8c44f 2303
2304 vpunpcklqdq $T4,$T3,$T0 # transpose input
2305 vpunpckhqdq $T4,$T3,$T4
abb8c44f 2306
2307 # ... since input 64-bit lanes are ordered as 73625140, we could
2308 # "vperm" it to 76543210 (here and in each loop iteration), *or*
2309 # we could just flow along, hence the goal for $R0-$S4 is
2310 # 1858286838784888 ...
2311
2312 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2313 mov \$0x7777,%eax
c1e1fc50 2314 kmovw %eax,%k1
2315
2316 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2317 vpermd $R1,$M0,$R1
2318 vpermd $R2,$M0,$R2
2319 vpermd $R3,$M0,$R3
2320 vpermd $R4,$M0,$R4
2321
2322 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2323 vpermd $D1,$M0,${R1}{%k1}
2324 vpermd $D2,$M0,${R2}{%k1}
2325 vpermd $D3,$M0,${R3}{%k1}
2326 vpermd $D4,$M0,${R4}{%k1}
2327
2328 vpslld \$2,$R1,$S1 # *5
2329 vpslld \$2,$R2,$S2
2330 vpslld \$2,$R3,$S3
2331 vpslld \$2,$R4,$S4
2332 vpaddd $R1,$S1,$S1
2333 vpaddd $R2,$S2,$S2
2334 vpaddd $R3,$S3,$S3
2335 vpaddd $R4,$S4,$S4
2336
c1e1fc50 2337 vpbroadcastq 32(%rcx),$PADBIT # .L129
abb8c44f 2338
2339 vpsrlq \$52,$T0,$T2 # splat input
2340 vpsllq \$12,$T4,$T3
2341 vporq $T3,$T2,$T2
abb8c44f 2342 vpsrlq \$26,$T0,$T1
c1e1fc50 2343 vpsrlq \$14,$T4,$T3
2344 vpsrlq \$40,$T4,$T4 # 4
2345 vpandq $MASK,$T2,$T2 # 2
2346 vpandq $MASK,$T0,$T0 # 0
2347 #vpandq $MASK,$T1,$T1 # 1
2348 #vpandq $MASK,$T3,$T3 # 3
2349 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2350
2351 vpaddq $H2,$T2,$H2 # accumulate input
2352 sub \$192,$len
2353 jbe .Ltail_avx512
a8f302e5 2354 jmp .Loop_avx512
abb8c44f 2355
73e8a5c8 2356.align 32
2357.Loop_avx512:
2358 ################################################################
2359 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2360 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2361 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2362 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2363 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2364 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2365 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2366 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2367 # \________/\___________/
2368 ################################################################
2369 #vpaddq $H2,$T2,$H2 # accumulate input
2370
2371 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2372 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2373 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2374 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2375 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2376 #
2377	# however, as h2 is "chronologically" the first one available, pull the
2378	# corresponding operations up, so it's
2379 #
2380 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2381 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2382 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2383 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2384 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2385
2386 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2387 vpaddq $H0,$T0,$H0
abb8c44f 2388 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
e052083c 2389 vpandq $MASK,$T1,$T1 # 1
abb8c44f 2390 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
e052083c 2391 vpandq $MASK,$T3,$T3 # 3
abb8c44f
AP
2392 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2393 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2394 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2395 vpaddq $H1,$T1,$H1 # accumulate input
2396 vpaddq $H3,$T3,$H3
2397 vpaddq $H4,$T4,$H4
2398
2399 vmovdqu64 16*0($inp),$T3 # load input
2400 vmovdqu64 16*4($inp),$T4
2401 lea 16*8($inp),$inp
2402 vpmuludq $H0,$R3,$M3
2403 vpmuludq $H0,$R4,$M4
2404 vpmuludq $H0,$R0,$M0
2405 vpmuludq $H0,$R1,$M1
2406 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2407 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2408 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2409 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2410
2411 vpmuludq $H1,$R2,$M3
2412 vpmuludq $H1,$R3,$M4
2413 vpmuludq $H1,$S4,$M0
2414 vpmuludq $H0,$R2,$M2
2415 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2416 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2417 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2418 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2419
2420 vpunpcklqdq $T4,$T3,$T0 # transpose input
2421 vpunpckhqdq $T4,$T3,$T4
2422
2423 vpmuludq $H3,$R0,$M3
2424 vpmuludq $H3,$R1,$M4
2425 vpmuludq $H1,$R0,$M1
2426 vpmuludq $H1,$R1,$M2
2427 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2428 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2429 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2430 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2431
2432 vpmuludq $H4,$S4,$M3
2433 vpmuludq $H4,$R0,$M4
2434 vpmuludq $H3,$S2,$M0
2435 vpmuludq $H3,$S3,$M1
2436 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2437 vpmuludq $H3,$S4,$M2
2438 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2439 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2440 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2441 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2442
2443 vpmuludq $H4,$S1,$M0
2444 vpmuludq $H4,$S2,$M1
2445 vpmuludq $H4,$S3,$M2
2446 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2447	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
2448	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2449
2450 ################################################################
2451 # lazy reduction (interleaved with input splat)
2452
2453 vpsrlq \$52,$T0,$T2 # splat input
2454 vpsllq \$12,$T4,$T3
2455
2456 vpsrlq \$26,$D3,$H3
2457 vpandq $MASK,$D3,$D3
2458 vpaddq $H3,$D4,$H4 # h3 -> h4
2459
2460 vporq $T3,$T2,$T2
2461
2462 vpsrlq \$26,$H0,$D0
2463 vpandq $MASK,$H0,$H0
2464 vpaddq $D0,$H1,$H1 # h0 -> h1
2465
2466 vpandq $MASK,$T2,$T2 # 2
2467
2468 vpsrlq \$26,$H4,$D4
2469 vpandq $MASK,$H4,$H4
2470
2471 vpsrlq \$26,$H1,$D1
2472 vpandq $MASK,$H1,$H1
2473 vpaddq $D1,$H2,$H2 # h1 -> h2
2474
2475 vpaddq $D4,$H0,$H0
2476 vpsllq \$2,$D4,$D4
2477 vpaddq $D4,$H0,$H0 # h4 -> h0
2478
c1e1fc50 2479 vpaddq $T2,$H2,$H2 # modulo-scheduled
2480 vpsrlq \$26,$T0,$T1
2481
2482 vpsrlq \$26,$H2,$D2
2483 vpandq $MASK,$H2,$H2
2484 vpaddq $D2,$D3,$H3 # h2 -> h3
2485
c1e1fc50 2486 vpsrlq \$14,$T4,$T3
2487
2488 vpsrlq \$26,$H0,$D0
2489 vpandq $MASK,$H0,$H0
2490 vpaddq $D0,$H1,$H1 # h0 -> h1
2491
2492 vpsrlq \$40,$T4,$T4 # 4
2493
2494 vpsrlq \$26,$H3,$D3
2495 vpandq $MASK,$H3,$H3
2496 vpaddq $D3,$H4,$H4 # h3 -> h4
2497
2498 vpandq $MASK,$T0,$T0 # 0
2499 #vpandq $MASK,$T1,$T1 # 1
2500 #vpandq $MASK,$T3,$T3 # 3
2501 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2502
2503 sub \$128,$len
2504 ja .Loop_avx512
2505
2506.Ltail_avx512:
2507 ################################################################
2508	# while the above multiplications were by r^8 in all lanes, in the last
2509	# iteration we multiply the least significant lane by r^8 and the most
2510	# significant one by r, which is why the table gets shifted...
2511
c1e1fc50 2512 vpsrlq \$32,$R0,$R0 # 0105020603070408
2513 vpsrlq \$32,$R1,$R1
2514 vpsrlq \$32,$R2,$R2
2515 vpsrlq \$32,$S3,$S3
2516 vpsrlq \$32,$S4,$S4
2517 vpsrlq \$32,$R3,$R3
2518 vpsrlq \$32,$R4,$R4
2519 vpsrlq \$32,$S1,$S1
2520 vpsrlq \$32,$S2,$S2
2521
2522 ################################################################
2523	# load either next or last 64 bytes of input
2524 lea ($inp,$len),$inp
2525
2526 #vpaddq $H2,$T2,$H2 # accumulate input
2527 vpaddq $H0,$T0,$H0
2528
2529 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2530 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2531 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
e052083c 2532 vpandq $MASK,$T1,$T1 # 1
abb8c44f 2533 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
e052083c 2534 vpandq $MASK,$T3,$T3 # 3
abb8c44f 2535 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2536 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2537 vpaddq $H1,$T1,$H1 # accumulate input
2538 vpaddq $H3,$T3,$H3
2539 vpaddq $H4,$T4,$H4
2540
a8f302e5 2541 vmovdqu 16*0($inp),%x#$T0
2542 vpmuludq $H0,$R3,$M3
2543 vpmuludq $H0,$R4,$M4
2544 vpmuludq $H0,$R0,$M0
2545 vpmuludq $H0,$R1,$M1
2546 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2547 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2548 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2549 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2550
a8f302e5 2551 vmovdqu 16*1($inp),%x#$T1
2552 vpmuludq $H1,$R2,$M3
2553 vpmuludq $H1,$R3,$M4
2554 vpmuludq $H1,$S4,$M0
2555 vpmuludq $H0,$R2,$M2
2556 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2557 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2558 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2559 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2560
a8f302e5 2561 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2562 vpmuludq $H3,$R0,$M3
2563 vpmuludq $H3,$R1,$M4
2564 vpmuludq $H1,$R0,$M1
2565 vpmuludq $H1,$R1,$M2
2566 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2567 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2568 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2569 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2570
a8f302e5 2571 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2572 vpmuludq $H4,$S4,$M3
2573 vpmuludq $H4,$R0,$M4
2574 vpmuludq $H3,$S2,$M0
2575 vpmuludq $H3,$S3,$M1
2576 vpmuludq $H3,$S4,$M2
2577 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2578 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2579 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2580 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2581 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2582
2583 vpmuludq $H4,$S1,$M0
2584 vpmuludq $H4,$S2,$M1
2585 vpmuludq $H4,$S3,$M2
2586 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2587	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
2588	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2589
2590 ################################################################
2591 # horizontal addition
2592
2593 mov \$1,%eax
2594 vpermq \$0xb1,$H3,$D3
2595 vpermq \$0xb1,$D4,$H4
2596 vpermq \$0xb1,$H0,$D0
2597 vpermq \$0xb1,$H1,$D1
2598 vpermq \$0xb1,$H2,$D2
2599 vpaddq $D3,$H3,$H3
2600 vpaddq $D4,$H4,$H4
2601 vpaddq $D0,$H0,$H0
2602 vpaddq $D1,$H1,$H1
2603 vpaddq $D2,$H2,$H2
2604
2605 kmovw %eax,%k3
2606 vpermq \$0x2,$H3,$D3
2607 vpermq \$0x2,$H4,$D4
2608 vpermq \$0x2,$H0,$D0
2609 vpermq \$0x2,$H1,$D1
2610 vpermq \$0x2,$H2,$D2
2611 vpaddq $D3,$H3,$H3
2612 vpaddq $D4,$H4,$H4
2613 vpaddq $D0,$H0,$H0
2614 vpaddq $D1,$H1,$H1
2615 vpaddq $D2,$H2,$H2
2616
2617 vextracti64x4 \$0x1,$H3,%y#$D3
2618 vextracti64x4 \$0x1,$H4,%y#$D4
2619 vextracti64x4 \$0x1,$H0,%y#$D0
2620 vextracti64x4 \$0x1,$H1,%y#$D1
2621 vextracti64x4 \$0x1,$H2,%y#$D2
2622 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2623 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2624 vpaddq $D0,$H0,${H0}{%k3}{z}
2625 vpaddq $D1,$H1,${H1}{%k3}{z}
2626 vpaddq $D2,$H2,${H2}{%k3}{z}
2627___
2628map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2629map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2630$code.=<<___;
2631 ################################################################
2632 # lazy reduction (interleaved with input splat)
2633
2634 vpsrlq \$26,$H3,$D3
a8f302e5 2635 vpand $MASK,$H3,$H3
2636 vpsrldq \$6,$T0,$T2 # splat input
2637 vpsrldq \$6,$T1,$T3
2638 vpunpckhqdq $T1,$T0,$T4 # 4
2639 vpaddq $D3,$H4,$H4 # h3 -> h4
2640
2641 vpsrlq \$26,$H0,$D0
a8f302e5 2642 vpand $MASK,$H0,$H0
2643 vpunpcklqdq $T3,$T2,$T2 # 2:3
2644 vpunpcklqdq $T1,$T0,$T0 # 0:1
2645 vpaddq $D0,$H1,$H1 # h0 -> h1
2646
2647 vpsrlq \$26,$H4,$D4
a8f302e5 2648 vpand $MASK,$H4,$H4
2649
2650 vpsrlq \$26,$H1,$D1
a8f302e5 2651 vpand $MASK,$H1,$H1
2652 vpsrlq \$30,$T2,$T3
2653 vpsrlq \$4,$T2,$T2
2654 vpaddq $D1,$H2,$H2 # h1 -> h2
2655
2656 vpaddq $D4,$H0,$H0
2657 vpsllq \$2,$D4,$D4
2658 vpsrlq \$26,$T0,$T1
2659 vpsrlq \$40,$T4,$T4 # 4
2660 vpaddq $D4,$H0,$H0 # h4 -> h0
2661
2662 vpsrlq \$26,$H2,$D2
2663 vpand $MASK,$H2,$H2
2664 vpand $MASK,$T2,$T2 # 2
2665 vpand $MASK,$T0,$T0 # 0
2666 vpaddq $D2,$H3,$H3 # h2 -> h3
2667
2668 vpsrlq \$26,$H0,$D0
a8f302e5 2669 vpand $MASK,$H0,$H0
abb8c44f 2670 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
a8f302e5 2671 vpand $MASK,$T1,$T1 # 1
2672 vpaddq $D0,$H1,$H1 # h0 -> h1
2673
2674 vpsrlq \$26,$H3,$D3
2675 vpand $MASK,$H3,$H3
2676 vpand $MASK,$T3,$T3 # 3
2677 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2678 vpaddq $D3,$H4,$H4 # h3 -> h4
2679
2680 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2681 add \$64,$len
2682 jnz .Ltail_avx2
2683
2684 vpsubq $T2,$H2,$H2 # undo input accumulation
2685 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2686 vmovd %x#$H1,`4*1-48-64`($ctx)
2687 vmovd %x#$H2,`4*2-48-64`($ctx)
2688 vmovd %x#$H3,`4*3-48-64`($ctx)
2689 vmovd %x#$H4,`4*4-48-64`($ctx)
c1e1fc50 2690 vzeroall
2691___
2692$code.=<<___ if ($win64);
2693 movdqa 0x50(%r11),%xmm6
2694 movdqa 0x60(%r11),%xmm7
2695 movdqa 0x70(%r11),%xmm8
2696 movdqa 0x80(%r11),%xmm9
2697 movdqa 0x90(%r11),%xmm10
2698 movdqa 0xa0(%r11),%xmm11
2699 movdqa 0xb0(%r11),%xmm12
2700 movdqa 0xc0(%r11),%xmm13
2701 movdqa 0xd0(%r11),%xmm14
2702 movdqa 0xe0(%r11),%xmm15
2703 lea 0xf8(%r11),%rsp
2704.Ldo_avx512_epilogue:
2705___
2706$code.=<<___ if (!$win64);
2707 lea 8(%r11),%rsp
1c47e883 2708.cfi_def_cfa %rsp,8
2709___
2710$code.=<<___;
abb8c44f 2711 ret
1c47e883 2712.cfi_endproc
2713.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2714___
2715if ($avx>3) {
2716########################################################################
2717# VPMADD52 version using 2^44 radix.
2718#
2719# One can argue that base 2^52 would be more natural. Well, even though
2720# some operations would be more natural, one has to recognize a couple of
2721# things. Base 2^52 doesn't provide an advantage over base 2^44 if you look
2722# at the amount of multiply-n-accumulate operations. Secondly, it makes it
2723# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2724# reference implementations], which means that more such operations
2725# would have to be performed in the inner loop, which in turn makes the
2726# critical path longer. In other words, even though base 2^44 reduction
2727# might look less elegant, the overall critical path is actually shorter...
2728
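# To illustrate the 2^44 radix (an explanatory note, the limb names below
# are just for illustration): a 130-bit value h is kept as
# h0 + h1*2^44 + h2*2^88, with h0,h1 being 44-bit limbs and h2 a 42-bit
# limb; poly1305_init_base2_44 below builds the key limbs from the two
# clamped 64-bit words (lo,hi) as
#
#	r0 =  lo                & (2^44-1)
#	r1 = (lo>>44 | hi<<20)  & (2^44-1)
#	r2 =  hi>>24
#
# which is exactly what its shrd-by-44/shr-by-24 pair computes.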
2729########################################################################
2730# The layout of the opaque area is as follows.
2731#
2732# unsigned __int64 h[3]; # current hash value base 2^44
2733# unsigned __int64 s[2]; # key value*20 base 2^44
2734# unsigned __int64 r[3]; # key value base 2^44
2735# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2736# # r^n positions reflect
2737# # placement in register, not
2738# # memory, R[3] is R[1]*20
2739
2740$code.=<<___;
2741.type poly1305_init_base2_44,\@function,3
2742.align 32
2743poly1305_init_base2_44:
2744 xor %rax,%rax
2745 mov %rax,0($ctx) # initialize hash value
2746 mov %rax,8($ctx)
2747 mov %rax,16($ctx)
2748
2749.Linit_base2_44:
2750 lea poly1305_blocks_vpmadd52(%rip),%r10
2751 lea poly1305_emit_base2_44(%rip),%r11
2752
2753 mov \$0x0ffffffc0fffffff,%rax
2754 mov \$0x0ffffffc0ffffffc,%rcx
2755 and 0($inp),%rax
2756 mov \$0x00000fffffffffff,%r8
2757 and 8($inp),%rcx
2758 mov \$0x00000fffffffffff,%r9
2759 and %rax,%r8
2760 shrd \$44,%rcx,%rax
2761 mov %r8,40($ctx) # r0
2762 and %r9,%rax
2763 shr \$24,%rcx
2764 mov %rax,48($ctx) # r1
2765 lea (%rax,%rax,4),%rax # *5
2766 mov %rcx,56($ctx) # r2
2767 shl \$2,%rax # magic <<2
2768 lea (%rcx,%rcx,4),%rcx # *5
2769 shl \$2,%rcx # magic <<2
2770 mov %rax,24($ctx) # s1
2771 mov %rcx,32($ctx) # s2
c2b93590 2772 movq \$-1,64($ctx) # write impossible value
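	# note: s1/s2 above are 20*r1/20*r2 rather than 5*r as in the base
	# 2^26 code, because with limbs at 2^0/2^44/2^88 a product term that
	# spills to 2^132 wraps as 2^132 mod (2^130-5) = 4*5 = 20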
2773___
2774$code.=<<___ if ($flavour !~ /elf32/);
2775 mov %r10,0(%rdx)
2776 mov %r11,8(%rdx)
2777___
2778$code.=<<___ if ($flavour =~ /elf32/);
2779 mov %r10d,0(%rdx)
2780 mov %r11d,4(%rdx)
2781___
2782$code.=<<___;
2783 mov \$1,%eax
2784 ret
2785.size poly1305_init_base2_44,.-poly1305_init_base2_44
2786___
2787{
2788my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2789my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2790my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2791
2792$code.=<<___;
2793.type poly1305_blocks_vpmadd52,\@function,4
2794.align 32
2795poly1305_blocks_vpmadd52:
2796 shr \$4,$len
2797 jz .Lno_data_vpmadd52 # too short
2798
2799 shl \$40,$padbit
2800 mov 64($ctx),%r8 # peek on power of the key
2801
2802 # if powers of the key are not calculated yet, process up to 3
2803 # blocks with this single-block subroutine, otherwise ensure that
2804	# length is divisible by 2 blocks and pass the rest down to the next
2805 # subroutine...
2806
2807 mov \$3,%rax
2808 mov \$1,%r10
2809 cmp \$4,$len # is input long
2810 cmovae %r10,%rax
2811 test %r8,%r8 # is power value impossible?
2812 cmovns %r10,%rax
2813
2814 and $len,%rax # is input of favourable length?
2815 jz .Lblocks_vpmadd52_4x
2816
2817 sub %rax,$len
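	# (at this point %rax holds the number of leading blocks, len&3 or
	#  len&1, that the single-block loop below will process so that the
	#  rest, if any, can go down the 4x path)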
2818 mov \$7,%r10d
2819 mov \$1,%r11d
2820 kmovw %r10d,%k7
2821 lea .L2_44_inp_permd(%rip),%r10
2822 kmovw %r11d,%k1
2823
2824 vmovq $padbit,%x#$PAD
2825 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2826 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2827 vpermq \$0xcf,$PAD,$PAD
2828 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2829
2830 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2831 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2832 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2833 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2834
2835 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2836 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2837
2838 jmp .Loop_vpmadd52
2839
2840.align 32
2841.Loop_vpmadd52:
2842 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2843 lea 16($inp),$inp
2844
2845 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2846 vpsrlvq $inp_shift,$T0,$T0
2847 vpandq $reduc_mask,$T0,$T0
2848 vporq $PAD,$T0,$T0
2849
2850 vpaddq $T0,$Dlo,$Dlo # accumulate input
2851
2852 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2853 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2854 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2855
2856 vpxord $Dlo,$Dlo,$Dlo
2857 vpxord $Dhi,$Dhi,$Dhi
2858
2859 vpmadd52luq $r2r1r0,$H0,$Dlo
2860 vpmadd52huq $r2r1r0,$H0,$Dhi
2861
2862 vpmadd52luq $r1r0s2,$H1,$Dlo
2863 vpmadd52huq $r1r0s2,$H1,$Dhi
2864
2865 vpmadd52luq $r0s2s1,$H2,$Dlo
2866 vpmadd52huq $r0s2s1,$H2,$Dhi
2867
2868 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2869 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2870 vpandq $reduc_mask,$Dlo,$Dlo
2871
2872 vpaddq $T0,$Dhi,$Dhi
2873
2874 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2875
2876 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2877
2878 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2879 vpandq $reduc_mask,$Dlo,$Dlo
2880
2881 vpermq \$0b10010011,$T0,$T0
2882
2883 vpaddq $T0,$Dlo,$Dlo
2884
2885 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2886
2887 vpaddq $T0,$Dlo,$Dlo
2888 vpsllq \$2,$T0,$T0
2889
2890 vpaddq $T0,$Dlo,$Dlo
2891
c2b93590 2892 dec %rax # len-=16
2893 jnz .Loop_vpmadd52
2894
2895 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2896
2897 test $len,$len
2898 jnz .Lblocks_vpmadd52_4x
2899
2900.Lno_data_vpmadd52:
2901 ret
2902.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2903___
2904}
c2b93590 2905{
2906########################################################################
2907# As implied by its name, the 4x subroutine processes 4 blocks in parallel
2908# (but it also handles lengths of 4*n+2 blocks). It takes up to the 4th key
2909# power and works in 256-bit %ymm registers.
2910
2911my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2912my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2913my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2914
2915$code.=<<___;
2916.type poly1305_blocks_vpmadd52_4x,\@function,4
2917.align 32
2918poly1305_blocks_vpmadd52_4x:
2919 shr \$4,$len
2920 jz .Lno_data_vpmadd52_4x # too short
2921
2922 shl \$40,$padbit
2923 mov 64($ctx),%r8 # peek on power of the key
2924
2925.Lblocks_vpmadd52_4x:
2926 vpbroadcastq $padbit,$PAD
2927
2928 vmovdqa64 .Lx_mask44(%rip),$mask44
2929 mov \$5,%eax
2930 vmovdqa64 .Lx_mask42(%rip),$mask42
2931 kmovw %eax,%k1 # used in 2x path
2932
2933 test %r8,%r8 # is power value impossible?
2934 js .Linit_vpmadd52 # if it is, then init R[4]
2935
2936 vmovq 0($ctx),%x#$H0 # load current hash value
2937 vmovq 8($ctx),%x#$H1
2938 vmovq 16($ctx),%x#$H2
2939
2940 test \$3,$len # is length 4*n+2?
2941 jnz .Lblocks_vpmadd52_2x_do
2942
2943.Lblocks_vpmadd52_4x_do:
2944 vpbroadcastq 64($ctx),$R0 # load 4th power of the key
2945 vpbroadcastq 96($ctx),$R1
2946 vpbroadcastq 128($ctx),$R2
2947 vpbroadcastq 160($ctx),$S1
2948
2949.Lblocks_vpmadd52_4x_key_loaded:
2950 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
2951 vpaddq $R2,$S2,$S2
2952 vpsllq \$2,$S2,$S2
2953
2954 test \$7,$len # is len 8*n?
2955 jz .Lblocks_vpmadd52_8x
2956
2957 vmovdqu64 16*0($inp),$T2 # load data
2958 vmovdqu64 16*2($inp),$T3
2959 lea 16*4($inp),$inp
2960
2961 vpunpcklqdq $T3,$T2,$T1 # transpose data
2962 vpunpckhqdq $T3,$T2,$T3
2963
2964 # at this point 64-bit lanes are ordered as 3-1-2-0
2965
2966 vpsrlq \$24,$T3,$T2 # splat the data
2967 vporq $PAD,$T2,$T2
2968 vpaddq $T2,$H2,$H2 # accumulate input
2969 vpandq $mask44,$T1,$T0
2970 vpsrlq \$44,$T1,$T1
2971 vpsllq \$20,$T3,$T3
2972 vporq $T3,$T1,$T1
2973 vpandq $mask44,$T1,$T1
2974
2975 sub \$4,$len
2976 jz .Ltail_vpmadd52_4x
2977 jmp .Loop_vpmadd52_4x
2978 ud2
2979
2980.align 32
2981.Linit_vpmadd52:
2982 vmovq 24($ctx),%x#$S1 # load key
2983 vmovq 56($ctx),%x#$H2
2984 vmovq 32($ctx),%x#$S2
2985 vmovq 40($ctx),%x#$R0
2986 vmovq 48($ctx),%x#$R1
2987
2988 vmovdqa $R0,$H0
2989 vmovdqa $R1,$H1
2990 vmovdqa $H2,$R2
2991
2992 mov \$2,%eax
2993
2994.Lmul_init_vpmadd52:
2995 vpxorq $D0lo,$D0lo,$D0lo
2996 vpmadd52luq $H2,$S1,$D0lo
2997 vpxorq $D0hi,$D0hi,$D0hi
2998 vpmadd52huq $H2,$S1,$D0hi
2999 vpxorq $D1lo,$D1lo,$D1lo
3000 vpmadd52luq $H2,$S2,$D1lo
3001 vpxorq $D1hi,$D1hi,$D1hi
3002 vpmadd52huq $H2,$S2,$D1hi
3003 vpxorq $D2lo,$D2lo,$D2lo
3004 vpmadd52luq $H2,$R0,$D2lo
3005 vpxorq $D2hi,$D2hi,$D2hi
3006 vpmadd52huq $H2,$R0,$D2hi
3007
3008 vpmadd52luq $H0,$R0,$D0lo
3009 vpmadd52huq $H0,$R0,$D0hi
3010 vpmadd52luq $H0,$R1,$D1lo
3011 vpmadd52huq $H0,$R1,$D1hi
3012 vpmadd52luq $H0,$R2,$D2lo
3013 vpmadd52huq $H0,$R2,$D2hi
3014
3015 vpmadd52luq $H1,$S2,$D0lo
3016 vpmadd52huq $H1,$S2,$D0hi
3017 vpmadd52luq $H1,$R0,$D1lo
3018 vpmadd52huq $H1,$R0,$D1hi
3019 vpmadd52luq $H1,$R1,$D2lo
3020 vpmadd52huq $H1,$R1,$D2hi
3021
3022 ################################################################
3023 # partial reduction
3024 vpsrlq \$44,$D0lo,$tmp
3025 vpsllq \$8,$D0hi,$D0hi
3026 vpandq $mask44,$D0lo,$H0
3027 vpaddq $tmp,$D0hi,$D0hi
3028
3029 vpaddq $D0hi,$D1lo,$D1lo
3030
3031 vpsrlq \$44,$D1lo,$tmp
3032 vpsllq \$8,$D1hi,$D1hi
3033 vpandq $mask44,$D1lo,$H1
3034 vpaddq $tmp,$D1hi,$D1hi
3035
3036 vpaddq $D1hi,$D2lo,$D2lo
3037
3038 vpsrlq \$42,$D2lo,$tmp
3039 vpsllq \$10,$D2hi,$D2hi
3040 vpandq $mask42,$D2lo,$H2
3041 vpaddq $tmp,$D2hi,$D2hi
3042
3043 vpaddq $D2hi,$H0,$H0
3044 vpsllq \$2,$D2hi,$D2hi
3045
3046 vpaddq $D2hi,$H0,$H0
3047
3048 vpsrlq \$44,$H0,$tmp # additional step
3049 vpandq $mask44,$H0,$H0
3050
3051 vpaddq $tmp,$H1,$H1
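	# (the left shifts by 8 and 10 above align the vpmadd52huq halves,
	#  which hold product bits 52 and up, with the next 44-/42-bit limb:
	#  52-44=8 and 52-42=10; the carry out of the 42-bit limb is folded
	#  back as c + 4*c = 5*c because 2^130 == 5 mod 2^130-5)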
3052
3053 dec %eax
3054 jz .Ldone_init_vpmadd52
3055
3056 vpunpcklqdq $R1,$H1,$R1 # 1,2
3057 vpbroadcastq %x#$H1,%x#$H1 # 2,2
3058 vpunpcklqdq $R2,$H2,$R2
3059 vpbroadcastq %x#$H2,%x#$H2
3060 vpunpcklqdq $R0,$H0,$R0
3061 vpbroadcastq %x#$H0,%x#$H0
3062
3063 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3064 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3065 vpaddq $R1,$S1,$S1
3066 vpaddq $R2,$S2,$S2
3067 vpsllq \$2,$S1,$S1
3068 vpsllq \$2,$S2,$S2
3069
3070 jmp .Lmul_init_vpmadd52
3071 ud2
3072
3073.align 32
3074.Ldone_init_vpmadd52:
3075 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3076 vinserti128 \$1,%x#$R2,$H2,$R2
3077 vinserti128 \$1,%x#$R0,$H0,$R0
3078
3079 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3080 vpermq \$0b11011000,$R2,$R2
3081 vpermq \$0b11011000,$R0,$R0
3082
3083 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3084 vpaddq $R1,$S1,$S1
3085 vpsllq \$2,$S1,$S1
3086
3087 vmovq 0($ctx),%x#$H0 # load current hash value
3088 vmovq 8($ctx),%x#$H1
3089 vmovq 16($ctx),%x#$H2
3090
3091 test \$3,$len # is length 4*n+2?
3092 jnz .Ldone_init_vpmadd52_2x
3093
3094 vmovdqu64 $R0,64($ctx) # save key powers
3095 vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3096 vmovdqu64 $R1,96($ctx)
3097 vpbroadcastq %x#$R1,$R1
3098 vmovdqu64 $R2,128($ctx)
3099 vpbroadcastq %x#$R2,$R2
3100 vmovdqu64 $S1,160($ctx)
3101 vpbroadcastq %x#$S1,$S1
3102
3103 jmp .Lblocks_vpmadd52_4x_key_loaded
3104 ud2
3105
3106.align 32
3107.Ldone_init_vpmadd52_2x:
3108 vmovdqu64 $R0,64($ctx) # save key powers
3109 vpsrldq \$8,$R0,$R0 # 0-1-0-2
3110 vmovdqu64 $R1,96($ctx)
3111 vpsrldq \$8,$R1,$R1
3112 vmovdqu64 $R2,128($ctx)
3113 vpsrldq \$8,$R2,$R2
3114 vmovdqu64 $S1,160($ctx)
3115 vpsrldq \$8,$S1,$S1
3116 jmp .Lblocks_vpmadd52_2x_key_loaded
3117 ud2
3118
3119.align 32
3120.Lblocks_vpmadd52_2x_do:
3121 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3122 vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3123 vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3124 vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3125
3126.Lblocks_vpmadd52_2x_key_loaded:
3127 vmovdqu64 16*0($inp),$T2 # load data
3128 vpxorq $T3,$T3,$T3
3129 lea 16*2($inp),$inp
3130
3131 vpunpcklqdq $T3,$T2,$T1 # transpose data
3132 vpunpckhqdq $T3,$T2,$T3
3133
3134 # at this point 64-bit lanes are ordered as x-1-x-0
3135
3136 vpsrlq \$24,$T3,$T2 # splat the data
3137 vporq $PAD,$T2,$T2
3138 vpaddq $T2,$H2,$H2 # accumulate input
3139 vpandq $mask44,$T1,$T0
3140 vpsrlq \$44,$T1,$T1
3141 vpsllq \$20,$T3,$T3
3142 vporq $T3,$T1,$T1
3143 vpandq $mask44,$T1,$T1
3144
3145 jmp .Ltail_vpmadd52_2x
3146 ud2
3147
3148.align 32
3149.Loop_vpmadd52_4x:
3150 #vpaddq $T2,$H2,$H2 # accumulate input
3151 vpaddq $T0,$H0,$H0
3152 vpaddq $T1,$H1,$H1
3153
3154 vpxorq $D0lo,$D0lo,$D0lo
3155 vpmadd52luq $H2,$S1,$D0lo
3156 vpxorq $D0hi,$D0hi,$D0hi
3157 vpmadd52huq $H2,$S1,$D0hi
3158 vpxorq $D1lo,$D1lo,$D1lo
3159 vpmadd52luq $H2,$S2,$D1lo
3160 vpxorq $D1hi,$D1hi,$D1hi
3161 vpmadd52huq $H2,$S2,$D1hi
3162 vpxorq $D2lo,$D2lo,$D2lo
3163 vpmadd52luq $H2,$R0,$D2lo
3164 vpxorq $D2hi,$D2hi,$D2hi
3165 vpmadd52huq $H2,$R0,$D2hi
3166
3167 vmovdqu64 16*0($inp),$T2 # load data
3168 vmovdqu64 16*2($inp),$T3
3169 lea 16*4($inp),$inp
3170 vpmadd52luq $H0,$R0,$D0lo
3171 vpmadd52huq $H0,$R0,$D0hi
3172 vpmadd52luq $H0,$R1,$D1lo
3173 vpmadd52huq $H0,$R1,$D1hi
3174 vpmadd52luq $H0,$R2,$D2lo
3175 vpmadd52huq $H0,$R2,$D2hi
3176
3177 vpunpcklqdq $T3,$T2,$T1 # transpose data
3178 vpunpckhqdq $T3,$T2,$T3
3179 vpmadd52luq $H1,$S2,$D0lo
3180 vpmadd52huq $H1,$S2,$D0hi
3181 vpmadd52luq $H1,$R0,$D1lo
3182 vpmadd52huq $H1,$R0,$D1hi
3183 vpmadd52luq $H1,$R1,$D2lo
3184 vpmadd52huq $H1,$R1,$D2hi
3185
3186 ################################################################
3187 # partial reduction (interleaved with data splat)
3188 vpsrlq \$44,$D0lo,$tmp
3189 vpsllq \$8,$D0hi,$D0hi
3190 vpandq $mask44,$D0lo,$H0
3191 vpaddq $tmp,$D0hi,$D0hi
3192
3193 vpsrlq \$24,$T3,$T2
3194 vporq $PAD,$T2,$T2
3195 vpaddq $D0hi,$D1lo,$D1lo
3196
3197 vpsrlq \$44,$D1lo,$tmp
3198 vpsllq \$8,$D1hi,$D1hi
3199 vpandq $mask44,$D1lo,$H1
3200 vpaddq $tmp,$D1hi,$D1hi
3201
3202 vpandq $mask44,$T1,$T0
3203 vpsrlq \$44,$T1,$T1
3204 vpsllq \$20,$T3,$T3
3205 vpaddq $D1hi,$D2lo,$D2lo
3206
3207 vpsrlq \$42,$D2lo,$tmp
3208 vpsllq \$10,$D2hi,$D2hi
3209 vpandq $mask42,$D2lo,$H2
3210 vpaddq $tmp,$D2hi,$D2hi
3211
3212 vpaddq $T2,$H2,$H2 # accumulate input
3213 vpaddq $D2hi,$H0,$H0
3214 vpsllq \$2,$D2hi,$D2hi
3215
3216 vpaddq $D2hi,$H0,$H0
3217 vporq $T3,$T1,$T1
3218 vpandq $mask44,$T1,$T1
3219
3220 vpsrlq \$44,$H0,$tmp # additional step
3221 vpandq $mask44,$H0,$H0
3222
3223 vpaddq $tmp,$H1,$H1
3224
3225 sub \$4,$len # len-=64
3226 jnz .Loop_vpmadd52_4x
3227
3228.Ltail_vpmadd52_4x:
3229 vmovdqu64 128($ctx),$R2 # load all key powers
3230 vmovdqu64 160($ctx),$S1
3231 vmovdqu64 64($ctx),$R0
3232 vmovdqu64 96($ctx),$R1
3233
3234.Ltail_vpmadd52_2x:
3235 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3236 vpaddq $R2,$S2,$S2
3237 vpsllq \$2,$S2,$S2
3238
3239 #vpaddq $T2,$H2,$H2 # accumulate input
3240 vpaddq $T0,$H0,$H0
3241 vpaddq $T1,$H1,$H1
3242
3243 vpxorq $D0lo,$D0lo,$D0lo
3244 vpmadd52luq $H2,$S1,$D0lo
3245 vpxorq $D0hi,$D0hi,$D0hi
3246 vpmadd52huq $H2,$S1,$D0hi
3247 vpxorq $D1lo,$D1lo,$D1lo
3248 vpmadd52luq $H2,$S2,$D1lo
3249 vpxorq $D1hi,$D1hi,$D1hi
3250 vpmadd52huq $H2,$S2,$D1hi
3251 vpxorq $D2lo,$D2lo,$D2lo
3252 vpmadd52luq $H2,$R0,$D2lo
3253 vpxorq $D2hi,$D2hi,$D2hi
3254 vpmadd52huq $H2,$R0,$D2hi
3255
3256 vpmadd52luq $H0,$R0,$D0lo
3257 vpmadd52huq $H0,$R0,$D0hi
3258 vpmadd52luq $H0,$R1,$D1lo
3259 vpmadd52huq $H0,$R1,$D1hi
3260 vpmadd52luq $H0,$R2,$D2lo
3261 vpmadd52huq $H0,$R2,$D2hi
3262
3263 vpmadd52luq $H1,$S2,$D0lo
3264 vpmadd52huq $H1,$S2,$D0hi
3265 vpmadd52luq $H1,$R0,$D1lo
3266 vpmadd52huq $H1,$R0,$D1hi
3267 vpmadd52luq $H1,$R1,$D2lo
3268 vpmadd52huq $H1,$R1,$D2hi
3269
3270 ################################################################
3271 # horizontal addition
3272
3273 mov \$1,%eax
3274 kmovw %eax,%k1
3275 vpsrldq \$8,$D0lo,$T0
3276 vpsrldq \$8,$D0hi,$H0
3277 vpsrldq \$8,$D1lo,$T1
3278 vpsrldq \$8,$D1hi,$H1
3279 vpaddq $T0,$D0lo,$D0lo
3280 vpaddq $H0,$D0hi,$D0hi
3281 vpsrldq \$8,$D2lo,$T2
3282 vpsrldq \$8,$D2hi,$H2
3283 vpaddq $T1,$D1lo,$D1lo
3284 vpaddq $H1,$D1hi,$D1hi
3285 vpermq \$0x2,$D0lo,$T0
3286 vpermq \$0x2,$D0hi,$H0
3287 vpaddq $T2,$D2lo,$D2lo
3288 vpaddq $H2,$D2hi,$D2hi
3289
3290 vpermq \$0x2,$D1lo,$T1
3291 vpermq \$0x2,$D1hi,$H1
3292 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3293 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3294 vpermq \$0x2,$D2lo,$T2
3295 vpermq \$0x2,$D2hi,$H2
3296 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3297 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3298 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3299 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3300
3301 ################################################################
3302 # partial reduction
3303 vpsrlq \$44,$D0lo,$tmp
3304 vpsllq \$8,$D0hi,$D0hi
3305 vpandq $mask44,$D0lo,$H0
3306 vpaddq $tmp,$D0hi,$D0hi
3307
3308 vpaddq $D0hi,$D1lo,$D1lo
3309
3310 vpsrlq \$44,$D1lo,$tmp
3311 vpsllq \$8,$D1hi,$D1hi
3312 vpandq $mask44,$D1lo,$H1
3313 vpaddq $tmp,$D1hi,$D1hi
3314
3315 vpaddq $D1hi,$D2lo,$D2lo
3316
3317 vpsrlq \$42,$D2lo,$tmp
3318 vpsllq \$10,$D2hi,$D2hi
3319 vpandq $mask42,$D2lo,$H2
3320 vpaddq $tmp,$D2hi,$D2hi
3321
3322 vpaddq $D2hi,$H0,$H0
3323 vpsllq \$2,$D2hi,$D2hi
3324
3325 vpaddq $D2hi,$H0,$H0
3326
3327 vpsrlq \$44,$H0,$tmp # additional step
3328 vpandq $mask44,$H0,$H0
3329
3330 vpaddq $tmp,$H1,$H1
3331 # at this point $len is
3332 # either 4*n+2 or 0...
3333 sub \$2,$len # len-=32
3334 ja .Lblocks_vpmadd52_4x_do
3335
3336 vmovq %x#$H0,0($ctx)
3337 vmovq %x#$H1,8($ctx)
3338 vmovq %x#$H2,16($ctx)
0a5d1a38 3339 vzeroall
3340
3341.Lno_data_vpmadd52_4x:
3342 ret
3343.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3344___
3345}
3346{
3347########################################################################
3348# As implied by its name 8x subroutine processes 8 blocks in parallel...
3349# This is intermediate version, as it's used only in cases when input
3350# length is either 8*n, 8*n+1 or 8*n+2...
3351
3352my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3353my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3354my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3355my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3356
3357$code.=<<___;
3358.type poly1305_blocks_vpmadd52_8x,\@function,4
3359.align 32
3360poly1305_blocks_vpmadd52_8x:
3361 shr \$4,$len
3362 jz .Lno_data_vpmadd52_8x # too short
3363
3364 shl \$40,$padbit
3365 mov 64($ctx),%r8 # peek on power of the key
3366
3367 vmovdqa64 .Lx_mask44(%rip),$mask44
3368 vmovdqa64 .Lx_mask42(%rip),$mask42
3369
3370 test %r8,%r8 # is power value impossible?
3371 js .Linit_vpmadd52 # if it is, then init R[4]
3372
3373 vmovq 0($ctx),%x#$H0 # load current hash value
3374 vmovq 8($ctx),%x#$H1
3375 vmovq 16($ctx),%x#$H2
3376
3377.Lblocks_vpmadd52_8x:
3378 ################################################################
3379	# first we calculate more key powers
3380
3381 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3382 vmovdqu64 160($ctx),$S1
3383 vmovdqu64 64($ctx),$R0
3384 vmovdqu64 96($ctx),$R1
3385
3386 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3387 vpaddq $R2,$S2,$S2
3388 vpsllq \$2,$S2,$S2
3389
3390 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3391 vpbroadcastq %x#$R0,$RR0
3392 vpbroadcastq %x#$R1,$RR1
3393
3394 vpxorq $D0lo,$D0lo,$D0lo
3395 vpmadd52luq $RR2,$S1,$D0lo
3396 vpxorq $D0hi,$D0hi,$D0hi
3397 vpmadd52huq $RR2,$S1,$D0hi
3398 vpxorq $D1lo,$D1lo,$D1lo
3399 vpmadd52luq $RR2,$S2,$D1lo
3400 vpxorq $D1hi,$D1hi,$D1hi
3401 vpmadd52huq $RR2,$S2,$D1hi
3402 vpxorq $D2lo,$D2lo,$D2lo
3403 vpmadd52luq $RR2,$R0,$D2lo
3404 vpxorq $D2hi,$D2hi,$D2hi
3405 vpmadd52huq $RR2,$R0,$D2hi
3406
3407 vpmadd52luq $RR0,$R0,$D0lo
3408 vpmadd52huq $RR0,$R0,$D0hi
3409 vpmadd52luq $RR0,$R1,$D1lo
3410 vpmadd52huq $RR0,$R1,$D1hi
3411 vpmadd52luq $RR0,$R2,$D2lo
3412 vpmadd52huq $RR0,$R2,$D2hi
3413
3414 vpmadd52luq $RR1,$S2,$D0lo
3415 vpmadd52huq $RR1,$S2,$D0hi
3416 vpmadd52luq $RR1,$R0,$D1lo
3417 vpmadd52huq $RR1,$R0,$D1hi
3418 vpmadd52luq $RR1,$R1,$D2lo
3419 vpmadd52huq $RR1,$R1,$D2hi
3420
3421 ################################################################
3422 # partial reduction
3423 vpsrlq \$44,$D0lo,$tmp
3424 vpsllq \$8,$D0hi,$D0hi
3425 vpandq $mask44,$D0lo,$RR0
3426 vpaddq $tmp,$D0hi,$D0hi
3427
3428 vpaddq $D0hi,$D1lo,$D1lo
3429
3430 vpsrlq \$44,$D1lo,$tmp
3431 vpsllq \$8,$D1hi,$D1hi
3432 vpandq $mask44,$D1lo,$RR1
3433 vpaddq $tmp,$D1hi,$D1hi
3434
3435 vpaddq $D1hi,$D2lo,$D2lo
3436
3437 vpsrlq \$42,$D2lo,$tmp
3438 vpsllq \$10,$D2hi,$D2hi
3439 vpandq $mask42,$D2lo,$RR2
3440 vpaddq $tmp,$D2hi,$D2hi
3441
3442 vpaddq $D2hi,$RR0,$RR0
3443 vpsllq \$2,$D2hi,$D2hi
3444
3445 vpaddq $D2hi,$RR0,$RR0
3446
3447 vpsrlq \$44,$RR0,$tmp # additional step
3448 vpandq $mask44,$RR0,$RR0
3449
3450 vpaddq $tmp,$RR1,$RR1
3451
3452 ################################################################
3453	# At this point Rx holds the 1324 powers and RRx the 5768 ones; the
3454	# goal is 15263748, which reflects how the data is loaded...
3455
3456 vpunpcklqdq $R2,$RR2,$T2 # 3748
3457 vpunpckhqdq $R2,$RR2,$R2 # 1526
3458 vpunpcklqdq $R0,$RR0,$T0
3459 vpunpckhqdq $R0,$RR0,$R0
3460 vpunpcklqdq $R1,$RR1,$T1
3461 vpunpckhqdq $R1,$RR1,$R1
3462___
3463######## switch to %zmm
3464map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3465map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3466map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3467map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3468
3469$code.=<<___;
3470 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3471 vshufi64x2 \$0x44,$R0,$T0,$RR0
3472 vshufi64x2 \$0x44,$R1,$T1,$RR1
3473
3474 vmovdqu64 16*0($inp),$T2 # load data
3475 vmovdqu64 16*4($inp),$T3
3476 lea 16*8($inp),$inp
3477
3478 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3479 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3480 vpaddq $RR2,$SS2,$SS2
3481 vpaddq $RR1,$SS1,$SS1
3482 vpsllq \$2,$SS2,$SS2
3483 vpsllq \$2,$SS1,$SS1
3484
3485 vpbroadcastq $padbit,$PAD
3486 vpbroadcastq %x#$mask44,$mask44
3487 vpbroadcastq %x#$mask42,$mask42
3488
3489 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3490 vpbroadcastq %x#$SS2,$S2
3491 vpbroadcastq %x#$RR0,$R0
3492 vpbroadcastq %x#$RR1,$R1
3493 vpbroadcastq %x#$RR2,$R2
3494
3495 vpunpcklqdq $T3,$T2,$T1 # transpose data
3496 vpunpckhqdq $T3,$T2,$T3
3497
3498 # at this point 64-bit lanes are ordered as 73625140
3499
3500 vpsrlq \$24,$T3,$T2 # splat the data
3501 vporq $PAD,$T2,$T2
3502 vpaddq $T2,$H2,$H2 # accumulate input
3503 vpandq $mask44,$T1,$T0
3504 vpsrlq \$44,$T1,$T1
3505 vpsllq \$20,$T3,$T3
3506 vporq $T3,$T1,$T1
3507 vpandq $mask44,$T1,$T1
3508
3509 sub \$8,$len
3510 jz .Ltail_vpmadd52_8x
3511 jmp .Loop_vpmadd52_8x
3512
3513.align 32
3514.Loop_vpmadd52_8x:
3515 #vpaddq $T2,$H2,$H2 # accumulate input
3516 vpaddq $T0,$H0,$H0
3517 vpaddq $T1,$H1,$H1
3518
3519 vpxorq $D0lo,$D0lo,$D0lo
3520 vpmadd52luq $H2,$S1,$D0lo
3521 vpxorq $D0hi,$D0hi,$D0hi
3522 vpmadd52huq $H2,$S1,$D0hi
3523 vpxorq $D1lo,$D1lo,$D1lo
3524 vpmadd52luq $H2,$S2,$D1lo
3525 vpxorq $D1hi,$D1hi,$D1hi
3526 vpmadd52huq $H2,$S2,$D1hi
3527 vpxorq $D2lo,$D2lo,$D2lo
3528 vpmadd52luq $H2,$R0,$D2lo
3529 vpxorq $D2hi,$D2hi,$D2hi
3530 vpmadd52huq $H2,$R0,$D2hi
3531
3532 vmovdqu64 16*0($inp),$T2 # load data
3533 vmovdqu64 16*4($inp),$T3
3534 lea 16*8($inp),$inp
3535 vpmadd52luq $H0,$R0,$D0lo
3536 vpmadd52huq $H0,$R0,$D0hi
3537 vpmadd52luq $H0,$R1,$D1lo
3538 vpmadd52huq $H0,$R1,$D1hi
3539 vpmadd52luq $H0,$R2,$D2lo
3540 vpmadd52huq $H0,$R2,$D2hi
3541
3542 vpunpcklqdq $T3,$T2,$T1 # transpose data
3543 vpunpckhqdq $T3,$T2,$T3
3544 vpmadd52luq $H1,$S2,$D0lo
3545 vpmadd52huq $H1,$S2,$D0hi
3546 vpmadd52luq $H1,$R0,$D1lo
3547 vpmadd52huq $H1,$R0,$D1hi
3548 vpmadd52luq $H1,$R1,$D2lo
3549 vpmadd52huq $H1,$R1,$D2hi
3550
3551 ################################################################
3552 # partial reduction (interleaved with data splat)
3553 vpsrlq \$44,$D0lo,$tmp
3554 vpsllq \$8,$D0hi,$D0hi
3555 vpandq $mask44,$D0lo,$H0
3556 vpaddq $tmp,$D0hi,$D0hi
3557
3558 vpsrlq \$24,$T3,$T2
3559 vporq $PAD,$T2,$T2
3560 vpaddq $D0hi,$D1lo,$D1lo
3561
3562 vpsrlq \$44,$D1lo,$tmp
3563 vpsllq \$8,$D1hi,$D1hi
3564 vpandq $mask44,$D1lo,$H1
3565 vpaddq $tmp,$D1hi,$D1hi
3566
3567 vpandq $mask44,$T1,$T0
3568 vpsrlq \$44,$T1,$T1
3569 vpsllq \$20,$T3,$T3
3570 vpaddq $D1hi,$D2lo,$D2lo
3571
3572 vpsrlq \$42,$D2lo,$tmp
3573 vpsllq \$10,$D2hi,$D2hi
3574 vpandq $mask42,$D2lo,$H2
3575 vpaddq $tmp,$D2hi,$D2hi
3576
3577 vpaddq $T2,$H2,$H2 # accumulate input
3578 vpaddq $D2hi,$H0,$H0
3579 vpsllq \$2,$D2hi,$D2hi
3580
3581 vpaddq $D2hi,$H0,$H0
3582 vporq $T3,$T1,$T1
3583 vpandq $mask44,$T1,$T1
3584
3585 vpsrlq \$44,$H0,$tmp # additional step
3586 vpandq $mask44,$H0,$H0
3587
3588 vpaddq $tmp,$H1,$H1
3589
3590 sub \$8,$len # len-=128
3591 jnz .Loop_vpmadd52_8x
3592
3593.Ltail_vpmadd52_8x:
3594 #vpaddq $T2,$H2,$H2 # accumulate input
3595 vpaddq $T0,$H0,$H0
3596 vpaddq $T1,$H1,$H1
3597
3598 vpxorq $D0lo,$D0lo,$D0lo
3599 vpmadd52luq $H2,$SS1,$D0lo
3600 vpxorq $D0hi,$D0hi,$D0hi
3601 vpmadd52huq $H2,$SS1,$D0hi
3602 vpxorq $D1lo,$D1lo,$D1lo
3603 vpmadd52luq $H2,$SS2,$D1lo
3604 vpxorq $D1hi,$D1hi,$D1hi
3605 vpmadd52huq $H2,$SS2,$D1hi
3606 vpxorq $D2lo,$D2lo,$D2lo
3607 vpmadd52luq $H2,$RR0,$D2lo
3608 vpxorq $D2hi,$D2hi,$D2hi
3609 vpmadd52huq $H2,$RR0,$D2hi
3610
3611 vpmadd52luq $H0,$RR0,$D0lo
3612 vpmadd52huq $H0,$RR0,$D0hi
3613 vpmadd52luq $H0,$RR1,$D1lo
3614 vpmadd52huq $H0,$RR1,$D1hi
3615 vpmadd52luq $H0,$RR2,$D2lo
3616 vpmadd52huq $H0,$RR2,$D2hi
3617
3618 vpmadd52luq $H1,$SS2,$D0lo
3619 vpmadd52huq $H1,$SS2,$D0hi
3620 vpmadd52luq $H1,$RR0,$D1lo
3621 vpmadd52huq $H1,$RR0,$D1hi
3622 vpmadd52luq $H1,$RR1,$D2lo
3623 vpmadd52huq $H1,$RR1,$D2hi
3624
3625 ################################################################
3626 # horizontal addition
3627
3628 mov \$1,%eax
3629 kmovw %eax,%k1
3630 vpsrldq \$8,$D0lo,$T0
3631 vpsrldq \$8,$D0hi,$H0
3632 vpsrldq \$8,$D1lo,$T1
3633 vpsrldq \$8,$D1hi,$H1
3634 vpaddq $T0,$D0lo,$D0lo
3635 vpaddq $H0,$D0hi,$D0hi
3636 vpsrldq \$8,$D2lo,$T2
3637 vpsrldq \$8,$D2hi,$H2
3638 vpaddq $T1,$D1lo,$D1lo
3639 vpaddq $H1,$D1hi,$D1hi
3640 vpermq \$0x2,$D0lo,$T0
3641 vpermq \$0x2,$D0hi,$H0
3642 vpaddq $T2,$D2lo,$D2lo
3643 vpaddq $H2,$D2hi,$D2hi
3644
3645 vpermq \$0x2,$D1lo,$T1
3646 vpermq \$0x2,$D1hi,$H1
3647 vpaddq $T0,$D0lo,$D0lo
3648 vpaddq $H0,$D0hi,$D0hi
3649 vpermq \$0x2,$D2lo,$T2
3650 vpermq \$0x2,$D2hi,$H2
3651 vpaddq $T1,$D1lo,$D1lo
3652 vpaddq $H1,$D1hi,$D1hi
3653 vextracti64x4 \$1,$D0lo,%y#$T0
3654 vextracti64x4 \$1,$D0hi,%y#$H0
3655 vpaddq $T2,$D2lo,$D2lo
3656 vpaddq $H2,$D2hi,$D2hi
3657
3658 vextracti64x4 \$1,$D1lo,%y#$T1
3659 vextracti64x4 \$1,$D1hi,%y#$H1
3660 vextracti64x4 \$1,$D2lo,%y#$T2
3661 vextracti64x4 \$1,$D2hi,%y#$H2
3662___
3663######## switch back to %ymm
3664map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3665map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3666map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3667
3668$code.=<<___;
3669 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3670 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3671 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3672 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3673 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3674 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3675
3676 ################################################################
3677 # partial reduction
3678 vpsrlq \$44,$D0lo,$tmp
3679 vpsllq \$8,$D0hi,$D0hi
3680 vpandq $mask44,$D0lo,$H0
3681 vpaddq $tmp,$D0hi,$D0hi
3682
3683 vpaddq $D0hi,$D1lo,$D1lo
3684
3685 vpsrlq \$44,$D1lo,$tmp
3686 vpsllq \$8,$D1hi,$D1hi
3687 vpandq $mask44,$D1lo,$H1
3688 vpaddq $tmp,$D1hi,$D1hi
3689
3690 vpaddq $D1hi,$D2lo,$D2lo
3691
3692 vpsrlq \$42,$D2lo,$tmp
3693 vpsllq \$10,$D2hi,$D2hi
3694 vpandq $mask42,$D2lo,$H2
3695 vpaddq $tmp,$D2hi,$D2hi
3696
3697 vpaddq $D2hi,$H0,$H0
3698 vpsllq \$2,$D2hi,$D2hi
3699
3700 vpaddq $D2hi,$H0,$H0
3701
3702 vpsrlq \$44,$H0,$tmp # additional step
3703 vpandq $mask44,$H0,$H0
3704
3705 vpaddq $tmp,$H1,$H1
3706
3707 ################################################################
3708
3709 vmovq %x#$H0,0($ctx)
3710 vmovq %x#$H1,8($ctx)
3711 vmovq %x#$H2,16($ctx)
3712 vzeroall
3713
3714.Lno_data_vpmadd52_8x:
3715 ret
3716.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3717___
3718}
3719$code.=<<___;
3720.type poly1305_emit_base2_44,\@function,3
3721.align 32
3722poly1305_emit_base2_44:
3723 mov 0($ctx),%r8 # load hash value
3724 mov 8($ctx),%r9
3725 mov 16($ctx),%r10
3726
3727 mov %r9,%rax
3728 shr \$20,%r9
3729 shl \$44,%rax
3730 mov %r10,%rcx
3731 shr \$40,%r10
3732 shl \$24,%rcx
3733
3734 add %rax,%r8
3735 adc %rcx,%r9
3736 adc \$0,%r10
3737
3738 mov %r8,%rax
3739 add \$5,%r8 # compare to modulus
3740 mov %r9,%rcx
3741 adc \$0,%r9
3742 adc \$0,%r10
46f4e1be 3743 shr \$2,%r10 # did 130-bit value overflow?
fd910ef9
AP
3744 cmovnz %r8,%rax
3745 cmovnz %r9,%rcx
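	# (the sequence above is the standard final reduction: add 5 to h and
	#  test bit 130; if it is set, h was >= 2^130-5 and the reduced value
	#  is the low 128 bits of h+5, otherwise h was already fully reduced;
	#  the 128-bit nonce is then added modulo 2^128)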
3746
3747 add 0($nonce),%rax # accumulate nonce
3748 adc 8($nonce),%rcx
3749 mov %rax,0($mac) # write result
3750 mov %rcx,8($mac)
3751
3752 ret
3753.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3754___
3755} } }
3756$code.=<<___;
3757.align 64
3758.Lconst:
3759.Lmask24:
3760.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3761.L129:
3762.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3763.Lmask26:
3764.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3765.Lpermd_avx2:
3766.long 2,2,2,3,2,0,2,1
3767.Lpermd_avx512:
3768.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3769
3770.L2_44_inp_permd:
3771.long 0,1,1,2,2,3,7,7
3772.L2_44_inp_shift:
3773.quad 0,12,24,64
3774.L2_44_mask:
3775.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3776.L2_44_shift_rgt:
3777.quad 44,44,42,64
3778.L2_44_shift_lft:
3779.quad 8,8,10,64
3780
3781.align 64
3782.Lx_mask44:
3783.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3784.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3785.Lx_mask42:
3786.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3787.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3788___
3789}
3790$code.=<<___;
3791.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3792.align 16
3793___
3794
3795{ # chacha20-poly1305 helpers
3796my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3797 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
3798$code.=<<___;
3799.globl xor128_encrypt_n_pad
3800.type xor128_encrypt_n_pad,\@abi-omnipotent
3801.align 16
3802xor128_encrypt_n_pad:
3803 sub $otp,$inp
3804 sub $otp,$out
3805 mov $len,%r10 # put len aside
3806 shr \$4,$len # len / 16
3807 jz .Ltail_enc
3808 nop
3809.Loop_enc_xmm:
3810 movdqu ($inp,$otp),%xmm0
3811 pxor ($otp),%xmm0
3812 movdqu %xmm0,($out,$otp)
3813 movdqa %xmm0,($otp)
3814 lea 16($otp),$otp
3815 dec $len
3816 jnz .Loop_enc_xmm
3817
3818 and \$15,%r10 # len % 16
3819 jz .Ldone_enc
3820
3821.Ltail_enc:
3822 mov \$16,$len
3823 sub %r10,$len
3824 xor %eax,%eax
3825.Loop_enc_byte:
3826 mov ($inp,$otp),%al
3827 xor ($otp),%al
3828 mov %al,($out,$otp)
3829 mov %al,($otp)
3830 lea 1($otp),$otp
3831 dec %r10
3832 jnz .Loop_enc_byte
3833
3834 xor %eax,%eax
3835.Loop_enc_pad:
3836 mov %al,($otp)
3837 lea 1($otp),$otp
3838 dec $len
3839 jnz .Loop_enc_pad
3840
3841.Ldone_enc:
3842 mov $otp,%rax
3843 ret
3844.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3845
3846.globl xor128_decrypt_n_pad
3847.type xor128_decrypt_n_pad,\@abi-omnipotent
3848.align 16
3849xor128_decrypt_n_pad:
3850 sub $otp,$inp
3851 sub $otp,$out
3852 mov $len,%r10 # put len aside
3853 shr \$4,$len # len / 16
3854 jz .Ltail_dec
3855 nop
3856.Loop_dec_xmm:
3857 movdqu ($inp,$otp),%xmm0
3858 movdqa ($otp),%xmm1
3859 pxor %xmm0,%xmm1
3860 movdqu %xmm1,($out,$otp)
3861 movdqa %xmm0,($otp)
3862 lea 16($otp),$otp
3863 dec $len
3864 jnz .Loop_dec_xmm
3865
3866 pxor %xmm1,%xmm1
3867 and \$15,%r10 # len % 16
3868 jz .Ldone_dec
3869
3870.Ltail_dec:
3871 mov \$16,$len
3872 sub %r10,$len
3873 xor %eax,%eax
3874 xor %r11,%r11
3875.Loop_dec_byte:
3876 mov ($inp,$otp),%r11b
3877 mov ($otp),%al
3878 xor %r11b,%al
3879 mov %al,($out,$otp)
3880 mov %r11b,($otp)
3881 lea 1($otp),$otp
3882 dec %r10
3883 jnz .Loop_dec_byte
3884
3885 xor %eax,%eax
3886.Loop_dec_pad:
3887 mov %al,($otp)
3888 lea 1($otp),$otp
3889 dec $len
3890 jnz .Loop_dec_pad
3891
3892.Ldone_dec:
3893 mov $otp,%rax
3894 ret
3895.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3896___
3897}
3898
3899# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3900# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3901if ($win64) {
3902$rec="%rcx";
3903$frame="%rdx";
3904$context="%r8";
3905$disp="%r9";
3906
3907$code.=<<___;
3908.extern __imp_RtlVirtualUnwind
3909.type se_handler,\@abi-omnipotent
3910.align 16
3911se_handler:
3912 push %rsi
3913 push %rdi
3914 push %rbx
3915 push %rbp
3916 push %r12
3917 push %r13
3918 push %r14
3919 push %r15
3920 pushfq
3921 sub \$64,%rsp
3922
3923 mov 120($context),%rax # pull context->Rax
3924 mov 248($context),%rbx # pull context->Rip
3925
3926 mov 8($disp),%rsi # disp->ImageBase
3927 mov 56($disp),%r11 # disp->HandlerData
3928
3929 mov 0(%r11),%r10d # HandlerData[0]
3930 lea (%rsi,%r10),%r10 # prologue label
3931 cmp %r10,%rbx # context->Rip<.Lprologue
3932 jb .Lcommon_seh_tail
3933
3934 mov 152($context),%rax # pull context->Rsp
3935
3936 mov 4(%r11),%r10d # HandlerData[1]
3937 lea (%rsi,%r10),%r10 # epilogue label
3938 cmp %r10,%rbx # context->Rip>=.Lepilogue
3939 jae .Lcommon_seh_tail
3940
3941 lea 48(%rax),%rax
3942
3943 mov -8(%rax),%rbx
3944 mov -16(%rax),%rbp
3945 mov -24(%rax),%r12
3946 mov -32(%rax),%r13
3947 mov -40(%rax),%r14
3948 mov -48(%rax),%r15
3949 mov %rbx,144($context) # restore context->Rbx
3950 mov %rbp,160($context) # restore context->Rbp
3951 mov %r12,216($context) # restore context->R12
3952 mov %r13,224($context) # restore context->R13
3953 mov %r14,232($context) # restore context->R14
3954	mov	%r15,240($context)	# restore context->R15
3955
3956 jmp .Lcommon_seh_tail
3957.size se_handler,.-se_handler
3958
3959.type avx_handler,\@abi-omnipotent
3960.align 16
3961avx_handler:
3962 push %rsi
3963 push %rdi
3964 push %rbx
3965 push %rbp
3966 push %r12
3967 push %r13
3968 push %r14
3969 push %r15
3970 pushfq
3971 sub \$64,%rsp
3972
3973 mov 120($context),%rax # pull context->Rax
3974 mov 248($context),%rbx # pull context->Rip
3975
3976 mov 8($disp),%rsi # disp->ImageBase
3977 mov 56($disp),%r11 # disp->HandlerData
3978
3979 mov 0(%r11),%r10d # HandlerData[0]
3980 lea (%rsi,%r10),%r10 # prologue label
3981 cmp %r10,%rbx # context->Rip<prologue label
3982 jb .Lcommon_seh_tail
3983
3984 mov 152($context),%rax # pull context->Rsp
3985
3986 mov 4(%r11),%r10d # HandlerData[1]
3987 lea (%rsi,%r10),%r10 # epilogue label
3988 cmp %r10,%rbx # context->Rip>=epilogue label
3989 jae .Lcommon_seh_tail
3990
3991 mov 208($context),%rax # pull context->R11
3992
3993 lea 0x50(%rax),%rsi
3994 lea 0xf8(%rax),%rax
3995 lea 512($context),%rdi # &context.Xmm6
3996 mov \$20,%ecx
3997 .long 0xa548f3fc # cld; rep movsq
3998
3999.Lcommon_seh_tail:
4000 mov 8(%rax),%rdi
4001 mov 16(%rax),%rsi
4002 mov %rax,152($context) # restore context->Rsp
4003 mov %rsi,168($context) # restore context->Rsi
4004 mov %rdi,176($context) # restore context->Rdi
4005
4006 mov 40($disp),%rdi # disp->ContextRecord
4007 mov $context,%rsi # context
4008 mov \$154,%ecx # sizeof(CONTEXT)
4009 .long 0xa548f3fc # cld; rep movsq
4010
4011 mov $disp,%rsi
4012 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4013 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4014 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4015 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4016 mov 40(%rsi),%r10 # disp->ContextRecord
4017 lea 56(%rsi),%r11 # &disp->HandlerData
4018 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4019 mov %r10,32(%rsp) # arg5
4020 mov %r11,40(%rsp) # arg6
4021 mov %r12,48(%rsp) # arg7
4022 mov %rcx,56(%rsp) # arg8, (NULL)
4023 call *__imp_RtlVirtualUnwind(%rip)
4024
4025 mov \$1,%eax # ExceptionContinueSearch
4026 add \$64,%rsp
4027 popfq
4028 pop %r15
4029 pop %r14
4030 pop %r13
4031 pop %r12
4032 pop %rbp
4033 pop %rbx
4034 pop %rdi
4035 pop %rsi
4036 ret
4037.size avx_handler,.-avx_handler
4038
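# Win64 SEH tables. Each .pdata entry is a RUNTIME_FUNCTION triplet of
# image-relative addresses: start of the covered code, end of the covered
# code, and the matching UNWIND_INFO record in .xdata below. The AVX/AVX2
# blocks routines are split into several ranges because their scalar and
# SIMD segments use different frames and therefore different handler data.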
4039.section .pdata
4040.align 4
4041 .rva .LSEH_begin_poly1305_init
4042 .rva .LSEH_end_poly1305_init
4043 .rva .LSEH_info_poly1305_init
4044
4045 .rva .LSEH_begin_poly1305_blocks
4046 .rva .LSEH_end_poly1305_blocks
4047 .rva .LSEH_info_poly1305_blocks
4048
4049 .rva .LSEH_begin_poly1305_emit
4050 .rva .LSEH_end_poly1305_emit
4051 .rva .LSEH_info_poly1305_emit
4052___
4053$code.=<<___ if ($avx);
4054 .rva .LSEH_begin_poly1305_blocks_avx
4055 .rva .Lbase2_64_avx
4056 .rva .LSEH_info_poly1305_blocks_avx_1
4057
4058 .rva .Lbase2_64_avx
4059 .rva .Leven_avx
4060 .rva .LSEH_info_poly1305_blocks_avx_2
4061
4062 .rva .Leven_avx
4063 .rva .LSEH_end_poly1305_blocks_avx
4064 .rva .LSEH_info_poly1305_blocks_avx_3
4065
4066 .rva .LSEH_begin_poly1305_emit_avx
4067 .rva .LSEH_end_poly1305_emit_avx
4068 .rva .LSEH_info_poly1305_emit_avx
4069___
4070$code.=<<___ if ($avx>1);
4071 .rva .LSEH_begin_poly1305_blocks_avx2
4072 .rva .Lbase2_64_avx2
4073 .rva .LSEH_info_poly1305_blocks_avx2_1
4074
4075 .rva .Lbase2_64_avx2
4076 .rva .Leven_avx2
4077 .rva .LSEH_info_poly1305_blocks_avx2_2
4078
4079 .rva .Leven_avx2
4080 .rva .LSEH_end_poly1305_blocks_avx2
4081 .rva .LSEH_info_poly1305_blocks_avx2_3
4082___
4083$code.=<<___ if ($avx>2);
4084 .rva .LSEH_begin_poly1305_blocks_avx512
4085 .rva .LSEH_end_poly1305_blocks_avx512
4086 .rva .LSEH_info_poly1305_blocks_avx512
4087___
4088$code.=<<___;
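# Every UNWIND_INFO record below starts with .byte 9,0,0,0: version 1 with
# UNW_FLAG_EHANDLER set, zero prologue size and zero unwind codes. The
# exception handler RVA follows, then the HandlerData[] pair of labels the
# handler compares against context->Rip.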
4089.section .xdata
4090.align 8
4091.LSEH_info_poly1305_init:
4092 .byte 9,0,0,0
4093 .rva se_handler
4094	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init	# HandlerData[]: empty range, nothing to restore
4095
4096.LSEH_info_poly1305_blocks:
4097 .byte 9,0,0,0
4098 .rva se_handler
4099 .rva .Lblocks_body,.Lblocks_epilogue
4100
4101.LSEH_info_poly1305_emit:
4102 .byte 9,0,0,0
4103 .rva se_handler
4104	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit	# HandlerData[]: empty range, nothing to restore
4105___
4106$code.=<<___ if ($avx);
4107.LSEH_info_poly1305_blocks_avx_1:
4108 .byte 9,0,0,0
4109 .rva se_handler
4110 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4111
4112.LSEH_info_poly1305_blocks_avx_2:
4113 .byte 9,0,0,0
4114 .rva se_handler
4115 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4116
4117.LSEH_info_poly1305_blocks_avx_3:
4118 .byte 9,0,0,0
4119 .rva avx_handler
4120 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4121
4122.LSEH_info_poly1305_emit_avx:
4123 .byte 9,0,0,0
4124 .rva se_handler
4125	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx	# HandlerData[]: empty range, nothing to restore
4126___
4127$code.=<<___ if ($avx>1);
4128.LSEH_info_poly1305_blocks_avx2_1:
4129 .byte 9,0,0,0
4130 .rva se_handler
4131 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4132
4133.LSEH_info_poly1305_blocks_avx2_2:
4134 .byte 9,0,0,0
4135 .rva se_handler
4136 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4137
4138.LSEH_info_poly1305_blocks_avx2_3:
4139 .byte 9,0,0,0
4140 .rva avx_handler
4141 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4142___
4143$code.=<<___ if ($avx>2);
4144.LSEH_info_poly1305_blocks_avx512:
4145 .byte 9,0,0,0
4146 .rva avx_handler
4147 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
4148___
4149}
4150
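# Final output pass: expand `...` constructs via eval, rewrite the
# register-size annotations used throughout the module ("%rax#d" becomes
# "%eax", "%r8#d" becomes "%r8d"), and resolve "%x#%y"-style markers by
# keeping the register-width prefix written before the '#', i.e. selecting
# between the xmm/ymm/zmm forms of the same register.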
4151foreach (split('\n',$code)) {
4152 s/\`([^\`]*)\`/eval($1)/ge;
4153 s/%r([a-z]+)#d/%e$1/g;
4154 s/%r([0-9]+)#d/%r$1d/g;
4155	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4156
4157 print $_,"\n";
4158}
4159close STDOUT or die "error closing STDOUT: $!";