#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. The trigger for the modification was
# the observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress the
# AVX512F capability flag [at least on Skylake-X], the conversion serves
# as a kind of "investment protection". Note that the next *lake
# processor, Cannon Lake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#               IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
# P4            4.46/+120%      -
# Core 2        2.41/+90%       -
# Westmere      1.88/+120%      -
# Sandy Bridge  1.39/+140%      1.10
# Haswell       1.14/+175%      1.11            0.65
# Skylake[-X]   1.13/+120%      0.96            0.51    [0.35]
# Silvermont    2.83/+95%       -
# Knights L     3.60/?          1.65            1.10    0.41(***)
# Goldmont      1.70/+180%      -
# VIA Nano      1.82/+150%      -
# Sledgehammer  1.38/+160%      -
# Bulldozer     2.30/+130%      0.97
# Ryzen         1.15/+200%      1.08            1.18
#
# (*)   improvement coefficients relative to clang are more modest and
#       are ~50% on most processors; in both cases we are comparing to
#       __int128 code;
# (**)  an SSE2 implementation was attempted, but among non-AVX
#       processors it was faster than the integer-only code only on
#       older Intel P4 and Core processors, by 30-50% (the newer the
#       processor, the smaller the gain), and slower on contemporary
#       ones, for example almost 2x slower on Atom; as the former are
#       naturally disappearing, SSE2 is deemed unnecessary;
# (***) strangely enough, performance seems to vary from core to core;
#       listed result is best case;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
    $avx += 2 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
        `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx = ($2>=3.0) + ($2>3.0);
}

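# $avx ends up as a rough capability level derived from the assembler
# probes above: judging from the guards used further down, $avx>=1 gates
# the AVX code path, $avx>=2 the AVX2 path, $avx>=3 the AVX512F path and
# $avx>=4 the base 2^44 initialization reached via .Linit_base2_44.
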
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);   # *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");

sub poly1305_iteration {
# input:  copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output: $h0-$h2 *= $r0-$r1
$code.=<<___;
        mulq    $h0                     # h0*r1
        mov     %rax,$d2
        mov     $r0,%rax
        mov     %rdx,$d3

        mulq    $h0                     # h0*r0
        mov     %rax,$h0                # future $h0
        mov     $r0,%rax
        mov     %rdx,$d1

        mulq    $h1                     # h1*r0
        add     %rax,$d2
        mov     $s1,%rax
        adc     %rdx,$d3

        mulq    $h1                     # h1*s1
        mov     $h2,$h1                 # borrow $h1
        add     %rax,$h0
        adc     %rdx,$d1

        imulq   $s1,$h1                 # h2*s1
        add     $h1,$d2
        mov     $d1,$h1
        adc     \$0,$d3

        imulq   $r0,$h2                 # h2*r0
        add     $d2,$h1
        mov     \$-4,%rax               # mask value
        adc     $h2,$d3

        and     $d3,%rax                # last reduction step
        mov     $d3,$h2
        shr     \$2,$d3
        and     \$3,$h2
        add     $d3,%rax
        add     %rax,$h0
        adc     \$0,$h1
        adc     \$0,$h2
___
}
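# The iteration above computes ($h2:$h1:$h0) = ($h2:$h1:$h0)*($r1:$r0)
# modulo 2^130-5.  Two observations make the on-the-fly reduction cheap:
#
#  - key clamping forces $r1 to be divisible by 4, so
#    $s1 = $r1 + ($r1>>2) = 5*$r1/4 is exact, and products that land at
#    or above weight 2^130 can be folded down by multiplying with $s1
#    instead of $r1, because 2^130 = 5 (mod 2^130-5);
#  - in the "last reduction step" the bits of the top limb at weight
#    2^130 and above, i.e. $d3 with its low two bits cleared, are folded
#    back in as ($d3 & -4) + ($d3>>2) = 5*($d3>>2), while the two low
#    bits stay in $h2.
#
# The result is only partially reduced: $h2 may still hold a few bits,
# which is fine because the next iteration folds them in again.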

########################################################################
# The layout of the opaque area is as follows:
#
#       unsigned __int64 h[3];          # current hash value base 2^64
#       unsigned __int64 r[2];          # key value base 2^64

158$code.=<<___;
159.text
160
161.extern OPENSSL_ia32cap_P
162
163.globl poly1305_init
3992e8c0 164.hidden poly1305_init
4ef29667 165.globl poly1305_blocks
3992e8c0 166.hidden poly1305_blocks
4ef29667 167.globl poly1305_emit
168.hidden poly1305_emit
169
a85dbf11 170.type poly1305_init,\@function,3
171.align 32
172poly1305_init:
048fa13e 173.cfi_startproc
174 xor %rax,%rax
175 mov %rax,0($ctx) # initialize hash value
176 mov %rax,8($ctx)
177 mov %rax,16($ctx)
178
179 cmp \$0,$inp
180 je .Lno_key
181
182 lea poly1305_blocks(%rip),%r10
183 lea poly1305_emit(%rip),%r11
184___
185$code.=<<___ if ($avx);
186 mov OPENSSL_ia32cap_P+4(%rip),%r9
187 lea poly1305_blocks_avx(%rip),%rax
188 lea poly1305_emit_avx(%rip),%rcx
189 bt \$`60-32`,%r9 # AVX?
190 cmovc %rax,%r10
191 cmovc %rcx,%r11
192___
193$code.=<<___ if ($avx>1);
194 lea poly1305_blocks_avx2(%rip),%rax
195 bt \$`5+32`,%r9 # AVX2?
196 cmovc %rax,%r10
197___
198$code.=<<___ if ($avx>3);
199 mov \$`(1<<31|1<<21|1<<16)`,%rax
200 shr \$32,%r9
201 and %rax,%r9
202 cmp %rax,%r9
203 je .Linit_base2_44
204___
205$code.=<<___;
206 mov \$0x0ffffffc0fffffff,%rax
207 mov \$0x0ffffffc0ffffffc,%rcx
208 and 0($inp),%rax
209 and 8($inp),%rcx
210 mov %rax,24($ctx)
211 mov %rcx,32($ctx)
212___
213$code.=<<___ if ($flavour !~ /elf32/);
214 mov %r10,0(%rdx)
215 mov %r11,8(%rdx)
216___
217$code.=<<___ if ($flavour =~ /elf32/);
218 mov %r10d,0(%rdx)
219 mov %r11d,4(%rdx)
220___
221$code.=<<___;
222 mov \$1,%eax
223.Lno_key:
224 ret
048fa13e 225.cfi_endproc
226.size poly1305_init,.-poly1305_init
227
228.type poly1305_blocks,\@function,4
229.align 32
230poly1305_blocks:
1c47e883 231.cfi_startproc
a85dbf11 232.Lblocks:
233 shr \$4,$len
234 jz .Lno_data # too short
235
236 push %rbx
1c47e883 237.cfi_push %rbx
a98c648e 238 push %rbp
1c47e883 239.cfi_push %rbp
a98c648e 240 push %r12
1c47e883 241.cfi_push %r12
a98c648e 242 push %r13
1c47e883 243.cfi_push %r13
a98c648e 244 push %r14
1c47e883 245.cfi_push %r14
a98c648e 246 push %r15
1c47e883 247.cfi_push %r15
248.Lblocks_body:
249
250 mov $len,%r15 # reassign $len
251
252 mov 24($ctx),$r0 # load r
253 mov 32($ctx),$s1
254
255 mov 0($ctx),$h0 # load hash value
256 mov 8($ctx),$h1
257 mov 16($ctx),$h2
258
259 mov $s1,$r1
260 shr \$2,$s1
261 mov $r1,%rax
262 add $r1,$s1 # s1 = r1 + (r1 >> 2)
263 jmp .Loop
264
265.align 32
266.Loop:
267 add 0($inp),$h0 # accumulate input
268 adc 8($inp),$h1
269 lea 16($inp),$inp
270 adc $padbit,$h2
271___
272 &poly1305_iteration();
273$code.=<<___;
274 mov $r1,%rax
275 dec %r15 # len-=16
276 jnz .Loop
277
278 mov $h0,0($ctx) # store hash value
279 mov $h1,8($ctx)
280 mov $h2,16($ctx)
281
282 mov 0(%rsp),%r15
1c47e883 283.cfi_restore %r15
a98c648e 284 mov 8(%rsp),%r14
1c47e883 285.cfi_restore %r14
a98c648e 286 mov 16(%rsp),%r13
1c47e883 287.cfi_restore %r13
a98c648e 288 mov 24(%rsp),%r12
1c47e883 289.cfi_restore %r12
a98c648e 290 mov 32(%rsp),%rbp
1c47e883 291.cfi_restore %rbp
a98c648e 292 mov 40(%rsp),%rbx
1c47e883 293.cfi_restore %rbx
a98c648e 294 lea 48(%rsp),%rsp
1c47e883 295.cfi_adjust_cfa_offset -48
296.Lno_data:
297.Lblocks_epilogue:
298 ret
1c47e883 299.cfi_endproc
300.size poly1305_blocks,.-poly1305_blocks
301
302.type poly1305_emit,\@function,3
303.align 32
304poly1305_emit:
048fa13e 305.cfi_startproc
a85dbf11 306.Lemit:
307 mov 0($ctx),%r8 # load hash value
308 mov 8($ctx),%r9
309 mov 16($ctx),%r10
310
311 mov %r8,%rax
312 add \$5,%r8 # compare to modulus
313 mov %r9,%rcx
314 adc \$0,%r9
315 adc \$0,%r10
46f4e1be 316 shr \$2,%r10 # did 130-bit value overflow?
317 cmovnz %r8,%rax
318 cmovnz %r9,%rcx
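	# The sequence above performs the final conditional subtraction of
	# p = 2^130-5: adding 5 to h carries into bit 130 exactly when
	# h >= p, so after the right shift by two %r10 is non-zero iff
	# h+5 >= 2^130, and the cmovnz pair then selects h+5, whose low
	# 128 bits equal h-p.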
319
320 add 0($nonce),%rax # accumulate nonce
321 adc 8($nonce),%rcx
322 mov %rax,0($mac) # write result
323 mov %rcx,8($mac)
324
325 ret
048fa13e 326.cfi_endproc
327.size poly1305_emit,.-poly1305_emit
328___
329if ($avx) {
330
########################################################################
# The layout of the opaque area is as follows:
#
#       unsigned __int32 h[5];          # current hash value base 2^26
#       unsigned __int32 is_base2_26;
#       unsigned __int64 r[2];          # key value base 2^64
#       unsigned __int64 pad;
#       struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n denotes the base 2^26 digits of the n-th power of the
# multiplier key. There are 5 digits, but the last four are interleaved
# with their multiples of 5, totalling 9 elements:
# r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
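#
# As an illustration of how the code addresses this table: after the
# "lea 48+64($ctx),$ctx" size optimization in __poly1305_init_avx below,
# element k is written at `16*k+{0,4,8,12}-64`($ctx), i.e. at byte offset
# 48+16*k of the real context, with the four 32-bit slots holding the
# k-th base 2^26 digit of r^2, r^1, r^4 and r^3 respectively.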
343
344my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
345 map("%xmm$_",(0..15));
346
347$code.=<<___;
348.type __poly1305_block,\@abi-omnipotent
349.align 32
350__poly1305_block:
048fa13e 351.cfi_startproc
352___
353 &poly1305_iteration();
354$code.=<<___;
355 ret
048fa13e 356.cfi_endproc
357.size __poly1305_block,.-__poly1305_block
358
359.type __poly1305_init_avx,\@abi-omnipotent
360.align 32
361__poly1305_init_avx:
048fa13e 362.cfi_startproc
363 mov $r0,$h0
364 mov $r1,$h1
365 xor $h2,$h2
366
367 lea 48+64($ctx),$ctx # size optimization
368
369 mov $r1,%rax
370 call __poly1305_block # r^2
371
372 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
373 mov \$0x3ffffff,%edx
374 mov $h0,$d1
375 and $h0#d,%eax
376 mov $r0,$d2
377 and $r0#d,%edx
378 mov %eax,`16*0+0-64`($ctx)
379 shr \$26,$d1
380 mov %edx,`16*0+4-64`($ctx)
381 shr \$26,$d2
382
383 mov \$0x3ffffff,%eax
384 mov \$0x3ffffff,%edx
385 and $d1#d,%eax
386 and $d2#d,%edx
387 mov %eax,`16*1+0-64`($ctx)
388 lea (%rax,%rax,4),%eax # *5
389 mov %edx,`16*1+4-64`($ctx)
390 lea (%rdx,%rdx,4),%edx # *5
391 mov %eax,`16*2+0-64`($ctx)
392 shr \$26,$d1
393 mov %edx,`16*2+4-64`($ctx)
394 shr \$26,$d2
395
396 mov $h1,%rax
397 mov $r1,%rdx
398 shl \$12,%rax
399 shl \$12,%rdx
400 or $d1,%rax
401 or $d2,%rdx
402 and \$0x3ffffff,%eax
403 and \$0x3ffffff,%edx
404 mov %eax,`16*3+0-64`($ctx)
405 lea (%rax,%rax,4),%eax # *5
406 mov %edx,`16*3+4-64`($ctx)
407 lea (%rdx,%rdx,4),%edx # *5
408 mov %eax,`16*4+0-64`($ctx)
409 mov $h1,$d1
410 mov %edx,`16*4+4-64`($ctx)
411 mov $r1,$d2
412
413 mov \$0x3ffffff,%eax
414 mov \$0x3ffffff,%edx
415 shr \$14,$d1
416 shr \$14,$d2
417 and $d1#d,%eax
418 and $d2#d,%edx
419 mov %eax,`16*5+0-64`($ctx)
420 lea (%rax,%rax,4),%eax # *5
421 mov %edx,`16*5+4-64`($ctx)
422 lea (%rdx,%rdx,4),%edx # *5
423 mov %eax,`16*6+0-64`($ctx)
424 shr \$26,$d1
425 mov %edx,`16*6+4-64`($ctx)
426 shr \$26,$d2
427
428 mov $h2,%rax
429 shl \$24,%rax
430 or %rax,$d1
431 mov $d1#d,`16*7+0-64`($ctx)
432 lea ($d1,$d1,4),$d1 # *5
433 mov $d2#d,`16*7+4-64`($ctx)
434 lea ($d2,$d2,4),$d2 # *5
435 mov $d1#d,`16*8+0-64`($ctx)
436 mov $d2#d,`16*8+4-64`($ctx)
437
438 mov $r1,%rax
439 call __poly1305_block # r^3
440
441 mov \$0x3ffffff,%eax # save r^3 base 2^26
442 mov $h0,$d1
443 and $h0#d,%eax
444 shr \$26,$d1
445 mov %eax,`16*0+12-64`($ctx)
446
447 mov \$0x3ffffff,%edx
448 and $d1#d,%edx
449 mov %edx,`16*1+12-64`($ctx)
450 lea (%rdx,%rdx,4),%edx # *5
451 shr \$26,$d1
452 mov %edx,`16*2+12-64`($ctx)
453
454 mov $h1,%rax
455 shl \$12,%rax
456 or $d1,%rax
457 and \$0x3ffffff,%eax
458 mov %eax,`16*3+12-64`($ctx)
459 lea (%rax,%rax,4),%eax # *5
460 mov $h1,$d1
461 mov %eax,`16*4+12-64`($ctx)
462
463 mov \$0x3ffffff,%edx
464 shr \$14,$d1
465 and $d1#d,%edx
466 mov %edx,`16*5+12-64`($ctx)
467 lea (%rdx,%rdx,4),%edx # *5
468 shr \$26,$d1
469 mov %edx,`16*6+12-64`($ctx)
470
471 mov $h2,%rax
472 shl \$24,%rax
473 or %rax,$d1
474 mov $d1#d,`16*7+12-64`($ctx)
475 lea ($d1,$d1,4),$d1 # *5
476 mov $d1#d,`16*8+12-64`($ctx)
477
478 mov $r1,%rax
479 call __poly1305_block # r^4
480
481 mov \$0x3ffffff,%eax # save r^4 base 2^26
482 mov $h0,$d1
483 and $h0#d,%eax
484 shr \$26,$d1
485 mov %eax,`16*0+8-64`($ctx)
486
487 mov \$0x3ffffff,%edx
488 and $d1#d,%edx
489 mov %edx,`16*1+8-64`($ctx)
490 lea (%rdx,%rdx,4),%edx # *5
491 shr \$26,$d1
492 mov %edx,`16*2+8-64`($ctx)
493
494 mov $h1,%rax
495 shl \$12,%rax
496 or $d1,%rax
497 and \$0x3ffffff,%eax
498 mov %eax,`16*3+8-64`($ctx)
499 lea (%rax,%rax,4),%eax # *5
500 mov $h1,$d1
501 mov %eax,`16*4+8-64`($ctx)
502
503 mov \$0x3ffffff,%edx
504 shr \$14,$d1
505 and $d1#d,%edx
506 mov %edx,`16*5+8-64`($ctx)
507 lea (%rdx,%rdx,4),%edx # *5
508 shr \$26,$d1
509 mov %edx,`16*6+8-64`($ctx)
510
511 mov $h2,%rax
512 shl \$24,%rax
513 or %rax,$d1
514 mov $d1#d,`16*7+8-64`($ctx)
515 lea ($d1,$d1,4),$d1 # *5
516 mov $d1#d,`16*8+8-64`($ctx)
517
518 lea -48-64($ctx),$ctx # size [de-]optimization
519 ret
048fa13e 520.cfi_endproc
521.size __poly1305_init_avx,.-__poly1305_init_avx
522
523.type poly1305_blocks_avx,\@function,4
524.align 32
525poly1305_blocks_avx:
1c47e883 526.cfi_startproc
527 mov 20($ctx),%r8d # is_base2_26
528 cmp \$128,$len
529 jae .Lblocks_avx
530 test %r8d,%r8d
a85dbf11 531 jz .Lblocks
532
533.Lblocks_avx:
534 and \$-16,$len
535 jz .Lno_data_avx
536
537 vzeroupper
538
539 test %r8d,%r8d
540 jz .Lbase2_64_avx
541
542 test \$31,$len
543 jz .Leven_avx
544
545 push %rbx
1c47e883 546.cfi_push %rbx
a98c648e 547 push %rbp
1c47e883 548.cfi_push %rbp
a98c648e 549 push %r12
1c47e883 550.cfi_push %r12
a98c648e 551 push %r13
1c47e883 552.cfi_push %r13
a98c648e 553 push %r14
1c47e883 554.cfi_push %r14
a98c648e 555 push %r15
1c47e883 556.cfi_push %r15
557.Lblocks_avx_body:
558
559 mov $len,%r15 # reassign $len
560
561 mov 0($ctx),$d1 # load hash value
562 mov 8($ctx),$d2
563 mov 16($ctx),$h2#d
564
565 mov 24($ctx),$r0 # load r
566 mov 32($ctx),$s1
567
568 ################################# base 2^26 -> base 2^64
569 mov $d1#d,$h0#d
28411657 570 and \$`-1*(1<<31)`,$d1
571 mov $d2,$r1 # borrow $r1
572 mov $d2#d,$h1#d
28411657 573 and \$`-1*(1<<31)`,$d2
574
575 shr \$6,$d1
576 shl \$52,$r1
577 add $d1,$h0
578 shr \$12,$h1
579 shr \$18,$d2
580 add $r1,$h0
581 adc $d2,$h1
582
583 mov $h2,$d1
584 shl \$40,$d1
585 shr \$24,$h2
586 add $d1,$h1
587 adc \$0,$h2 # can be partially reduced...
588
589 mov \$-4,$d2 # ... so reduce
590 mov $h2,$d1
591 and $h2,$d2
592 shr \$2,$d1
593 and \$3,$h2
594 add $d2,$d1 # =*5
595 add $d1,$h0
596 adc \$0,$h1
4b8736a2 597 adc \$0,$h2
598
599 mov $s1,$r1
600 mov $s1,%rax
601 shr \$2,$s1
602 add $r1,$s1 # s1 = r1 + (r1 >> 2)
603
604 add 0($inp),$h0 # accumulate input
605 adc 8($inp),$h1
606 lea 16($inp),$inp
607 adc $padbit,$h2
608
609 call __poly1305_block
610
611 test $padbit,$padbit # if $padbit is zero,
612 jz .Lstore_base2_64_avx # store hash in base 2^64 format
613
614 ################################# base 2^64 -> base 2^26
615 mov $h0,%rax
616 mov $h0,%rdx
617 shr \$52,$h0
618 mov $h1,$r0
619 mov $h1,$r1
620 shr \$26,%rdx
621 and \$0x3ffffff,%rax # h[0]
622 shl \$12,$r0
623 and \$0x3ffffff,%rdx # h[1]
624 shr \$14,$h1
625 or $r0,$h0
626 shl \$24,$h2
627 and \$0x3ffffff,$h0 # h[2]
628 shr \$40,$r1
629 and \$0x3ffffff,$h1 # h[3]
630 or $r1,$h2 # h[4]
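	# i.e. the 130-bit value h2:h1:h0 is split into five 26-bit digits:
	# h[0] = h0 mod 2^26, h[1] = (h0>>26) mod 2^26,
	# h[2] = (h0>>52 | h1<<12) mod 2^26, h[3] = (h1>>14) mod 2^26,
	# h[4] = h1>>40 | h2<<24.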
631
632 sub \$16,%r15
633 jz .Lstore_base2_26_avx
634
635 vmovd %rax#d,$H0
636 vmovd %rdx#d,$H1
637 vmovd $h0#d,$H2
638 vmovd $h1#d,$H3
639 vmovd $h2#d,$H4
640 jmp .Lproceed_avx
641
642.align 32
643.Lstore_base2_64_avx:
644 mov $h0,0($ctx)
645 mov $h1,8($ctx)
646 mov $h2,16($ctx) # note that is_base2_26 is zeroed
647 jmp .Ldone_avx
648
649.align 16
650.Lstore_base2_26_avx:
651 mov %rax#d,0($ctx) # store hash value base 2^26
652 mov %rdx#d,4($ctx)
653 mov $h0#d,8($ctx)
654 mov $h1#d,12($ctx)
655 mov $h2#d,16($ctx)
656.align 16
657.Ldone_avx:
658 mov 0(%rsp),%r15
1c47e883 659.cfi_restore %r15
a98c648e 660 mov 8(%rsp),%r14
1c47e883 661.cfi_restore %r14
a98c648e 662 mov 16(%rsp),%r13
1c47e883 663.cfi_restore %r13
a98c648e 664 mov 24(%rsp),%r12
1c47e883 665.cfi_restore %r12
a98c648e 666 mov 32(%rsp),%rbp
1c47e883 667.cfi_restore %rbp
a98c648e 668 mov 40(%rsp),%rbx
1c47e883 669.cfi_restore %rbx
a98c648e 670 lea 48(%rsp),%rsp
1c47e883 671.cfi_adjust_cfa_offset -48
672.Lno_data_avx:
673.Lblocks_avx_epilogue:
674 ret
1c47e883 675.cfi_endproc
676
677.align 32
678.Lbase2_64_avx:
1c47e883 679.cfi_startproc
a98c648e 680 push %rbx
1c47e883 681.cfi_push %rbx
a98c648e 682 push %rbp
1c47e883 683.cfi_push %rbp
a98c648e 684 push %r12
1c47e883 685.cfi_push %r12
a98c648e 686 push %r13
1c47e883 687.cfi_push %r13
a98c648e 688 push %r14
1c47e883 689.cfi_push %r14
a98c648e 690 push %r15
1c47e883 691.cfi_push %r15
692.Lbase2_64_avx_body:
693
694 mov $len,%r15 # reassign $len
695
696 mov 24($ctx),$r0 # load r
697 mov 32($ctx),$s1
698
699 mov 0($ctx),$h0 # load hash value
700 mov 8($ctx),$h1
701 mov 16($ctx),$h2#d
702
703 mov $s1,$r1
704 mov $s1,%rax
705 shr \$2,$s1
706 add $r1,$s1 # s1 = r1 + (r1 >> 2)
707
708 test \$31,$len
709 jz .Linit_avx
710
711 add 0($inp),$h0 # accumulate input
712 adc 8($inp),$h1
713 lea 16($inp),$inp
714 adc $padbit,$h2
715 sub \$16,%r15
716
717 call __poly1305_block
718
719.Linit_avx:
720 ################################# base 2^64 -> base 2^26
721 mov $h0,%rax
722 mov $h0,%rdx
723 shr \$52,$h0
724 mov $h1,$d1
725 mov $h1,$d2
726 shr \$26,%rdx
727 and \$0x3ffffff,%rax # h[0]
728 shl \$12,$d1
729 and \$0x3ffffff,%rdx # h[1]
730 shr \$14,$h1
731 or $d1,$h0
732 shl \$24,$h2
733 and \$0x3ffffff,$h0 # h[2]
734 shr \$40,$d2
735 and \$0x3ffffff,$h1 # h[3]
736 or $d2,$h2 # h[4]
737
738 vmovd %rax#d,$H0
739 vmovd %rdx#d,$H1
740 vmovd $h0#d,$H2
741 vmovd $h1#d,$H3
742 vmovd $h2#d,$H4
743 movl \$1,20($ctx) # set is_base2_26
744
745 call __poly1305_init_avx
746
747.Lproceed_avx:
748 mov %r15,$len
749
750 mov 0(%rsp),%r15
1c47e883 751.cfi_restore %r15
a98c648e 752 mov 8(%rsp),%r14
1c47e883 753.cfi_restore %r14
a98c648e 754 mov 16(%rsp),%r13
1c47e883 755.cfi_restore %r13
a98c648e 756 mov 24(%rsp),%r12
1c47e883 757.cfi_restore %r12
a98c648e 758 mov 32(%rsp),%rbp
1c47e883 759.cfi_restore %rbp
a98c648e 760 mov 40(%rsp),%rbx
1c47e883 761.cfi_restore %rbx
762 lea 48(%rsp),%rax
763 lea 48(%rsp),%rsp
1c47e883 764.cfi_adjust_cfa_offset -48
765.Lbase2_64_avx_epilogue:
766 jmp .Ldo_avx
1c47e883 767.cfi_endproc
768
769.align 32
770.Leven_avx:
1c47e883 771.cfi_startproc
772 vmovd 4*0($ctx),$H0 # load hash value
773 vmovd 4*1($ctx),$H1
774 vmovd 4*2($ctx),$H2
775 vmovd 4*3($ctx),$H3
776 vmovd 4*4($ctx),$H4
777
778.Ldo_avx:
779___
780$code.=<<___ if (!$win64);
781 lea -0x58(%rsp),%r11
1c47e883 782.cfi_def_cfa %r11,0x60
783 sub \$0x178,%rsp
784___
785$code.=<<___ if ($win64);
786 lea -0xf8(%rsp),%r11
787 sub \$0x218,%rsp
788 vmovdqa %xmm6,0x50(%r11)
789 vmovdqa %xmm7,0x60(%r11)
790 vmovdqa %xmm8,0x70(%r11)
791 vmovdqa %xmm9,0x80(%r11)
792 vmovdqa %xmm10,0x90(%r11)
793 vmovdqa %xmm11,0xa0(%r11)
794 vmovdqa %xmm12,0xb0(%r11)
795 vmovdqa %xmm13,0xc0(%r11)
796 vmovdqa %xmm14,0xd0(%r11)
797 vmovdqa %xmm15,0xe0(%r11)
798.Ldo_avx_body:
799___
800$code.=<<___;
801 sub \$64,$len
802 lea -32($inp),%rax
803 cmovc %rax,$inp
804
805 vmovdqu `16*3`($ctx),$D4 # preload r0^2
806 lea `16*3+64`($ctx),$ctx # size optimization
807 lea .Lconst(%rip),%rcx
808
809 ################################################################
810 # load input
811 vmovdqu 16*2($inp),$T0
812 vmovdqu 16*3($inp),$T1
813 vmovdqa 64(%rcx),$MASK # .Lmask26
814
815 vpsrldq \$6,$T0,$T2 # splat input
816 vpsrldq \$6,$T1,$T3
817 vpunpckhqdq $T1,$T0,$T4 # 4
818 vpunpcklqdq $T1,$T0,$T0 # 0:1
819 vpunpcklqdq $T3,$T2,$T3 # 2:3
820
821 vpsrlq \$40,$T4,$T4 # 4
822 vpsrlq \$26,$T0,$T1
823 vpand $MASK,$T0,$T0 # 0
824 vpsrlq \$4,$T3,$T2
825 vpand $MASK,$T1,$T1 # 1
826 vpsrlq \$30,$T3,$T3
827 vpand $MASK,$T2,$T2 # 2
828 vpand $MASK,$T3,$T3 # 3
829 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
830
831 jbe .Lskip_loop_avx
832
833 # expand and copy pre-calculated table to stack
834 vmovdqu `16*1-64`($ctx),$D1
835 vmovdqu `16*2-64`($ctx),$D2
836 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
837 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
838 vmovdqa $D3,-0x90(%r11)
839 vmovdqa $D0,0x00(%rsp)
840 vpshufd \$0xEE,$D1,$D4
841 vmovdqu `16*3-64`($ctx),$D0
842 vpshufd \$0x44,$D1,$D1
843 vmovdqa $D4,-0x80(%r11)
844 vmovdqa $D1,0x10(%rsp)
845 vpshufd \$0xEE,$D2,$D3
846 vmovdqu `16*4-64`($ctx),$D1
847 vpshufd \$0x44,$D2,$D2
848 vmovdqa $D3,-0x70(%r11)
849 vmovdqa $D2,0x20(%rsp)
850 vpshufd \$0xEE,$D0,$D4
851 vmovdqu `16*5-64`($ctx),$D2
852 vpshufd \$0x44,$D0,$D0
853 vmovdqa $D4,-0x60(%r11)
854 vmovdqa $D0,0x30(%rsp)
855 vpshufd \$0xEE,$D1,$D3
856 vmovdqu `16*6-64`($ctx),$D0
857 vpshufd \$0x44,$D1,$D1
858 vmovdqa $D3,-0x50(%r11)
859 vmovdqa $D1,0x40(%rsp)
860 vpshufd \$0xEE,$D2,$D4
861 vmovdqu `16*7-64`($ctx),$D1
862 vpshufd \$0x44,$D2,$D2
863 vmovdqa $D4,-0x40(%r11)
864 vmovdqa $D2,0x50(%rsp)
865 vpshufd \$0xEE,$D0,$D3
866 vmovdqu `16*8-64`($ctx),$D2
867 vpshufd \$0x44,$D0,$D0
868 vmovdqa $D3,-0x30(%r11)
869 vmovdqa $D0,0x60(%rsp)
870 vpshufd \$0xEE,$D1,$D4
871 vpshufd \$0x44,$D1,$D1
872 vmovdqa $D4,-0x20(%r11)
873 vmovdqa $D1,0x70(%rsp)
874 vpshufd \$0xEE,$D2,$D3
875 vmovdqa 0x00(%rsp),$D4 # preload r0^2
876 vpshufd \$0x44,$D2,$D2
877 vmovdqa $D3,-0x10(%r11)
878 vmovdqa $D2,0x80(%rsp)
879
880 jmp .Loop_avx
881
882.align 32
883.Loop_avx:
	################################################################
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	#   \___________________/
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	#   \___________________/ \____________________/
	#
	# Note that we start with inp[2:3]*r^2. This is because it
	# doesn't depend on the reduction in the previous iteration.
	################################################################
	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	#
	# though note that $Tx and $Hx are "reversed" in this section,
	# and $D4 is preloaded with r0^2...
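	#
	# As a concrete illustration of the split, for four blocks m1..m4:
	#	h = ((((h_in+m1)*r + m2)*r + m3)*r + m4)*r
	#	  = ((h_in+m1)*r^2 + m3)*r^2 + (m2*r^2 + m4)*r
	# i.e. alternate blocks are accumulated in the two vector lanes
	# with powers of r^2, and the lanes are tied together by the final
	# per-lane multiplications by r^2 and r respectively.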
903
904 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
905 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
906 vmovdqa $H2,0x20(%r11) # offload hash
	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
908 vmovdqa 0x10(%rsp),$H2 # r1^2
909 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
910 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
911
912 vmovdqa $H0,0x00(%r11) #
913 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
914 vmovdqa $H1,0x10(%r11) #
915 vpmuludq $T3,$H2,$H1 # h3*r1
916 vpaddq $H0,$D0,$D0 # d0 += h4*s1
917 vpaddq $H1,$D4,$D4 # d4 += h3*r1
918 vmovdqa $H3,0x30(%r11) #
919 vpmuludq $T2,$H2,$H0 # h2*r1
920 vpmuludq $T1,$H2,$H1 # h1*r1
921 vpaddq $H0,$D3,$D3 # d3 += h2*r1
922 vmovdqa 0x30(%rsp),$H3 # r2^2
923 vpaddq $H1,$D2,$D2 # d2 += h1*r1
924 vmovdqa $H4,0x40(%r11) #
925 vpmuludq $T0,$H2,$H2 # h0*r1
926 vpmuludq $T2,$H3,$H0 # h2*r2
927 vpaddq $H2,$D1,$D1 # d1 += h0*r1
928
929 vmovdqa 0x40(%rsp),$H4 # s2^2
930 vpaddq $H0,$D4,$D4 # d4 += h2*r2
931 vpmuludq $T1,$H3,$H1 # h1*r2
932 vpmuludq $T0,$H3,$H3 # h0*r2
933 vpaddq $H1,$D3,$D3 # d3 += h1*r2
934 vmovdqa 0x50(%rsp),$H2 # r3^2
935 vpaddq $H3,$D2,$D2 # d2 += h0*r2
936 vpmuludq $T4,$H4,$H0 # h4*s2
937 vpmuludq $T3,$H4,$H4 # h3*s2
938 vpaddq $H0,$D1,$D1 # d1 += h4*s2
939 vmovdqa 0x60(%rsp),$H3 # s3^2
940 vpaddq $H4,$D0,$D0 # d0 += h3*s2
941
942 vmovdqa 0x80(%rsp),$H4 # s4^2
943 vpmuludq $T1,$H2,$H1 # h1*r3
944 vpmuludq $T0,$H2,$H2 # h0*r3
945 vpaddq $H1,$D4,$D4 # d4 += h1*r3
946 vpaddq $H2,$D3,$D3 # d3 += h0*r3
947 vpmuludq $T4,$H3,$H0 # h4*s3
948 vpmuludq $T3,$H3,$H1 # h3*s3
949 vpaddq $H0,$D2,$D2 # d2 += h4*s3
950 vmovdqu 16*0($inp),$H0 # load input
951 vpaddq $H1,$D1,$D1 # d1 += h3*s3
952 vpmuludq $T2,$H3,$H3 # h2*s3
953 vpmuludq $T2,$H4,$T2 # h2*s4
954 vpaddq $H3,$D0,$D0 # d0 += h2*s3
955
956 vmovdqu 16*1($inp),$H1 #
957 vpaddq $T2,$D1,$D1 # d1 += h2*s4
958 vpmuludq $T3,$H4,$T3 # h3*s4
959 vpmuludq $T4,$H4,$T4 # h4*s4
960 vpsrldq \$6,$H0,$H2 # splat input
961 vpaddq $T3,$D2,$D2 # d2 += h3*s4
962 vpaddq $T4,$D3,$D3 # d3 += h4*s4
963 vpsrldq \$6,$H1,$H3 #
964 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
965 vpmuludq $T1,$H4,$T0 # h1*s4
966 vpunpckhqdq $H1,$H0,$H4 # 4
967 vpaddq $T4,$D4,$D4 # d4 += h0*r4
968 vmovdqa -0x90(%r11),$T4 # r0^4
969 vpaddq $T0,$D0,$D0 # d0 += h1*s4
970
971 vpunpcklqdq $H1,$H0,$H0 # 0:1
972 vpunpcklqdq $H3,$H2,$H3 # 2:3
973
974 #vpsrlq \$40,$H4,$H4 # 4
975 vpsrldq \$`40/8`,$H4,$H4 # 4
976 vpsrlq \$26,$H0,$H1
977 vpand $MASK,$H0,$H0 # 0
978 vpsrlq \$4,$H3,$H2
979 vpand $MASK,$H1,$H1 # 1
980 vpand 0(%rcx),$H4,$H4 # .Lmask24
981 vpsrlq \$30,$H3,$H3
982 vpand $MASK,$H2,$H2 # 2
983 vpand $MASK,$H3,$H3 # 3
984 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
985
986 vpaddq 0x00(%r11),$H0,$H0 # add hash value
987 vpaddq 0x10(%r11),$H1,$H1
988 vpaddq 0x20(%r11),$H2,$H2
989 vpaddq 0x30(%r11),$H3,$H3
990 vpaddq 0x40(%r11),$H4,$H4
991
992 lea 16*2($inp),%rax
993 lea 16*4($inp),$inp
994 sub \$64,$len
995 cmovc %rax,$inp
996
997 ################################################################
998 # Now we accumulate (inp[0:1]+hash)*r^4
999 ################################################################
1000 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1001 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1002 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1003 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1004 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1005
1006 vpmuludq $H0,$T4,$T0 # h0*r0
1007 vpmuludq $H1,$T4,$T1 # h1*r0
1008 vpaddq $T0,$D0,$D0
1009 vpaddq $T1,$D1,$D1
1010 vmovdqa -0x80(%r11),$T2 # r1^4
1011 vpmuludq $H2,$T4,$T0 # h2*r0
1012 vpmuludq $H3,$T4,$T1 # h3*r0
1013 vpaddq $T0,$D2,$D2
1014 vpaddq $T1,$D3,$D3
1015 vpmuludq $H4,$T4,$T4 # h4*r0
1016 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1017 vpaddq $T4,$D4,$D4
1018
1019 vpaddq $T0,$D0,$D0 # d0 += h4*s1
1020 vpmuludq $H2,$T2,$T1 # h2*r1
1021 vpmuludq $H3,$T2,$T0 # h3*r1
1022 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1023 vmovdqa -0x60(%r11),$T3 # r2^4
1024 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1025 vpmuludq $H1,$T2,$T1 # h1*r1
1026 vpmuludq $H0,$T2,$T2 # h0*r1
1027 vpaddq $T1,$D2,$D2 # d2 += h1*r1
1028 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1029
1030 vmovdqa -0x50(%r11),$T4 # s2^4
1031 vpmuludq $H2,$T3,$T0 # h2*r2
1032 vpmuludq $H1,$T3,$T1 # h1*r2
1033 vpaddq $T0,$D4,$D4 # d4 += h2*r2
1034 vpaddq $T1,$D3,$D3 # d3 += h1*r2
1035 vmovdqa -0x40(%r11),$T2 # r3^4
1036 vpmuludq $H0,$T3,$T3 # h0*r2
1037 vpmuludq $H4,$T4,$T0 # h4*s2
1038 vpaddq $T3,$D2,$D2 # d2 += h0*r2
1039 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1040 vmovdqa -0x30(%r11),$T3 # s3^4
1041 vpmuludq $H3,$T4,$T4 # h3*s2
1042 vpmuludq $H1,$T2,$T1 # h1*r3
1043 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1044
1045 vmovdqa -0x10(%r11),$T4 # s4^4
1046 vpaddq $T1,$D4,$D4 # d4 += h1*r3
1047 vpmuludq $H0,$T2,$T2 # h0*r3
1048 vpmuludq $H4,$T3,$T0 # h4*s3
1049 vpaddq $T2,$D3,$D3 # d3 += h0*r3
1050 vpaddq $T0,$D2,$D2 # d2 += h4*s3
1051 vmovdqu 16*2($inp),$T0 # load input
1052 vpmuludq $H3,$T3,$T2 # h3*s3
1053 vpmuludq $H2,$T3,$T3 # h2*s3
1054 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1055 vmovdqu 16*3($inp),$T1 #
1056 vpaddq $T3,$D0,$D0 # d0 += h2*s3
1057
1058 vpmuludq $H2,$T4,$H2 # h2*s4
1059 vpmuludq $H3,$T4,$H3 # h3*s4
1060 vpsrldq \$6,$T0,$T2 # splat input
1061 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1062 vpmuludq $H4,$T4,$H4 # h4*s4
1063 vpsrldq \$6,$T1,$T3 #
1064 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1065 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1066 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1067 vpmuludq $H1,$T4,$H0
1068 vpunpckhqdq $T1,$T0,$T4 # 4
1069 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1070 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1071
1072 vpunpcklqdq $T1,$T0,$T0 # 0:1
1073 vpunpcklqdq $T3,$T2,$T3 # 2:3
1074
1075 #vpsrlq \$40,$T4,$T4 # 4
1076 vpsrldq \$`40/8`,$T4,$T4 # 4
1077 vpsrlq \$26,$T0,$T1
1078 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1079 vpand $MASK,$T0,$T0 # 0
1080 vpsrlq \$4,$T3,$T2
1081 vpand $MASK,$T1,$T1 # 1
1082 vpand 0(%rcx),$T4,$T4 # .Lmask24
1083 vpsrlq \$30,$T3,$T3
1084 vpand $MASK,$T2,$T2 # 2
1085 vpand $MASK,$T3,$T3 # 3
1086 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1087
1088 ################################################################
1089 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1090 # and P. Schwabe
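	#
	# In the lazy reduction each 26-bit limb may temporarily grow beyond
	# 26 bits; the short chain of carry propagations below brings every
	# limb back under control without producing a canonical (fully
	# reduced) value.  The carry out of h4 is multiplied by 5, because
	# 2^130 = 5 (mod 2^130-5); this is done as c + 4*c, i.e. an add, a
	# shift left by two and a second add.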
1091
1092 vpsrlq \$26,$H3,$D3
1093 vpand $MASK,$H3,$H3
1094 vpaddq $D3,$H4,$H4 # h3 -> h4
1095
1096 vpsrlq \$26,$H0,$D0
1097 vpand $MASK,$H0,$H0
1098 vpaddq $D0,$D1,$H1 # h0 -> h1
1099
1100 vpsrlq \$26,$H4,$D0
1101 vpand $MASK,$H4,$H4
1102
1103 vpsrlq \$26,$H1,$D1
1104 vpand $MASK,$H1,$H1
1105 vpaddq $D1,$H2,$H2 # h1 -> h2
1106
1107 vpaddq $D0,$H0,$H0
1108 vpsllq \$2,$D0,$D0
1109 vpaddq $D0,$H0,$H0 # h4 -> h0
1110
1111 vpsrlq \$26,$H2,$D2
1112 vpand $MASK,$H2,$H2
1113 vpaddq $D2,$H3,$H3 # h2 -> h3
1114
1115 vpsrlq \$26,$H0,$D0
1116 vpand $MASK,$H0,$H0
1117 vpaddq $D0,$H1,$H1 # h0 -> h1
1118
1119 vpsrlq \$26,$H3,$D3
1120 vpand $MASK,$H3,$H3
1121 vpaddq $D3,$H4,$H4 # h3 -> h4
1122
1123 ja .Loop_avx
1124
1125.Lskip_loop_avx:
1126 ################################################################
1127 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1128
1129 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1130 add \$32,$len
1131 jnz .Long_tail_avx
1132
1133 vpaddq $H2,$T2,$T2
1134 vpaddq $H0,$T0,$T0
1135 vpaddq $H1,$T1,$T1
1136 vpaddq $H3,$T3,$T3
1137 vpaddq $H4,$T4,$T4
1138
1139.Long_tail_avx:
1140 vmovdqa $H2,0x20(%r11)
1141 vmovdqa $H0,0x00(%r11)
1142 vmovdqa $H1,0x10(%r11)
1143 vmovdqa $H3,0x30(%r11)
1144 vmovdqa $H4,0x40(%r11)
1145
1146 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1147 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1148 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1149 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1150 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1151
1152 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1153 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1154 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1155 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1156 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1157 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1158
1159 vpmuludq $T3,$H2,$H0 # h3*r1
1160 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1161 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1162 vpmuludq $T2,$H2,$H1 # h2*r1
1163 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1164 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1165 vpmuludq $T1,$H2,$H0 # h1*r1
1166 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1167 vpmuludq $T0,$H2,$H2 # h0*r1
1168 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1169 vpmuludq $T4,$H3,$H3 # h4*s1
1170 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1171
1172 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1173 vpmuludq $T2,$H4,$H1 # h2*r2
1174 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1175 vpmuludq $T1,$H4,$H0 # h1*r2
1176 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1177 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1178 vpmuludq $T0,$H4,$H4 # h0*r2
1179 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1180 vpmuludq $T4,$H2,$H1 # h4*s2
1181 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1182 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1183 vpmuludq $T3,$H2,$H2 # h3*s2
1184 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1185
1186 vpmuludq $T1,$H3,$H0 # h1*r3
1187 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1188 vpmuludq $T0,$H3,$H3 # h0*r3
1189 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1190 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1191 vpmuludq $T4,$H4,$H1 # h4*s3
1192 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1193 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1194 vpmuludq $T3,$H4,$H0 # h3*s3
1195 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1196 vpmuludq $T2,$H4,$H4 # h2*s3
1197 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1198
1199 vpmuludq $T0,$H2,$H2 # h0*r4
1200 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1201 vpmuludq $T4,$H3,$H1 # h4*s4
1202 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1203 vpmuludq $T3,$H3,$H0 # h3*s4
1204 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1205 vpmuludq $T2,$H3,$H1 # h2*s4
1206 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1207 vpmuludq $T1,$H3,$H3 # h1*s4
1208 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1209
1210 jz .Lshort_tail_avx
1211
1212 vmovdqu 16*0($inp),$H0 # load input
1213 vmovdqu 16*1($inp),$H1
1214
1215 vpsrldq \$6,$H0,$H2 # splat input
1216 vpsrldq \$6,$H1,$H3
1217 vpunpckhqdq $H1,$H0,$H4 # 4
1218 vpunpcklqdq $H1,$H0,$H0 # 0:1
1219 vpunpcklqdq $H3,$H2,$H3 # 2:3
1220
1221 vpsrlq \$40,$H4,$H4 # 4
1222 vpsrlq \$26,$H0,$H1
1223 vpand $MASK,$H0,$H0 # 0
1224 vpsrlq \$4,$H3,$H2
1225 vpand $MASK,$H1,$H1 # 1
1226 vpsrlq \$30,$H3,$H3
1227 vpand $MASK,$H2,$H2 # 2
1228 vpand $MASK,$H3,$H3 # 3
1229 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1230
1231 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1232 vpaddq 0x00(%r11),$H0,$H0
1233 vpaddq 0x10(%r11),$H1,$H1
1234 vpaddq 0x20(%r11),$H2,$H2
1235 vpaddq 0x30(%r11),$H3,$H3
1236 vpaddq 0x40(%r11),$H4,$H4
1237
1238 ################################################################
1239 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1240
1241 vpmuludq $H0,$T4,$T0 # h0*r0
1242 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1243 vpmuludq $H1,$T4,$T1 # h1*r0
1244 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1245 vpmuludq $H2,$T4,$T0 # h2*r0
1246 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1247 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1248 vpmuludq $H3,$T4,$T1 # h3*r0
1249 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1250 vpmuludq $H4,$T4,$T4 # h4*r0
1251 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1252
1253 vpmuludq $H3,$T2,$T0 # h3*r1
1254 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1255 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1256 vpmuludq $H2,$T2,$T1 # h2*r1
1257 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1258 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1259 vpmuludq $H1,$T2,$T0 # h1*r1
1260 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1261 vpmuludq $H0,$T2,$T2 # h0*r1
1262 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1263 vpmuludq $H4,$T3,$T3 # h4*s1
1264 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1265
1266 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1267 vpmuludq $H2,$T4,$T1 # h2*r2
1268 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1269 vpmuludq $H1,$T4,$T0 # h1*r2
1270 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1271 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1272 vpmuludq $H0,$T4,$T4 # h0*r2
1273 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1274 vpmuludq $H4,$T2,$T1 # h4*s2
1275 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1276 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1277 vpmuludq $H3,$T2,$T2 # h3*s2
1278 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1279
1280 vpmuludq $H1,$T3,$T0 # h1*r3
1281 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1282 vpmuludq $H0,$T3,$T3 # h0*r3
1283 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1284 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1285 vpmuludq $H4,$T4,$T1 # h4*s3
1286 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1287 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1288 vpmuludq $H3,$T4,$T0 # h3*s3
1289 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1290 vpmuludq $H2,$T4,$T4 # h2*s3
1291 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1292
1293 vpmuludq $H0,$T2,$T2 # h0*r4
1294 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1295 vpmuludq $H4,$T3,$T1 # h4*s4
1296 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1297 vpmuludq $H3,$T3,$T0 # h3*s4
1298 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1299 vpmuludq $H2,$T3,$T1 # h2*s4
1300 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1301 vpmuludq $H1,$T3,$T3 # h1*s4
1302 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1303
1304.Lshort_tail_avx:
1305 ################################################################
1306 # horizontal addition
1307
1308 vpsrldq \$8,$D4,$T4
1309 vpsrldq \$8,$D3,$T3
1310 vpsrldq \$8,$D1,$T1
1311 vpsrldq \$8,$D0,$T0
1312 vpsrldq \$8,$D2,$T2
1313 vpaddq $T3,$D3,$D3
1314 vpaddq $T4,$D4,$D4
1315 vpaddq $T0,$D0,$D0
1316 vpaddq $T1,$D1,$D1
1317 vpaddq $T2,$D2,$D2
1318
1319 ################################################################
1320 # lazy reduction
1321
1322 vpsrlq \$26,$D3,$H3
1323 vpand $MASK,$D3,$D3
1324 vpaddq $H3,$D4,$D4 # h3 -> h4
1325
1326 vpsrlq \$26,$D0,$H0
1327 vpand $MASK,$D0,$D0
1328 vpaddq $H0,$D1,$D1 # h0 -> h1
1329
1330 vpsrlq \$26,$D4,$H4
1331 vpand $MASK,$D4,$D4
1332
1333 vpsrlq \$26,$D1,$H1
1334 vpand $MASK,$D1,$D1
1335 vpaddq $H1,$D2,$D2 # h1 -> h2
1336
1337 vpaddq $H4,$D0,$D0
1338 vpsllq \$2,$H4,$H4
1339 vpaddq $H4,$D0,$D0 # h4 -> h0
1340
1341 vpsrlq \$26,$D2,$H2
1342 vpand $MASK,$D2,$D2
1343 vpaddq $H2,$D3,$D3 # h2 -> h3
1344
1345 vpsrlq \$26,$D0,$H0
1346 vpand $MASK,$D0,$D0
1347 vpaddq $H0,$D1,$D1 # h0 -> h1
1348
1349 vpsrlq \$26,$D3,$H3
1350 vpand $MASK,$D3,$D3
1351 vpaddq $H3,$D4,$D4 # h3 -> h4
1352
1353 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1354 vmovd $D1,`4*1-48-64`($ctx)
1355 vmovd $D2,`4*2-48-64`($ctx)
1356 vmovd $D3,`4*3-48-64`($ctx)
1357 vmovd $D4,`4*4-48-64`($ctx)
1358___
1359$code.=<<___ if ($win64);
1360 vmovdqa 0x50(%r11),%xmm6
1361 vmovdqa 0x60(%r11),%xmm7
1362 vmovdqa 0x70(%r11),%xmm8
1363 vmovdqa 0x80(%r11),%xmm9
1364 vmovdqa 0x90(%r11),%xmm10
1365 vmovdqa 0xa0(%r11),%xmm11
1366 vmovdqa 0xb0(%r11),%xmm12
1367 vmovdqa 0xc0(%r11),%xmm13
1368 vmovdqa 0xd0(%r11),%xmm14
1369 vmovdqa 0xe0(%r11),%xmm15
1370 lea 0xf8(%r11),%rsp
1371.Ldo_avx_epilogue:
1372___
1373$code.=<<___ if (!$win64);
1374 lea 0x58(%r11),%rsp
1c47e883 1375.cfi_def_cfa %rsp,8
1376___
1377$code.=<<___;
1378 vzeroupper
1379 ret
1c47e883 1380.cfi_endproc
1381.size poly1305_blocks_avx,.-poly1305_blocks_avx
1382
1383.type poly1305_emit_avx,\@function,3
1384.align 32
1385poly1305_emit_avx:
048fa13e 1386.cfi_startproc
a98c648e 1387 cmpl \$0,20($ctx) # is_base2_26?
a85dbf11 1388 je .Lemit
1389
1390 mov 0($ctx),%eax # load hash value base 2^26
1391 mov 4($ctx),%ecx
1392 mov 8($ctx),%r8d
1393 mov 12($ctx),%r11d
1394 mov 16($ctx),%r10d
1395
1396 shl \$26,%rcx # base 2^26 -> base 2^64
1397 mov %r8,%r9
1398 shl \$52,%r8
1399 add %rcx,%rax
1400 shr \$12,%r9
1401 add %rax,%r8 # h0
1402 adc \$0,%r9
1403
1404 shl \$14,%r11
1405 mov %r10,%rax
1406 shr \$24,%r10
1407 add %r11,%r9
1408 shl \$40,%rax
1409 add %rax,%r9 # h1
1410 adc \$0,%r10 # h2
1411
1412 mov %r10,%rax # could be partially reduced, so reduce
1413 mov %r10,%rcx
1414 and \$3,%r10
1415 shr \$2,%rax
1416 and \$-4,%rcx
1417 add %rcx,%rax
1418 add %rax,%r8
1419 adc \$0,%r9
4b8736a2 1420 adc \$0,%r10
1421
1422 mov %r8,%rax
1423 add \$5,%r8 # compare to modulus
1424 mov %r9,%rcx
1425 adc \$0,%r9
1426 adc \$0,%r10
46f4e1be 1427 shr \$2,%r10 # did 130-bit value overflow?
1428 cmovnz %r8,%rax
1429 cmovnz %r9,%rcx
1430
1431 add 0($nonce),%rax # accumulate nonce
1432 adc 8($nonce),%rcx
1433 mov %rax,0($mac) # write result
1434 mov %rcx,8($mac)
1435
1436 ret
048fa13e 1437.cfi_endproc
1438.size poly1305_emit_avx,.-poly1305_emit_avx
1439___
1440
1441if ($avx>1) {
1442my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1443 map("%ymm$_",(0..15));
1444my $S4=$MASK;
1445
1446$code.=<<___;
1447.type poly1305_blocks_avx2,\@function,4
1448.align 32
1449poly1305_blocks_avx2:
1c47e883 1450.cfi_startproc
1451 mov 20($ctx),%r8d # is_base2_26
1452 cmp \$128,$len
1453 jae .Lblocks_avx2
1454 test %r8d,%r8d
a85dbf11 1455 jz .Lblocks
1456
1457.Lblocks_avx2:
1458 and \$-16,$len
1459 jz .Lno_data_avx2
1460
1461 vzeroupper
1462
1463 test %r8d,%r8d
1464 jz .Lbase2_64_avx2
1465
1466 test \$63,$len
1467 jz .Leven_avx2
1468
1469 push %rbx
1c47e883 1470.cfi_push %rbx
a98c648e 1471 push %rbp
1c47e883 1472.cfi_push %rbp
a98c648e 1473 push %r12
1c47e883 1474.cfi_push %r12
a98c648e 1475 push %r13
1c47e883 1476.cfi_push %r13
a98c648e 1477 push %r14
1c47e883 1478.cfi_push %r14
a98c648e 1479 push %r15
1c47e883 1480.cfi_push %r15
1481.Lblocks_avx2_body:
1482
1483 mov $len,%r15 # reassign $len
1484
1485 mov 0($ctx),$d1 # load hash value
1486 mov 8($ctx),$d2
1487 mov 16($ctx),$h2#d
1488
1489 mov 24($ctx),$r0 # load r
1490 mov 32($ctx),$s1
1491
1492 ################################# base 2^26 -> base 2^64
1493 mov $d1#d,$h0#d
28411657 1494 and \$`-1*(1<<31)`,$d1
1495 mov $d2,$r1 # borrow $r1
1496 mov $d2#d,$h1#d
28411657 1497 and \$`-1*(1<<31)`,$d2
1498
1499 shr \$6,$d1
1500 shl \$52,$r1
1501 add $d1,$h0
1502 shr \$12,$h1
1503 shr \$18,$d2
1504 add $r1,$h0
1505 adc $d2,$h1
1506
1507 mov $h2,$d1
1508 shl \$40,$d1
1509 shr \$24,$h2
1510 add $d1,$h1
1511 adc \$0,$h2 # can be partially reduced...
1512
1513 mov \$-4,$d2 # ... so reduce
1514 mov $h2,$d1
1515 and $h2,$d2
1516 shr \$2,$d1
1517 and \$3,$h2
1518 add $d2,$d1 # =*5
1519 add $d1,$h0
1520 adc \$0,$h1
4b8736a2 1521 adc \$0,$h2
1522
1523 mov $s1,$r1
1524 mov $s1,%rax
1525 shr \$2,$s1
1526 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1527
1528.Lbase2_26_pre_avx2:
1529 add 0($inp),$h0 # accumulate input
1530 adc 8($inp),$h1
1531 lea 16($inp),$inp
1532 adc $padbit,$h2
1533 sub \$16,%r15
1534
1535 call __poly1305_block
1536 mov $r1,%rax
1537
1538 test \$63,%r15
1539 jnz .Lbase2_26_pre_avx2
1540
1541 test $padbit,$padbit # if $padbit is zero,
1542 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1543
1544 ################################# base 2^64 -> base 2^26
1545 mov $h0,%rax
1546 mov $h0,%rdx
1547 shr \$52,$h0
1548 mov $h1,$r0
1549 mov $h1,$r1
1550 shr \$26,%rdx
1551 and \$0x3ffffff,%rax # h[0]
1552 shl \$12,$r0
1553 and \$0x3ffffff,%rdx # h[1]
1554 shr \$14,$h1
1555 or $r0,$h0
1556 shl \$24,$h2
1557 and \$0x3ffffff,$h0 # h[2]
1558 shr \$40,$r1
1559 and \$0x3ffffff,$h1 # h[3]
1560 or $r1,$h2 # h[4]
1561
1562 test %r15,%r15
1563 jz .Lstore_base2_26_avx2
1564
1565 vmovd %rax#d,%x#$H0
1566 vmovd %rdx#d,%x#$H1
1567 vmovd $h0#d,%x#$H2
1568 vmovd $h1#d,%x#$H3
1569 vmovd $h2#d,%x#$H4
1570 jmp .Lproceed_avx2
1571
1572.align 32
1573.Lstore_base2_64_avx2:
1574 mov $h0,0($ctx)
1575 mov $h1,8($ctx)
1576 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1577 jmp .Ldone_avx2
1578
1579.align 16
1580.Lstore_base2_26_avx2:
1581 mov %rax#d,0($ctx) # store hash value base 2^26
1582 mov %rdx#d,4($ctx)
1583 mov $h0#d,8($ctx)
1584 mov $h1#d,12($ctx)
1585 mov $h2#d,16($ctx)
1586.align 16
1587.Ldone_avx2:
1588 mov 0(%rsp),%r15
1c47e883 1589.cfi_restore %r15
a98c648e 1590 mov 8(%rsp),%r14
1c47e883 1591.cfi_restore %r14
a98c648e 1592 mov 16(%rsp),%r13
1c47e883 1593.cfi_restore %r13
a98c648e 1594 mov 24(%rsp),%r12
1c47e883 1595.cfi_restore %r12
a98c648e 1596 mov 32(%rsp),%rbp
1c47e883 1597.cfi_restore %rbp
a98c648e 1598 mov 40(%rsp),%rbx
1c47e883 1599.cfi_restore %rbx
a98c648e 1600 lea 48(%rsp),%rsp
1c47e883 1601.cfi_adjust_cfa_offset -48
1602.Lno_data_avx2:
1603.Lblocks_avx2_epilogue:
1604 ret
1c47e883 1605.cfi_endproc
1606
1607.align 32
1608.Lbase2_64_avx2:
1c47e883 1609.cfi_startproc
a98c648e 1610 push %rbx
1c47e883 1611.cfi_push %rbx
a98c648e 1612 push %rbp
1c47e883 1613.cfi_push %rbp
a98c648e 1614 push %r12
1c47e883 1615.cfi_push %r12
a98c648e 1616 push %r13
1c47e883 1617.cfi_push %r13
a98c648e 1618 push %r14
1c47e883 1619.cfi_push %r14
a98c648e 1620 push %r15
1c47e883 1621.cfi_push %r15
1622.Lbase2_64_avx2_body:
1623
1624 mov $len,%r15 # reassign $len
1625
1626 mov 24($ctx),$r0 # load r
1627 mov 32($ctx),$s1
1628
1629 mov 0($ctx),$h0 # load hash value
1630 mov 8($ctx),$h1
1631 mov 16($ctx),$h2#d
1632
1633 mov $s1,$r1
1634 mov $s1,%rax
1635 shr \$2,$s1
1636 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1637
1638 test \$63,$len
1639 jz .Linit_avx2
1640
1641.Lbase2_64_pre_avx2:
1642 add 0($inp),$h0 # accumulate input
1643 adc 8($inp),$h1
1644 lea 16($inp),$inp
1645 adc $padbit,$h2
1646 sub \$16,%r15
1647
1648 call __poly1305_block
1649 mov $r1,%rax
1650
1651 test \$63,%r15
1652 jnz .Lbase2_64_pre_avx2
1653
1654.Linit_avx2:
1655 ################################# base 2^64 -> base 2^26
1656 mov $h0,%rax
1657 mov $h0,%rdx
1658 shr \$52,$h0
1659 mov $h1,$d1
1660 mov $h1,$d2
1661 shr \$26,%rdx
1662 and \$0x3ffffff,%rax # h[0]
1663 shl \$12,$d1
1664 and \$0x3ffffff,%rdx # h[1]
1665 shr \$14,$h1
1666 or $d1,$h0
1667 shl \$24,$h2
1668 and \$0x3ffffff,$h0 # h[2]
1669 shr \$40,$d2
1670 and \$0x3ffffff,$h1 # h[3]
1671 or $d2,$h2 # h[4]
1672
1673 vmovd %rax#d,%x#$H0
1674 vmovd %rdx#d,%x#$H1
1675 vmovd $h0#d,%x#$H2
1676 vmovd $h1#d,%x#$H3
1677 vmovd $h2#d,%x#$H4
1678 movl \$1,20($ctx) # set is_base2_26
1679
1680 call __poly1305_init_avx
1681
1682.Lproceed_avx2:
1683 mov %r15,$len # restore $len
1684 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1685 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1686
1687 mov 0(%rsp),%r15
1c47e883 1688.cfi_restore %r15
a98c648e 1689 mov 8(%rsp),%r14
1c47e883 1690.cfi_restore %r14
a98c648e 1691 mov 16(%rsp),%r13
1c47e883 1692.cfi_restore %r13
a98c648e 1693 mov 24(%rsp),%r12
1c47e883 1694.cfi_restore %r12
a98c648e 1695 mov 32(%rsp),%rbp
1c47e883 1696.cfi_restore %rbp
a98c648e 1697 mov 40(%rsp),%rbx
1c47e883 1698.cfi_restore %rbx
1699 lea 48(%rsp),%rax
1700 lea 48(%rsp),%rsp
1c47e883 1701.cfi_adjust_cfa_offset -48
1702.Lbase2_64_avx2_epilogue:
1703 jmp .Ldo_avx2
1c47e883 1704.cfi_endproc
1705
1706.align 32
1707.Leven_avx2:
1c47e883 1708.cfi_startproc
abb8c44f 1709 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1710 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1711 vmovd 4*1($ctx),%x#$H1
1712 vmovd 4*2($ctx),%x#$H2
1713 vmovd 4*3($ctx),%x#$H3
1714 vmovd 4*4($ctx),%x#$H4
1715
1716.Ldo_avx2:
1717___
1718$code.=<<___ if ($avx>2);
1719 cmp \$512,$len
1720 jb .Lskip_avx512
1721 and %r11d,%r10d
1722 test \$`1<<16`,%r10d # check for AVX512F
1723 jnz .Lblocks_avx512
1724.Lskip_avx512:
1725___
1726$code.=<<___ if (!$win64);
1727 lea -8(%rsp),%r11
1c47e883 1728.cfi_def_cfa %r11,16
1729 sub \$0x128,%rsp
1730___
1731$code.=<<___ if ($win64);
1732 lea -0xf8(%rsp),%r11
1733 sub \$0x1c8,%rsp
1734 vmovdqa %xmm6,0x50(%r11)
1735 vmovdqa %xmm7,0x60(%r11)
1736 vmovdqa %xmm8,0x70(%r11)
1737 vmovdqa %xmm9,0x80(%r11)
1738 vmovdqa %xmm10,0x90(%r11)
1739 vmovdqa %xmm11,0xa0(%r11)
1740 vmovdqa %xmm12,0xb0(%r11)
1741 vmovdqa %xmm13,0xc0(%r11)
1742 vmovdqa %xmm14,0xd0(%r11)
1743 vmovdqa %xmm15,0xe0(%r11)
1744.Ldo_avx2_body:
1745___
1746$code.=<<___;
a98c648e 1747 lea .Lconst(%rip),%rcx
1748 lea 48+64($ctx),$ctx # size optimization
1749 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1750
1751 # expand and copy pre-calculated table to stack
1752 vmovdqu `16*0-64`($ctx),%x#$T2
1753 and \$-512,%rsp
1754 vmovdqu `16*1-64`($ctx),%x#$T3
1755 vmovdqu `16*2-64`($ctx),%x#$T4
1756 vmovdqu `16*3-64`($ctx),%x#$D0
1757 vmovdqu `16*4-64`($ctx),%x#$D1
1758 vmovdqu `16*5-64`($ctx),%x#$D2
73e8a5c8 1759 lea 0x90(%rsp),%rax # size optimization
a98c648e 1760 vmovdqu `16*6-64`($ctx),%x#$D3
73e8a5c8 1761 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
a98c648e 1762 vmovdqu `16*7-64`($ctx),%x#$D4
73e8a5c8 1763 vpermd $T3,$T0,$T3
a98c648e 1764 vmovdqu `16*8-64`($ctx),%x#$MASK
73e8a5c8 1765 vpermd $T4,$T0,$T4
a98c648e 1766 vmovdqa $T2,0x00(%rsp)
1767 vpermd $D0,$T0,$D0
1768 vmovdqa $T3,0x20-0x90(%rax)
1769 vpermd $D1,$T0,$D1
1770 vmovdqa $T4,0x40-0x90(%rax)
1771 vpermd $D2,$T0,$D2
1772 vmovdqa $D0,0x60-0x90(%rax)
1773 vpermd $D3,$T0,$D3
1774 vmovdqa $D1,0x80-0x90(%rax)
1775 vpermd $D4,$T0,$D4
1776 vmovdqa $D2,0xa0-0x90(%rax)
1777 vpermd $MASK,$T0,$MASK
1778 vmovdqa $D3,0xc0-0x90(%rax)
1779 vmovdqa $D4,0xe0-0x90(%rax)
1780 vmovdqa $MASK,0x100-0x90(%rax)
1781 vmovdqa 64(%rcx),$MASK # .Lmask26
1782
1783 ################################################################
1784 # load input
1785 vmovdqu 16*0($inp),%x#$T0
1786 vmovdqu 16*1($inp),%x#$T1
1787 vinserti128 \$1,16*2($inp),$T0,$T0
1788 vinserti128 \$1,16*3($inp),$T1,$T1
1789 lea 16*4($inp),$inp
1790
1791 vpsrldq \$6,$T0,$T2 # splat input
1792 vpsrldq \$6,$T1,$T3
1793 vpunpckhqdq $T1,$T0,$T4 # 4
1794 vpunpcklqdq $T3,$T2,$T2 # 2:3
1795 vpunpcklqdq $T1,$T0,$T0 # 0:1
1796
1797 vpsrlq \$30,$T2,$T3
1798 vpsrlq \$4,$T2,$T2
1799 vpsrlq \$26,$T0,$T1
1800 vpsrlq \$40,$T4,$T4 # 4
1801 vpand $MASK,$T2,$T2 # 2
1802 vpand $MASK,$T0,$T0 # 0
1803 vpand $MASK,$T1,$T1 # 1
1804 vpand $MASK,$T3,$T3 # 3
1805 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1806
1807 vpaddq $H2,$T2,$H2 # accumulate input
1808 sub \$64,$len
1809 jz .Ltail_avx2
1810 jmp .Loop_avx2
1811
1812.align 32
1813.Loop_avx2:
1814 ################################################################
1815 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1816 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1817 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1818 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1819 # \________/\__________/
1820 ################################################################
1821 #vpaddq $H2,$T2,$H2 # accumulate input
1822 vpaddq $H0,$T0,$H0
1823 vmovdqa `32*0`(%rsp),$T0 # r0^4
1824 vpaddq $H1,$T1,$H1
1825 vmovdqa `32*1`(%rsp),$T1 # r1^4
1826 vpaddq $H3,$T3,$H3
1827 vmovdqa `32*3`(%rsp),$T2 # r2^4
1828 vpaddq $H4,$T4,$H4
1829 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1830 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1831
1832 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1833 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1834 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1835 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1836 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1837 #
	# however, as h2 is "chronologically" the first one available, the
	# corresponding operations are pulled up, so it's
1840 #
1841 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1842 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1843 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1844 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1845 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1846
1847 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1848 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1849 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1850 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1851 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1852
1853 vpmuludq $H0,$T1,$T4 # h0*r1
1854 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1855 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1856 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1857 vpmuludq $H3,$T1,$T4 # h3*r1
1858 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1859 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1860 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1861 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1862
1863 vpmuludq $H0,$T0,$T4 # h0*r0
1864 vpmuludq $H1,$T0,$H2 # h1*r0
1865 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1866 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1867 vpmuludq $H3,$T0,$T4 # h3*r0
1868 vpmuludq $H4,$T0,$H2 # h4*r0
1869 vmovdqu 16*0($inp),%x#$T0 # load input
1870 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1871 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1872 vinserti128 \$1,16*2($inp),$T0,$T0
1873
1874 vpmuludq $H3,$T1,$T4 # h3*s2
1875 vpmuludq $H4,$T1,$H2 # h4*s2
1876 vmovdqu 16*1($inp),%x#$T1
1877 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1878 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1879 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1880 vpmuludq $H1,$T2,$T4 # h1*r2
1881 vpmuludq $H0,$T2,$T2 # h0*r2
1882 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1883 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1884 vinserti128 \$1,16*3($inp),$T1,$T1
1885 lea 16*4($inp),$inp
1886
1887 vpmuludq $H1,$H2,$T4 # h1*r3
1888 vpmuludq $H0,$H2,$H2 # h0*r3
1889 vpsrldq \$6,$T0,$T2 # splat input
1890 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1891 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1892 vpmuludq $H3,$T3,$T4 # h3*s3
1893 vpmuludq $H4,$T3,$H2 # h4*s3
1894 vpsrldq \$6,$T1,$T3
1895 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1896 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1897 vpunpckhqdq $T1,$T0,$T4 # 4
1898
1899 vpmuludq $H3,$S4,$H3 # h3*s4
1900 vpmuludq $H4,$S4,$H4 # h4*s4
1901 vpunpcklqdq $T1,$T0,$T0 # 0:1
1902 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
1903 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
1904 vpunpcklqdq $T3,$T2,$T3 # 2:3
1905 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1906 vpmuludq $H1,$S4,$H0 # h1*s4
1907 vmovdqa 64(%rcx),$MASK # .Lmask26
1908 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1909 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1910
1911 ################################################################
1912 # lazy reduction (interleaved with tail of input splat)
1913
1914 vpsrlq \$26,$H3,$D3
1915 vpand $MASK,$H3,$H3
1916 vpaddq $D3,$H4,$H4 # h3 -> h4
1917
1918 vpsrlq \$26,$H0,$D0
1919 vpand $MASK,$H0,$H0
1920 vpaddq $D0,$D1,$H1 # h0 -> h1
1921
1922 vpsrlq \$26,$H4,$D4
1923 vpand $MASK,$H4,$H4
1924
1925 vpsrlq \$4,$T3,$T2
1926
1927 vpsrlq \$26,$H1,$D1
1928 vpand $MASK,$H1,$H1
1929 vpaddq $D1,$H2,$H2 # h1 -> h2
1930
1931 vpaddq $D4,$H0,$H0
1932 vpsllq \$2,$D4,$D4
1933 vpaddq $D4,$H0,$H0 # h4 -> h0
1934
1935 vpand $MASK,$T2,$T2 # 2
1936 vpsrlq \$26,$T0,$T1
1937
1938 vpsrlq \$26,$H2,$D2
1939 vpand $MASK,$H2,$H2
1940 vpaddq $D2,$H3,$H3 # h2 -> h3
1941
1942 vpaddq $T2,$H2,$H2 # modulo-scheduled
1943 vpsrlq \$30,$T3,$T3
1944
1945 vpsrlq \$26,$H0,$D0
1946 vpand $MASK,$H0,$H0
1947 vpaddq $D0,$H1,$H1 # h0 -> h1
1948
1949 vpsrlq \$40,$T4,$T4 # 4
1950
1951 vpsrlq \$26,$H3,$D3
1952 vpand $MASK,$H3,$H3
1953 vpaddq $D3,$H4,$H4 # h3 -> h4
1954
1955 vpand $MASK,$T0,$T0 # 0
1956 vpand $MASK,$T1,$T1 # 1
1957 vpand $MASK,$T3,$T3 # 3
1958 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1959
1960 sub \$64,$len
1961 jnz .Loop_avx2
1962
1963 .byte 0x66,0x90
1964.Ltail_avx2:
1965 ################################################################
	# while the above multiplications were by r^4 in all lanes, in the
	# last iteration we multiply the least significant lane by r^4 and
	# the most significant one by r, so this is a copy of the above,
	# except that references to the precomputed table are displaced by 4...
1970
1971 #vpaddq $H2,$T2,$H2 # accumulate input
1972 vpaddq $H0,$T0,$H0
1973 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1974 vpaddq $H1,$T1,$H1
1975 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1976 vpaddq $H3,$T3,$H3
1977 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1978 vpaddq $H4,$T4,$H4
1979 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1980 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1981
1982 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1983 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1984 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1985 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1986 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1987
1988 vpmuludq $H0,$T1,$T4 # h0*r1
1989 vpmuludq $H1,$T1,$H2 # h1*r1
1990 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1991 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1992 vpmuludq $H3,$T1,$T4 # h3*r1
1993 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
1994 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1995 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1996
1997 vpmuludq $H0,$T0,$T4 # h0*r0
1998 vpmuludq $H1,$T0,$H2 # h1*r0
1999 vpaddq $T4,$D0,$D0 # d0 += h0*r0
2000 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2001 vpaddq $H2,$D1,$D1 # d1 += h1*r0
2002 vpmuludq $H3,$T0,$T4 # h3*r0
2003 vpmuludq $H4,$T0,$H2 # h4*r0
2004 vpaddq $T4,$D3,$D3 # d3 += h3*r0
2005 vpaddq $H2,$D4,$D4 # d4 += h4*r0
2006
2007 vpmuludq $H3,$T1,$T4 # h3*s2
2008 vpmuludq $H4,$T1,$H2 # h4*s2
2009 vpaddq $T4,$D0,$D0 # d0 += h3*s2
2010 vpaddq $H2,$D1,$D1 # d1 += h4*s2
2011 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2012 vpmuludq $H1,$T2,$T4 # h1*r2
2013 vpmuludq $H0,$T2,$T2 # h0*r2
2014 vpaddq $T4,$D3,$D3 # d3 += h1*r2
2015 vpaddq $T2,$D2,$D2 # d2 += h0*r2
2016
2017 vpmuludq $H1,$H2,$T4 # h1*r3
2018 vpmuludq $H0,$H2,$H2 # h0*r3
2019 vpaddq $T4,$D4,$D4 # d4 += h1*r3
2020 vpaddq $H2,$D3,$D3 # d3 += h0*r3
2021 vpmuludq $H3,$T3,$T4 # h3*s3
2022 vpmuludq $H4,$T3,$H2 # h4*s3
2023 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2024 vpaddq $H2,$D2,$D2 # d2 += h4*s3
2025
2026 vpmuludq $H3,$S4,$H3 # h3*s4
2027 vpmuludq $H4,$S4,$H4 # h4*s4
2028 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
2029 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
2030 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2031 vpmuludq $H1,$S4,$H0 # h1*s4
2032 vmovdqa 64(%rcx),$MASK # .Lmask26
2033 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2034 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2035
2036 ################################################################
2037 # horizontal addition
2038
2039 vpsrldq \$8,$D1,$T1
2040 vpsrldq \$8,$H2,$T2
2041 vpsrldq \$8,$H3,$T3
2042 vpsrldq \$8,$H4,$T4
2043 vpsrldq \$8,$H0,$T0
2044 vpaddq $T1,$D1,$D1
2045 vpaddq $T2,$H2,$H2
2046 vpaddq $T3,$H3,$H3
2047 vpaddq $T4,$H4,$H4
2048 vpaddq $T0,$H0,$H0
2049
2050 vpermq \$0x2,$H3,$T3
2051 vpermq \$0x2,$H4,$T4
2052 vpermq \$0x2,$H0,$T0
2053 vpermq \$0x2,$D1,$T1
2054 vpermq \$0x2,$H2,$T2
2055 vpaddq $T3,$H3,$H3
2056 vpaddq $T4,$H4,$H4
2057 vpaddq $T0,$H0,$H0
2058 vpaddq $T1,$D1,$D1
2059 vpaddq $T2,$H2,$H2
2060
2061 ################################################################
2062 # lazy reduction
2063
2064 vpsrlq \$26,$H3,$D3
2065 vpand $MASK,$H3,$H3
2066 vpaddq $D3,$H4,$H4 # h3 -> h4
2067
2068 vpsrlq \$26,$H0,$D0
2069 vpand $MASK,$H0,$H0
2070 vpaddq $D0,$D1,$H1 # h0 -> h1
2071
2072 vpsrlq \$26,$H4,$D4
2073 vpand $MASK,$H4,$H4
2074
2075 vpsrlq \$26,$H1,$D1
2076 vpand $MASK,$H1,$H1
2077 vpaddq $D1,$H2,$H2 # h1 -> h2
2078
2079 vpaddq $D4,$H0,$H0
2080 vpsllq \$2,$D4,$D4
2081 vpaddq $D4,$H0,$H0 # h4 -> h0
2082
2083 vpsrlq \$26,$H2,$D2
2084 vpand $MASK,$H2,$H2
2085 vpaddq $D2,$H3,$H3 # h2 -> h3
2086
2087 vpsrlq \$26,$H0,$D0
2088 vpand $MASK,$H0,$H0
2089 vpaddq $D0,$H1,$H1 # h0 -> h1
2090
2091 vpsrlq \$26,$H3,$D3
2092 vpand $MASK,$H3,$H3
2093 vpaddq $D3,$H4,$H4 # h3 -> h4
2094
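	# (added note: the carry out of the top 26-bit limb wraps around to the
	# bottom limb multiplied by 5, because 2^130 = 5 (mod 2^130-5); the
	# add, shift-left-by-2, add-again pattern above computes carry*5 as
	# carry + 4*carry without a multiplication.)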
2095 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2096 vmovd %x#$H1,`4*1-48-64`($ctx)
2097 vmovd %x#$H2,`4*2-48-64`($ctx)
2098 vmovd %x#$H3,`4*3-48-64`($ctx)
2099 vmovd %x#$H4,`4*4-48-64`($ctx)
2100___
2101$code.=<<___ if ($win64);
2102 vmovdqa 0x50(%r11),%xmm6
2103 vmovdqa 0x60(%r11),%xmm7
2104 vmovdqa 0x70(%r11),%xmm8
2105 vmovdqa 0x80(%r11),%xmm9
2106 vmovdqa 0x90(%r11),%xmm10
2107 vmovdqa 0xa0(%r11),%xmm11
2108 vmovdqa 0xb0(%r11),%xmm12
2109 vmovdqa 0xc0(%r11),%xmm13
2110 vmovdqa 0xd0(%r11),%xmm14
2111 vmovdqa 0xe0(%r11),%xmm15
2112 lea 0xf8(%r11),%rsp
2113.Ldo_avx2_epilogue:
2114___
2115$code.=<<___ if (!$win64);
2116 lea 8(%r11),%rsp
2117.cfi_def_cfa %rsp,8
2118___
2119$code.=<<___;
2120 vzeroupper
2121 ret
2122.cfi_endproc
2123.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2124___
2125#######################################################################
2126if ($avx>2) {
2127# On entry we have the input length divisible by 64. But since the inner
2128# loop processes 128 bytes per iteration, cases when the length is not
2129# divisible by 128 are handled by passing the tail 64 bytes to .Ltail_avx2.
2130# For this reason the stack layout is kept identical to poly1305_blocks_avx2.
2131# If not for this tail, we wouldn't even have to allocate a stack frame...
2132
2133my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2134my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2135my $PADBIT="%zmm30";
2136
2137map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2138map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2139map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2140map(s/%y/%z/,($MASK));
2141
2142$code.=<<___;
2143.type poly1305_blocks_avx512,\@function,4
2144.align 32
2145poly1305_blocks_avx512:
2146.cfi_startproc
2147.Lblocks_avx512:
2148 mov \$15,%eax
2149 kmovw %eax,%k2
2150___
2151$code.=<<___ if (!$win64);
2152 lea -8(%rsp),%r11
2153.cfi_def_cfa %r11,16
2154 sub \$0x128,%rsp
2155___
2156$code.=<<___ if ($win64);
2157 lea -0xf8(%rsp),%r11
2158 sub \$0x1c8,%rsp
2159 vmovdqa %xmm6,0x50(%r11)
2160 vmovdqa %xmm7,0x60(%r11)
2161 vmovdqa %xmm8,0x70(%r11)
2162 vmovdqa %xmm9,0x80(%r11)
2163 vmovdqa %xmm10,0x90(%r11)
2164 vmovdqa %xmm11,0xa0(%r11)
2165 vmovdqa %xmm12,0xb0(%r11)
2166 vmovdqa %xmm13,0xc0(%r11)
2167 vmovdqa %xmm14,0xd0(%r11)
2168 vmovdqa %xmm15,0xe0(%r11)
2169.Ldo_avx512_body:
2170___
2171$code.=<<___;
2172	lea .Lconst(%rip),%rcx
2173	lea 48+64($ctx),$ctx # size optimization
2174	vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2175
2176 # expand pre-calculated table
2177	vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2178	and \$-512,%rsp
2179	vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2180	mov \$0x20,%rax
2181 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2182 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2183 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2184 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2185 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2186 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2187 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2188 vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2189	vpbroadcastq 64(%rcx),$MASK # .Lmask26
2190 vpermd $D1,$T2,$R1
2191 vpermd $T0,$T2,$S1
2192 vpermd $D2,$T2,$R2
2193	vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2194	vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2195	vpermd $T1,$T2,$S2
2196	vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2197	vpsrlq \$32,$R1,$T1
2198	vpermd $D3,$T2,$R3
2199	vmovdqa64 $S1,0x40(%rsp){%k2}
2200 vpermd $T3,$T2,$S3
2201 vpermd $D4,$T2,$R4
2202	vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2203	vpermd $T4,$T2,$S4
2204 vmovdqa64 $S2,0x80(%rsp){%k2}
2205 vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2206 vmovdqa64 $S3,0xc0(%rsp){%k2}
2207 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2208 vmovdqa64 $S4,0x100(%rsp){%k2}
2209
2210 ################################################################
2211 # calculate 5th through 8th powers of the key
2212 #
2213 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2214 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2215 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2216 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2217 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2218
2219 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2220 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2221 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2222 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2223 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2224 vpsrlq \$32,$R2,$T2
2225
2226 vpmuludq $T1,$S4,$M0
2227 vpmuludq $T1,$R0,$M1
2228 vpmuludq $T1,$R1,$M2
2229 vpmuludq $T1,$R2,$M3
2230 vpmuludq $T1,$R3,$M4
2231 vpsrlq \$32,$R3,$T3
2232 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2233 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2234 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2235 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2236 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2237
2238 vpmuludq $T2,$S3,$M0
2239 vpmuludq $T2,$S4,$M1
2240 vpmuludq $T2,$R1,$M3
2241 vpmuludq $T2,$R2,$M4
2242 vpmuludq $T2,$R0,$M2
2243 vpsrlq \$32,$R4,$T4
2244 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2245 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2246 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2247 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2248 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2249
2250 vpmuludq $T3,$S2,$M0
2251 vpmuludq $T3,$R0,$M3
2252 vpmuludq $T3,$R1,$M4
2253 vpmuludq $T3,$S3,$M1
2254 vpmuludq $T3,$S4,$M2
2255 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2256 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2257 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2258 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2259 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2260
2261 vpmuludq $T4,$S4,$M3
2262 vpmuludq $T4,$R0,$M4
2263 vpmuludq $T4,$S1,$M0
2264 vpmuludq $T4,$S2,$M1
2265 vpmuludq $T4,$S3,$M2
2266	vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2267	vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2268	vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2269	vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2270	vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2271
2272 ################################################################
2273 # load input
2274 vmovdqu64 16*0($inp),%z#$T3
2275 vmovdqu64 16*4($inp),%z#$T4
2276 lea 16*8($inp),$inp
2277
2278 ################################################################
2279 # lazy reduction
2280
2281 vpsrlq \$26,$D3,$M3
2282 vpandq $MASK,$D3,$D3
2283 vpaddq $M3,$D4,$D4 # d3 -> d4
2284
2285 vpsrlq \$26,$D0,$M0
2286 vpandq $MASK,$D0,$D0
2287 vpaddq $M0,$D1,$D1 # d0 -> d1
2288
2289 vpsrlq \$26,$D4,$M4
2290 vpandq $MASK,$D4,$D4
2291
2292 vpsrlq \$26,$D1,$M1
2293 vpandq $MASK,$D1,$D1
2294 vpaddq $M1,$D2,$D2 # d1 -> d2
2295
2296 vpaddq $M4,$D0,$D0
2297 vpsllq \$2,$M4,$M4
2298 vpaddq $M4,$D0,$D0 # d4 -> d0
2299
2300 vpsrlq \$26,$D2,$M2
2301 vpandq $MASK,$D2,$D2
2302 vpaddq $M2,$D3,$D3 # d2 -> d3
2303
2304 vpsrlq \$26,$D0,$M0
2305 vpandq $MASK,$D0,$D0
2306 vpaddq $M0,$D1,$D1 # d0 -> d1
2307
2308 vpsrlq \$26,$D3,$M3
2309 vpandq $MASK,$D3,$D3
2310 vpaddq $M3,$D4,$D4 # d3 -> d4
2311
2312	################################################################
2313 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2314 # $D0-$D4, ...
2315
2316 vpunpcklqdq $T4,$T3,$T0 # transpose input
2317 vpunpckhqdq $T4,$T3,$T4
2318
2319 # ... since input 64-bit lanes are ordered as 73625140, we could
2320 # "vperm" it to 76543210 (here and in each loop iteration), *or*
2321 # we could just flow along, hence the goal for $R0-$S4 is
2322 # 1858286838784888 ...
2323
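	# (added note: in strings like 14243444, 05060708 or 1858286838784888
	# each digit names the power of r expected in the corresponding 32-bit
	# lane of the register being described.)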
2324 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2325 mov \$0x7777,%eax
2326	kmovw %eax,%k1
2327
2328 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2329 vpermd $R1,$M0,$R1
2330 vpermd $R2,$M0,$R2
2331 vpermd $R3,$M0,$R3
2332 vpermd $R4,$M0,$R4
2333
2334 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2335 vpermd $D1,$M0,${R1}{%k1}
2336 vpermd $D2,$M0,${R2}{%k1}
2337 vpermd $D3,$M0,${R3}{%k1}
2338 vpermd $D4,$M0,${R4}{%k1}
2339
2340 vpslld \$2,$R1,$S1 # *5
2341 vpslld \$2,$R2,$S2
2342 vpslld \$2,$R3,$S3
2343 vpslld \$2,$R4,$S4
2344 vpaddd $R1,$S1,$S1
2345 vpaddd $R2,$S2,$S2
2346 vpaddd $R3,$S3,$S3
2347 vpaddd $R4,$S4,$S4
2348
2349	vpbroadcastq 32(%rcx),$PADBIT # .L129
2350
2351 vpsrlq \$52,$T0,$T2 # splat input
2352 vpsllq \$12,$T4,$T3
2353 vporq $T3,$T2,$T2
2354	vpsrlq \$26,$T0,$T1
2355	vpsrlq \$14,$T4,$T3
2356 vpsrlq \$40,$T4,$T4 # 4
2357 vpandq $MASK,$T2,$T2 # 2
2358 vpandq $MASK,$T0,$T0 # 0
2359 #vpandq $MASK,$T1,$T1 # 1
2360 #vpandq $MASK,$T3,$T3 # 3
2361 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2362
2363 vpaddq $H2,$T2,$H2 # accumulate input
2364 sub \$192,$len
2365 jbe .Ltail_avx512
2366	jmp .Loop_avx512
2367
2368.align 32
2369.Loop_avx512:
2370 ################################################################
2371 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2372 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2373 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2374 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2375 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2376 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2377 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2378 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2379 # \________/\___________/
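	# (added note: each of the 8 parallel lanes runs its own Horner chain
	# with step r^8 over every 8th block; the tail then multiplies lane j
	# by r^(8-j) so that all lanes collapse into a single sum.)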
2380 ################################################################
2381 #vpaddq $H2,$T2,$H2 # accumulate input
2382
2383 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2384 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2385 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2386 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2387 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2388 #
2389	# however, as h2 is "chronologically" the first one available, pull the
2390	# corresponding operations up, so it becomes
2391 #
2392 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2393 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2394 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2395 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2396 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2397
2398 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2399 vpaddq $H0,$T0,$H0
2400	vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2401	vpandq $MASK,$T1,$T1 # 1
2402	vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2403	vpandq $MASK,$T3,$T3 # 3
2404 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2405 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2406 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2407 vpaddq $H1,$T1,$H1 # accumulate input
2408 vpaddq $H3,$T3,$H3
2409 vpaddq $H4,$T4,$H4
2410
2411 vmovdqu64 16*0($inp),$T3 # load input
2412 vmovdqu64 16*4($inp),$T4
2413 lea 16*8($inp),$inp
2414 vpmuludq $H0,$R3,$M3
2415 vpmuludq $H0,$R4,$M4
2416 vpmuludq $H0,$R0,$M0
2417 vpmuludq $H0,$R1,$M1
2418 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2419 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2420 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2421 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2422
2423 vpmuludq $H1,$R2,$M3
2424 vpmuludq $H1,$R3,$M4
2425 vpmuludq $H1,$S4,$M0
2426 vpmuludq $H0,$R2,$M2
2427 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2428 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2429 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2430 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2431
2432 vpunpcklqdq $T4,$T3,$T0 # transpose input
2433 vpunpckhqdq $T4,$T3,$T4
2434
2435 vpmuludq $H3,$R0,$M3
2436 vpmuludq $H3,$R1,$M4
2437 vpmuludq $H1,$R0,$M1
2438 vpmuludq $H1,$R1,$M2
2439 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2440 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2441 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2442 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2443
2444 vpmuludq $H4,$S4,$M3
2445 vpmuludq $H4,$R0,$M4
2446 vpmuludq $H3,$S2,$M0
2447 vpmuludq $H3,$S3,$M1
2448 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2449 vpmuludq $H3,$S4,$M2
2450 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2451 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2452 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2453 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2454
2455 vpmuludq $H4,$S1,$M0
2456 vpmuludq $H4,$S2,$M1
2457 vpmuludq $H4,$S3,$M2
2458 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2459	vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2460	vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2461
2462 ################################################################
2463 # lazy reduction (interleaved with input splat)
2464
2465 vpsrlq \$52,$T0,$T2 # splat input
2466 vpsllq \$12,$T4,$T3
2467
2468 vpsrlq \$26,$D3,$H3
2469 vpandq $MASK,$D3,$D3
2470 vpaddq $H3,$D4,$H4 # h3 -> h4
2471
2472 vporq $T3,$T2,$T2
2473
2474 vpsrlq \$26,$H0,$D0
2475 vpandq $MASK,$H0,$H0
2476 vpaddq $D0,$H1,$H1 # h0 -> h1
2477
2478 vpandq $MASK,$T2,$T2 # 2
2479
2480 vpsrlq \$26,$H4,$D4
2481 vpandq $MASK,$H4,$H4
2482
2483 vpsrlq \$26,$H1,$D1
2484 vpandq $MASK,$H1,$H1
2485 vpaddq $D1,$H2,$H2 # h1 -> h2
2486
2487 vpaddq $D4,$H0,$H0
2488 vpsllq \$2,$D4,$D4
2489 vpaddq $D4,$H0,$H0 # h4 -> h0
2490
2491	vpaddq $T2,$H2,$H2 # modulo-scheduled
2492 vpsrlq \$26,$T0,$T1
2493
2494 vpsrlq \$26,$H2,$D2
2495 vpandq $MASK,$H2,$H2
2496 vpaddq $D2,$D3,$H3 # h2 -> h3
2497
2498	vpsrlq \$14,$T4,$T3
2499
2500 vpsrlq \$26,$H0,$D0
2501 vpandq $MASK,$H0,$H0
2502 vpaddq $D0,$H1,$H1 # h0 -> h1
2503
2504 vpsrlq \$40,$T4,$T4 # 4
2505
2506 vpsrlq \$26,$H3,$D3
2507 vpandq $MASK,$H3,$H3
2508 vpaddq $D3,$H4,$H4 # h3 -> h4
2509
2510 vpandq $MASK,$T0,$T0 # 0
2511 #vpandq $MASK,$T1,$T1 # 1
2512 #vpandq $MASK,$T3,$T3 # 3
2513 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2514
2515 sub \$128,$len
2516 ja .Loop_avx512
2517
2518.Ltail_avx512:
2519 ################################################################
2520 # while above multiplications were by r^8 in all lanes, in last
2521 # iteration we multiply least significant lane by r^8 and most
2522 # significant one by r, that's why table gets shifted...
2523
2524	vpsrlq \$32,$R0,$R0 # 0105020603070408
2525 vpsrlq \$32,$R1,$R1
2526 vpsrlq \$32,$R2,$R2
2527 vpsrlq \$32,$S3,$S3
2528 vpsrlq \$32,$S4,$S4
2529 vpsrlq \$32,$R3,$R3
2530 vpsrlq \$32,$R4,$R4
2531 vpsrlq \$32,$S1,$S1
2532 vpsrlq \$32,$S2,$S2
2533
2534 ################################################################
2535 # load either next or last 64 byte of input
2536 lea ($inp,$len),$inp
2537
2538 #vpaddq $H2,$T2,$H2 # accumulate input
2539 vpaddq $H0,$T0,$H0
2540
2541 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2542 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2543 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2544	vpandq $MASK,$T1,$T1 # 1
2545	vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2546	vpandq $MASK,$T3,$T3 # 3
2547	vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2548 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2549 vpaddq $H1,$T1,$H1 # accumulate input
2550 vpaddq $H3,$T3,$H3
2551 vpaddq $H4,$T4,$H4
2552
2553	vmovdqu 16*0($inp),%x#$T0
2554 vpmuludq $H0,$R3,$M3
2555 vpmuludq $H0,$R4,$M4
2556 vpmuludq $H0,$R0,$M0
2557 vpmuludq $H0,$R1,$M1
2558 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2559 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2560 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2561 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2562
2563	vmovdqu 16*1($inp),%x#$T1
2564 vpmuludq $H1,$R2,$M3
2565 vpmuludq $H1,$R3,$M4
2566 vpmuludq $H1,$S4,$M0
2567 vpmuludq $H0,$R2,$M2
2568 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2569 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2570 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2571 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2572
2573	vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2574 vpmuludq $H3,$R0,$M3
2575 vpmuludq $H3,$R1,$M4
2576 vpmuludq $H1,$R0,$M1
2577 vpmuludq $H1,$R1,$M2
2578 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2579 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2580 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2581 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2582
2583	vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2584 vpmuludq $H4,$S4,$M3
2585 vpmuludq $H4,$R0,$M4
2586 vpmuludq $H3,$S2,$M0
2587 vpmuludq $H3,$S3,$M1
2588 vpmuludq $H3,$S4,$M2
2589 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2590 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2591 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2592 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2593 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2594
2595 vpmuludq $H4,$S1,$M0
2596 vpmuludq $H4,$S2,$M1
2597 vpmuludq $H4,$S3,$M2
2598 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2599	vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2600	vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2601
2602 ################################################################
2603 # horizontal addition
2604
2605 mov \$1,%eax
2606 vpermq \$0xb1,$H3,$D3
2607 vpermq \$0xb1,$D4,$H4
2608 vpermq \$0xb1,$H0,$D0
2609 vpermq \$0xb1,$H1,$D1
2610 vpermq \$0xb1,$H2,$D2
2611 vpaddq $D3,$H3,$H3
2612 vpaddq $D4,$H4,$H4
2613 vpaddq $D0,$H0,$H0
2614 vpaddq $D1,$H1,$H1
2615 vpaddq $D2,$H2,$H2
2616
2617 kmovw %eax,%k3
2618 vpermq \$0x2,$H3,$D3
2619 vpermq \$0x2,$H4,$D4
2620 vpermq \$0x2,$H0,$D0
2621 vpermq \$0x2,$H1,$D1
2622 vpermq \$0x2,$H2,$D2
2623 vpaddq $D3,$H3,$H3
2624 vpaddq $D4,$H4,$H4
2625 vpaddq $D0,$H0,$H0
2626 vpaddq $D1,$H1,$H1
2627 vpaddq $D2,$H2,$H2
2628
2629 vextracti64x4 \$0x1,$H3,%y#$D3
2630 vextracti64x4 \$0x1,$H4,%y#$D4
2631 vextracti64x4 \$0x1,$H0,%y#$D0
2632 vextracti64x4 \$0x1,$H1,%y#$D1
2633 vextracti64x4 \$0x1,$H2,%y#$D2
2634 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2635 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2636 vpaddq $D0,$H0,${H0}{%k3}{z}
2637 vpaddq $D1,$H1,${H1}{%k3}{z}
2638 vpaddq $D2,$H2,${H2}{%k3}{z}
2639___
2640map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2641map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2642$code.=<<___;
2643 ################################################################
2644 # lazy reduction (interleaved with input splat)
2645
2646 vpsrlq \$26,$H3,$D3
2647	vpand $MASK,$H3,$H3
2648 vpsrldq \$6,$T0,$T2 # splat input
2649 vpsrldq \$6,$T1,$T3
2650 vpunpckhqdq $T1,$T0,$T4 # 4
2651 vpaddq $D3,$H4,$H4 # h3 -> h4
2652
2653 vpsrlq \$26,$H0,$D0
2654	vpand $MASK,$H0,$H0
2655 vpunpcklqdq $T3,$T2,$T2 # 2:3
2656 vpunpcklqdq $T1,$T0,$T0 # 0:1
2657 vpaddq $D0,$H1,$H1 # h0 -> h1
2658
2659 vpsrlq \$26,$H4,$D4
2660	vpand $MASK,$H4,$H4
2661
2662 vpsrlq \$26,$H1,$D1
2663	vpand $MASK,$H1,$H1
2664 vpsrlq \$30,$T2,$T3
2665 vpsrlq \$4,$T2,$T2
2666 vpaddq $D1,$H2,$H2 # h1 -> h2
2667
2668 vpaddq $D4,$H0,$H0
2669 vpsllq \$2,$D4,$D4
2670 vpsrlq \$26,$T0,$T1
2671 vpsrlq \$40,$T4,$T4 # 4
2672 vpaddq $D4,$H0,$H0 # h4 -> h0
2673
2674 vpsrlq \$26,$H2,$D2
2675 vpand $MASK,$H2,$H2
2676 vpand $MASK,$T2,$T2 # 2
2677 vpand $MASK,$T0,$T0 # 0
2678 vpaddq $D2,$H3,$H3 # h2 -> h3
2679
2680 vpsrlq \$26,$H0,$D0
2681	vpand $MASK,$H0,$H0
2682	vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2683	vpand $MASK,$T1,$T1 # 1
2684 vpaddq $D0,$H1,$H1 # h0 -> h1
2685
2686 vpsrlq \$26,$H3,$D3
2687 vpand $MASK,$H3,$H3
2688 vpand $MASK,$T3,$T3 # 3
2689 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2690 vpaddq $D3,$H4,$H4 # h3 -> h4
2691
2692 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2693 add \$64,$len
2694 jnz .Ltail_avx2
2695
2696 vpsubq $T2,$H2,$H2 # undo input accumulation
2697 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2698 vmovd %x#$H1,`4*1-48-64`($ctx)
2699 vmovd %x#$H2,`4*2-48-64`($ctx)
2700 vmovd %x#$H3,`4*3-48-64`($ctx)
2701 vmovd %x#$H4,`4*4-48-64`($ctx)
2702	vzeroall
2703___
2704$code.=<<___ if ($win64);
2705 movdqa 0x50(%r11),%xmm6
2706 movdqa 0x60(%r11),%xmm7
2707 movdqa 0x70(%r11),%xmm8
2708 movdqa 0x80(%r11),%xmm9
2709 movdqa 0x90(%r11),%xmm10
2710 movdqa 0xa0(%r11),%xmm11
2711 movdqa 0xb0(%r11),%xmm12
2712 movdqa 0xc0(%r11),%xmm13
2713 movdqa 0xd0(%r11),%xmm14
2714 movdqa 0xe0(%r11),%xmm15
2715 lea 0xf8(%r11),%rsp
2716.Ldo_avx512_epilogue:
2717___
2718$code.=<<___ if (!$win64);
2719 lea 8(%r11),%rsp
2720.cfi_def_cfa %rsp,8
2721___
2722$code.=<<___;
2723	ret
2724.cfi_endproc
2725.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2726___
2727if ($avx>3) {
2728########################################################################
2729# VPMADD52 version using 2^44 radix.
2730#
2731# One can argue that base 2^52 would be more natural. Well, even though
2732# some operations would be more natural, one has to recognize a couple of
2733# things. Base 2^52 doesn't provide an advantage over base 2^44 if you look
2734# at the amount of multiply-n-accumulate operations. Secondly, it makes it
2735# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2736# reference implementations], which means that more such operations would
2737# have to be performed in the inner loop, which in turn makes the critical
2738# path longer. In other words, even though base 2^44 reduction might look
2739# less elegant, the overall critical path is actually shorter...
2740
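# Added editorial reference sketch (not used by the code generator): a scalar
# model of one 16-byte block in base 2^44, mirroring what each vpmadd52 lane
# computes.  The helper name and the use of Math::BigInt are illustrative
# assumptions rather than part of the original module.
sub poly1305_ref_block_base2_44 {
    my ($h, $r, $m0, $m1, $padbit) = @_;	# $h,$r: refs to 3 limbs; $m0,$m1: message words
    require Math::BigInt;
    my @h = map { Math::BigInt->new($_) } @$h;	# h0,h1,h2 = 44+44+42-bit limbs
    my @r = map { Math::BigInt->new($_) } @$r;	# r0,r1,r2 = clamped key limbs
    my $mask44 = (Math::BigInt->new(1) << 44) - 1;
    my $mask42 = (Math::BigInt->new(1) << 42) - 1;
    my ($s1, $s2) = ($r[1] * 20, $r[2] * 20);	# 2^132 = 20 (mod 2^130-5)

    # accumulate input: 44+44+40 bits plus the padbit on top
    $h[0] += Math::BigInt->new($m0) & $mask44;
    $h[1] += ((Math::BigInt->new($m0) >> 44) |
	      (Math::BigInt->new($m1) << 20)) & $mask44;
    $h[2] += (Math::BigInt->new($m1) >> 24) | (Math::BigInt->new($padbit) << 40);

    # schoolbook multiplication by r with the wrap-around folded in as s1/s2
    my $d0 = $h[0]*$r[0] + $h[1]*$s2   + $h[2]*$s1;
    my $d1 = $h[0]*$r[1] + $h[1]*$r[0] + $h[2]*$s2;
    my $d2 = $h[0]*$r[2] + $h[1]*$r[1] + $h[2]*$r[0];

    # carry propagation; the carry out of the 42-bit limb wraps back times 5
    $d1 += $d0 >> 44;		$h[0] = $d0 & $mask44;
    $d2 += $d1 >> 44;		$h[1] = $d1 & $mask44;
    $h[0] += ($d2 >> 42) * 5;	$h[2] = $d2 & $mask42;
    $h[1] += $h[0] >> 44;	$h[0] = $h[0] & $mask44;

    return \@h;			# partially reduced limbs
}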
2741########################################################################
2742# The layout of the opaque area is as follows.
2743#
2744# unsigned __int64 h[3]; # current hash value base 2^44
2745# unsigned __int64 s[2]; # key value*20 base 2^44
2746# unsigned __int64 r[3]; # key value base 2^44
2747# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2748# # r^n positions reflect
2749# # placement in register, not
2750# # memory, R[3] is R[1]*20
2751
2752$code.=<<___;
2753.type poly1305_init_base2_44,\@function,3
2754.align 32
2755poly1305_init_base2_44:
2756.cfi_startproc
2757 xor %rax,%rax
2758 mov %rax,0($ctx) # initialize hash value
2759 mov %rax,8($ctx)
2760 mov %rax,16($ctx)
2761
2762.Linit_base2_44:
2763 lea poly1305_blocks_vpmadd52(%rip),%r10
2764 lea poly1305_emit_base2_44(%rip),%r11
2765
2766 mov \$0x0ffffffc0fffffff,%rax
2767 mov \$0x0ffffffc0ffffffc,%rcx
2768 and 0($inp),%rax
2769 mov \$0x00000fffffffffff,%r8
2770 and 8($inp),%rcx
2771 mov \$0x00000fffffffffff,%r9
2772 and %rax,%r8
2773 shrd \$44,%rcx,%rax
2774 mov %r8,40($ctx) # r0
2775 and %r9,%rax
2776 shr \$24,%rcx
2777 mov %rax,48($ctx) # r1
2778 lea (%rax,%rax,4),%rax # *5
2779 mov %rcx,56($ctx) # r2
2780 shl \$2,%rax # magic <<2
2781 lea (%rcx,%rcx,4),%rcx # *5
2782 shl \$2,%rcx # magic <<2
2783 mov %rax,24($ctx) # s1
2784 mov %rcx,32($ctx) # s2
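	# (added note: the clamped 128-bit key is split into base 2^44 limbs
	# as r0 = key[43:0], r1 = key[87:44], r2 = key[127:88], and s1/s2 hold
	# 20*r1/20*r2 because 2^132 = 20 (mod 2^130-5).)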
2785	movq \$-1,64($ctx) # write impossible value
2786___
2787$code.=<<___ if ($flavour !~ /elf32/);
2788 mov %r10,0(%rdx)
2789 mov %r11,8(%rdx)
2790___
2791$code.=<<___ if ($flavour =~ /elf32/);
2792 mov %r10d,0(%rdx)
2793 mov %r11d,4(%rdx)
2794___
2795$code.=<<___;
2796 mov \$1,%eax
2797 ret
2798.cfi_endproc
2799.size poly1305_init_base2_44,.-poly1305_init_base2_44
2800___
2801{
2802my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2803my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2804my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2805
2806$code.=<<___;
2807.type poly1305_blocks_vpmadd52,\@function,4
2808.align 32
2809poly1305_blocks_vpmadd52:
2810.cfi_startproc
2811	endbranch
2812 shr \$4,$len
2813 jz .Lno_data_vpmadd52 # too short
2814
2815 shl \$40,$padbit
2816 mov 64($ctx),%r8 # peek on power of the key
2817
2818 # if powers of the key are not calculated yet, process up to 3
2819 # blocks with this single-block subroutine, otherwise ensure that
2820 # length is divisible by 2 blocks and pass the rest down to next
2821 # subroutine...
2822
2823 mov \$3,%rax
2824 mov \$1,%r10
2825	cmp \$4,$len # is input long enough?
2826 cmovae %r10,%rax
2827 test %r8,%r8 # is power value impossible?
2828 cmovns %r10,%rax
2829
2830 and $len,%rax # is input of favourable length?
2831 jz .Lblocks_vpmadd52_4x
2832
2833 sub %rax,$len
2834 mov \$7,%r10d
2835 mov \$1,%r11d
2836 kmovw %r10d,%k7
2837 lea .L2_44_inp_permd(%rip),%r10
2838 kmovw %r11d,%k1
2839
2840 vmovq $padbit,%x#$PAD
2841 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2842 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2843 vpermq \$0xcf,$PAD,$PAD
2844 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2845
2846 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2847 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2848 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2849 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2850
2851 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2852 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2853
2854 jmp .Loop_vpmadd52
2855
2856.align 32
2857.Loop_vpmadd52:
2858 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2859 lea 16($inp),$inp
2860
2861 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2862 vpsrlvq $inp_shift,$T0,$T0
2863 vpandq $reduc_mask,$T0,$T0
2864 vporq $PAD,$T0,$T0
2865
2866 vpaddq $T0,$Dlo,$Dlo # accumulate input
2867
2868 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2869 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2870 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2871
2872 vpxord $Dlo,$Dlo,$Dlo
2873 vpxord $Dhi,$Dhi,$Dhi
2874
2875 vpmadd52luq $r2r1r0,$H0,$Dlo
2876 vpmadd52huq $r2r1r0,$H0,$Dhi
2877
2878 vpmadd52luq $r1r0s2,$H1,$Dlo
2879 vpmadd52huq $r1r0s2,$H1,$Dhi
2880
2881 vpmadd52luq $r0s2s1,$H2,$Dlo
2882 vpmadd52huq $r0s2s1,$H2,$Dhi
2883
2884 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2885 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2886 vpandq $reduc_mask,$Dlo,$Dlo
2887
2888 vpaddq $T0,$Dhi,$Dhi
2889
2890 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2891
2892 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2893
2894 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2895 vpandq $reduc_mask,$Dlo,$Dlo
2896
2897 vpermq \$0b10010011,$T0,$T0
2898
2899 vpaddq $T0,$Dlo,$Dlo
2900
2901 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2902
2903 vpaddq $T0,$Dlo,$Dlo
2904 vpsllq \$2,$T0,$T0
2905
2906 vpaddq $T0,$Dlo,$Dlo
2907
2908	dec %rax # len-=16
2909 jnz .Loop_vpmadd52
2910
2911 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2912
2913 test $len,$len
2914 jnz .Lblocks_vpmadd52_4x
2915
2916.Lno_data_vpmadd52:
2917 ret
2918.cfi_endproc
2919.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2920___
2921}
2922{
2923########################################################################
2924# As implied by its name, the 4x subroutine processes 4 blocks in parallel
2925# (but also handles lengths of 4*n+2 blocks). It takes up to the 4th power
2926# of the key, and the computation is handled in 256-bit %ymm registers.
2927
2928my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2929my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2930my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2931
2932$code.=<<___;
2933.type poly1305_blocks_vpmadd52_4x,\@function,4
2934.align 32
2935poly1305_blocks_vpmadd52_4x:
2936.cfi_startproc
2937 shr \$4,$len
2938 jz .Lno_data_vpmadd52_4x # too short
2939
2940 shl \$40,$padbit
2941 mov 64($ctx),%r8 # peek on power of the key
2942
2943.Lblocks_vpmadd52_4x:
2944 vpbroadcastq $padbit,$PAD
2945
2946 vmovdqa64 .Lx_mask44(%rip),$mask44
2947 mov \$5,%eax
2948 vmovdqa64 .Lx_mask42(%rip),$mask42
2949 kmovw %eax,%k1 # used in 2x path
2950
2951 test %r8,%r8 # is power value impossible?
2952 js .Linit_vpmadd52 # if it is, then init R[4]
2953
2954 vmovq 0($ctx),%x#$H0 # load current hash value
2955 vmovq 8($ctx),%x#$H1
2956 vmovq 16($ctx),%x#$H2
2957
2958 test \$3,$len # is length 4*n+2?
2959 jnz .Lblocks_vpmadd52_2x_do
2960
2961.Lblocks_vpmadd52_4x_do:
2962 vpbroadcastq 64($ctx),$R0 # load 4th power of the key
2963 vpbroadcastq 96($ctx),$R1
2964 vpbroadcastq 128($ctx),$R2
2965 vpbroadcastq 160($ctx),$S1
2966
2967.Lblocks_vpmadd52_4x_key_loaded:
2968 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
2969 vpaddq $R2,$S2,$S2
2970 vpsllq \$2,$S2,$S2
2971
2972 test \$7,$len # is len 8*n?
2973 jz .Lblocks_vpmadd52_8x
2974
2975 vmovdqu64 16*0($inp),$T2 # load data
2976 vmovdqu64 16*2($inp),$T3
2977 lea 16*4($inp),$inp
2978
2979 vpunpcklqdq $T3,$T2,$T1 # transpose data
2980 vpunpckhqdq $T3,$T2,$T3
2981
2982 # at this point 64-bit lanes are ordered as 3-1-2-0
2983
2984 vpsrlq \$24,$T3,$T2 # splat the data
2985 vporq $PAD,$T2,$T2
2986 vpaddq $T2,$H2,$H2 # accumulate input
2987 vpandq $mask44,$T1,$T0
2988 vpsrlq \$44,$T1,$T1
2989 vpsllq \$20,$T3,$T3
2990 vporq $T3,$T1,$T1
2991 vpandq $mask44,$T1,$T1
2992
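	# (added note: each 16-byte block, viewed as two 64-bit words lo:hi,
	# is split into base 2^44 limbs t0 = lo[43:0], t1 = hi[23:0]:lo[63:44]
	# and t2 = hi[63:24] plus the padbit at bit 40; t2 is accumulated into
	# the top hash limb right away.)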
2993 sub \$4,$len
2994 jz .Ltail_vpmadd52_4x
2995 jmp .Loop_vpmadd52_4x
2996 ud2
2997
2998.align 32
2999.Linit_vpmadd52:
3000 vmovq 24($ctx),%x#$S1 # load key
3001 vmovq 56($ctx),%x#$H2
3002 vmovq 32($ctx),%x#$S2
3003 vmovq 40($ctx),%x#$R0
3004 vmovq 48($ctx),%x#$R1
3005
3006 vmovdqa $R0,$H0
3007 vmovdqa $R1,$H1
3008 vmovdqa $H2,$R2
3009
3010 mov \$2,%eax
3011
3012.Lmul_init_vpmadd52:
3013 vpxorq $D0lo,$D0lo,$D0lo
3014 vpmadd52luq $H2,$S1,$D0lo
3015 vpxorq $D0hi,$D0hi,$D0hi
3016 vpmadd52huq $H2,$S1,$D0hi
3017 vpxorq $D1lo,$D1lo,$D1lo
3018 vpmadd52luq $H2,$S2,$D1lo
3019 vpxorq $D1hi,$D1hi,$D1hi
3020 vpmadd52huq $H2,$S2,$D1hi
3021 vpxorq $D2lo,$D2lo,$D2lo
3022 vpmadd52luq $H2,$R0,$D2lo
3023 vpxorq $D2hi,$D2hi,$D2hi
3024 vpmadd52huq $H2,$R0,$D2hi
3025
3026 vpmadd52luq $H0,$R0,$D0lo
3027 vpmadd52huq $H0,$R0,$D0hi
3028 vpmadd52luq $H0,$R1,$D1lo
3029 vpmadd52huq $H0,$R1,$D1hi
3030 vpmadd52luq $H0,$R2,$D2lo
3031 vpmadd52huq $H0,$R2,$D2hi
3032
3033 vpmadd52luq $H1,$S2,$D0lo
3034 vpmadd52huq $H1,$S2,$D0hi
3035 vpmadd52luq $H1,$R0,$D1lo
3036 vpmadd52huq $H1,$R0,$D1hi
3037 vpmadd52luq $H1,$R1,$D2lo
3038 vpmadd52huq $H1,$R1,$D2hi
3039
3040 ################################################################
3041 # partial reduction
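	# (added note: vpmadd52luq/vpmadd52huq accumulate bits [51:0] and
	# [103:52] of each product, so a full sum is lo + hi*2^52; relative to
	# a 44-bit limb the high half therefore enters the next limb shifted
	# left by 52-44 = 8 bits, or by 52-42 = 10 for the top 42-bit limb.)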
3042 vpsrlq \$44,$D0lo,$tmp
3043 vpsllq \$8,$D0hi,$D0hi
3044 vpandq $mask44,$D0lo,$H0
3045 vpaddq $tmp,$D0hi,$D0hi
3046
3047 vpaddq $D0hi,$D1lo,$D1lo
3048
3049 vpsrlq \$44,$D1lo,$tmp
3050 vpsllq \$8,$D1hi,$D1hi
3051 vpandq $mask44,$D1lo,$H1
3052 vpaddq $tmp,$D1hi,$D1hi
3053
3054 vpaddq $D1hi,$D2lo,$D2lo
3055
3056 vpsrlq \$42,$D2lo,$tmp
3057 vpsllq \$10,$D2hi,$D2hi
3058 vpandq $mask42,$D2lo,$H2
3059 vpaddq $tmp,$D2hi,$D2hi
3060
3061 vpaddq $D2hi,$H0,$H0
3062 vpsllq \$2,$D2hi,$D2hi
3063
3064 vpaddq $D2hi,$H0,$H0
3065
3066 vpsrlq \$44,$H0,$tmp # additional step
3067 vpandq $mask44,$H0,$H0
3068
3069 vpaddq $tmp,$H1,$H1
3070
3071 dec %eax
3072 jz .Ldone_init_vpmadd52
3073
3074 vpunpcklqdq $R1,$H1,$R1 # 1,2
3075 vpbroadcastq %x#$H1,%x#$H1 # 2,2
3076 vpunpcklqdq $R2,$H2,$R2
3077 vpbroadcastq %x#$H2,%x#$H2
3078 vpunpcklqdq $R0,$H0,$R0
3079 vpbroadcastq %x#$H0,%x#$H0
3080
3081 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3082 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3083 vpaddq $R1,$S1,$S1
3084 vpaddq $R2,$S2,$S2
3085 vpsllq \$2,$S1,$S1
3086 vpsllq \$2,$S2,$S2
3087
3088 jmp .Lmul_init_vpmadd52
3089 ud2
3090
3091.align 32
3092.Ldone_init_vpmadd52:
3093 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3094 vinserti128 \$1,%x#$R2,$H2,$R2
3095 vinserti128 \$1,%x#$R0,$H0,$R0
3096
3097 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3098 vpermq \$0b11011000,$R2,$R2
3099 vpermq \$0b11011000,$R0,$R0
3100
3101 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3102 vpaddq $R1,$S1,$S1
3103 vpsllq \$2,$S1,$S1
3104
3105 vmovq 0($ctx),%x#$H0 # load current hash value
3106 vmovq 8($ctx),%x#$H1
3107 vmovq 16($ctx),%x#$H2
3108
3109 test \$3,$len # is length 4*n+2?
3110 jnz .Ldone_init_vpmadd52_2x
3111
3112 vmovdqu64 $R0,64($ctx) # save key powers
3113 vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3114 vmovdqu64 $R1,96($ctx)
3115 vpbroadcastq %x#$R1,$R1
3116 vmovdqu64 $R2,128($ctx)
3117 vpbroadcastq %x#$R2,$R2
3118 vmovdqu64 $S1,160($ctx)
3119 vpbroadcastq %x#$S1,$S1
3120
3121 jmp .Lblocks_vpmadd52_4x_key_loaded
3122 ud2
3123
3124.align 32
3125.Ldone_init_vpmadd52_2x:
3126 vmovdqu64 $R0,64($ctx) # save key powers
3127 vpsrldq \$8,$R0,$R0 # 0-1-0-2
3128 vmovdqu64 $R1,96($ctx)
3129 vpsrldq \$8,$R1,$R1
3130 vmovdqu64 $R2,128($ctx)
3131 vpsrldq \$8,$R2,$R2
3132 vmovdqu64 $S1,160($ctx)
3133 vpsrldq \$8,$S1,$S1
3134 jmp .Lblocks_vpmadd52_2x_key_loaded
3135 ud2
3136
3137.align 32
3138.Lblocks_vpmadd52_2x_do:
3139 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3140 vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3141 vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3142 vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3143
3144.Lblocks_vpmadd52_2x_key_loaded:
3145 vmovdqu64 16*0($inp),$T2 # load data
3146 vpxorq $T3,$T3,$T3
3147 lea 16*2($inp),$inp
3148
3149 vpunpcklqdq $T3,$T2,$T1 # transpose data
3150 vpunpckhqdq $T3,$T2,$T3
3151
3152 # at this point 64-bit lanes are ordered as x-1-x-0
3153
3154 vpsrlq \$24,$T3,$T2 # splat the data
3155 vporq $PAD,$T2,$T2
3156 vpaddq $T2,$H2,$H2 # accumulate input
3157 vpandq $mask44,$T1,$T0
3158 vpsrlq \$44,$T1,$T1
3159 vpsllq \$20,$T3,$T3
3160 vporq $T3,$T1,$T1
3161 vpandq $mask44,$T1,$T1
3162
3163 jmp .Ltail_vpmadd52_2x
3164 ud2
3165
3166.align 32
3167.Loop_vpmadd52_4x:
3168 #vpaddq $T2,$H2,$H2 # accumulate input
3169 vpaddq $T0,$H0,$H0
3170 vpaddq $T1,$H1,$H1
3171
3172 vpxorq $D0lo,$D0lo,$D0lo
3173 vpmadd52luq $H2,$S1,$D0lo
3174 vpxorq $D0hi,$D0hi,$D0hi
3175 vpmadd52huq $H2,$S1,$D0hi
3176 vpxorq $D1lo,$D1lo,$D1lo
3177 vpmadd52luq $H2,$S2,$D1lo
3178 vpxorq $D1hi,$D1hi,$D1hi
3179 vpmadd52huq $H2,$S2,$D1hi
3180 vpxorq $D2lo,$D2lo,$D2lo
3181 vpmadd52luq $H2,$R0,$D2lo
3182 vpxorq $D2hi,$D2hi,$D2hi
3183 vpmadd52huq $H2,$R0,$D2hi
3184
3185 vmovdqu64 16*0($inp),$T2 # load data
3186 vmovdqu64 16*2($inp),$T3
3187 lea 16*4($inp),$inp
3188 vpmadd52luq $H0,$R0,$D0lo
3189 vpmadd52huq $H0,$R0,$D0hi
3190 vpmadd52luq $H0,$R1,$D1lo
3191 vpmadd52huq $H0,$R1,$D1hi
3192 vpmadd52luq $H0,$R2,$D2lo
3193 vpmadd52huq $H0,$R2,$D2hi
3194
3195 vpunpcklqdq $T3,$T2,$T1 # transpose data
3196 vpunpckhqdq $T3,$T2,$T3
3197 vpmadd52luq $H1,$S2,$D0lo
3198 vpmadd52huq $H1,$S2,$D0hi
3199 vpmadd52luq $H1,$R0,$D1lo
3200 vpmadd52huq $H1,$R0,$D1hi
3201 vpmadd52luq $H1,$R1,$D2lo
3202 vpmadd52huq $H1,$R1,$D2hi
3203
3204 ################################################################
3205 # partial reduction (interleaved with data splat)
3206 vpsrlq \$44,$D0lo,$tmp
3207 vpsllq \$8,$D0hi,$D0hi
3208 vpandq $mask44,$D0lo,$H0
3209 vpaddq $tmp,$D0hi,$D0hi
3210
3211 vpsrlq \$24,$T3,$T2
3212 vporq $PAD,$T2,$T2
3213 vpaddq $D0hi,$D1lo,$D1lo
3214
3215 vpsrlq \$44,$D1lo,$tmp
3216 vpsllq \$8,$D1hi,$D1hi
3217 vpandq $mask44,$D1lo,$H1
3218 vpaddq $tmp,$D1hi,$D1hi
3219
3220 vpandq $mask44,$T1,$T0
3221 vpsrlq \$44,$T1,$T1
3222 vpsllq \$20,$T3,$T3
3223 vpaddq $D1hi,$D2lo,$D2lo
3224
3225 vpsrlq \$42,$D2lo,$tmp
3226 vpsllq \$10,$D2hi,$D2hi
3227 vpandq $mask42,$D2lo,$H2
3228 vpaddq $tmp,$D2hi,$D2hi
3229
3230 vpaddq $T2,$H2,$H2 # accumulate input
3231 vpaddq $D2hi,$H0,$H0
3232 vpsllq \$2,$D2hi,$D2hi
3233
3234 vpaddq $D2hi,$H0,$H0
3235 vporq $T3,$T1,$T1
3236 vpandq $mask44,$T1,$T1
3237
3238 vpsrlq \$44,$H0,$tmp # additional step
3239 vpandq $mask44,$H0,$H0
3240
3241 vpaddq $tmp,$H1,$H1
3242
3243 sub \$4,$len # len-=64
3244 jnz .Loop_vpmadd52_4x
3245
3246.Ltail_vpmadd52_4x:
3247 vmovdqu64 128($ctx),$R2 # load all key powers
3248 vmovdqu64 160($ctx),$S1
3249 vmovdqu64 64($ctx),$R0
3250 vmovdqu64 96($ctx),$R1
3251
3252.Ltail_vpmadd52_2x:
3253 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3254 vpaddq $R2,$S2,$S2
3255 vpsllq \$2,$S2,$S2
3256
3257 #vpaddq $T2,$H2,$H2 # accumulate input
3258 vpaddq $T0,$H0,$H0
3259 vpaddq $T1,$H1,$H1
3260
3261 vpxorq $D0lo,$D0lo,$D0lo
3262 vpmadd52luq $H2,$S1,$D0lo
3263 vpxorq $D0hi,$D0hi,$D0hi
3264 vpmadd52huq $H2,$S1,$D0hi
3265 vpxorq $D1lo,$D1lo,$D1lo
3266 vpmadd52luq $H2,$S2,$D1lo
3267 vpxorq $D1hi,$D1hi,$D1hi
3268 vpmadd52huq $H2,$S2,$D1hi
3269 vpxorq $D2lo,$D2lo,$D2lo
3270 vpmadd52luq $H2,$R0,$D2lo
3271 vpxorq $D2hi,$D2hi,$D2hi
3272 vpmadd52huq $H2,$R0,$D2hi
3273
3274 vpmadd52luq $H0,$R0,$D0lo
3275 vpmadd52huq $H0,$R0,$D0hi
3276 vpmadd52luq $H0,$R1,$D1lo
3277 vpmadd52huq $H0,$R1,$D1hi
3278 vpmadd52luq $H0,$R2,$D2lo
3279 vpmadd52huq $H0,$R2,$D2hi
3280
3281 vpmadd52luq $H1,$S2,$D0lo
3282 vpmadd52huq $H1,$S2,$D0hi
3283 vpmadd52luq $H1,$R0,$D1lo
3284 vpmadd52huq $H1,$R0,$D1hi
3285 vpmadd52luq $H1,$R1,$D2lo
3286 vpmadd52huq $H1,$R1,$D2hi
3287
3288 ################################################################
3289 # horizontal addition
3290
3291 mov \$1,%eax
3292 kmovw %eax,%k1
3293 vpsrldq \$8,$D0lo,$T0
3294 vpsrldq \$8,$D0hi,$H0
3295 vpsrldq \$8,$D1lo,$T1
3296 vpsrldq \$8,$D1hi,$H1
3297 vpaddq $T0,$D0lo,$D0lo
3298 vpaddq $H0,$D0hi,$D0hi
3299 vpsrldq \$8,$D2lo,$T2
3300 vpsrldq \$8,$D2hi,$H2
3301 vpaddq $T1,$D1lo,$D1lo
3302 vpaddq $H1,$D1hi,$D1hi
3303 vpermq \$0x2,$D0lo,$T0
3304 vpermq \$0x2,$D0hi,$H0
3305 vpaddq $T2,$D2lo,$D2lo
3306 vpaddq $H2,$D2hi,$D2hi
3307
3308 vpermq \$0x2,$D1lo,$T1
3309 vpermq \$0x2,$D1hi,$H1
3310 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3311 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3312 vpermq \$0x2,$D2lo,$T2
3313 vpermq \$0x2,$D2hi,$H2
3314 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3315 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3316 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3317 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3318
3319 ################################################################
3320 # partial reduction
3321 vpsrlq \$44,$D0lo,$tmp
3322 vpsllq \$8,$D0hi,$D0hi
3323 vpandq $mask44,$D0lo,$H0
3324 vpaddq $tmp,$D0hi,$D0hi
3325
3326 vpaddq $D0hi,$D1lo,$D1lo
3327
3328 vpsrlq \$44,$D1lo,$tmp
3329 vpsllq \$8,$D1hi,$D1hi
3330 vpandq $mask44,$D1lo,$H1
3331 vpaddq $tmp,$D1hi,$D1hi
3332
3333 vpaddq $D1hi,$D2lo,$D2lo
3334
3335 vpsrlq \$42,$D2lo,$tmp
3336 vpsllq \$10,$D2hi,$D2hi
3337 vpandq $mask42,$D2lo,$H2
3338 vpaddq $tmp,$D2hi,$D2hi
3339
3340 vpaddq $D2hi,$H0,$H0
3341 vpsllq \$2,$D2hi,$D2hi
3342
3343 vpaddq $D2hi,$H0,$H0
3344
3345 vpsrlq \$44,$H0,$tmp # additional step
3346 vpandq $mask44,$H0,$H0
3347
3348 vpaddq $tmp,$H1,$H1
3349 # at this point $len is
3350 # either 4*n+2 or 0...
3351 sub \$2,$len # len-=32
3352 ja .Lblocks_vpmadd52_4x_do
3353
3354 vmovq %x#$H0,0($ctx)
3355 vmovq %x#$H1,8($ctx)
3356 vmovq %x#$H2,16($ctx)
3357	vzeroall
3358
3359.Lno_data_vpmadd52_4x:
3360 ret
3361.cfi_endproc
3362.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3363___
3364}
3365{
3366########################################################################
3367# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
3368# This is an intermediate version, as it's used only in cases when the input
3369# length is either 8*n, 8*n+1 or 8*n+2...
3370
3371my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3372my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3373my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3374my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3375
3376$code.=<<___;
3377.type poly1305_blocks_vpmadd52_8x,\@function,4
3378.align 32
3379poly1305_blocks_vpmadd52_8x:
3380.cfi_startproc
3381 shr \$4,$len
3382 jz .Lno_data_vpmadd52_8x # too short
3383
3384 shl \$40,$padbit
3385 mov 64($ctx),%r8 # peek on power of the key
3386
3387 vmovdqa64 .Lx_mask44(%rip),$mask44
3388 vmovdqa64 .Lx_mask42(%rip),$mask42
3389
3390 test %r8,%r8 # is power value impossible?
3391 js .Linit_vpmadd52 # if it is, then init R[4]
3392
3393 vmovq 0($ctx),%x#$H0 # load current hash value
3394 vmovq 8($ctx),%x#$H1
3395 vmovq 16($ctx),%x#$H2
3396
3397.Lblocks_vpmadd52_8x:
3398 ################################################################
3399	# first we calculate more key powers
3400
3401 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3402 vmovdqu64 160($ctx),$S1
3403 vmovdqu64 64($ctx),$R0
3404 vmovdqu64 96($ctx),$R1
3405
3406 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3407 vpaddq $R2,$S2,$S2
3408 vpsllq \$2,$S2,$S2
3409
3410 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3411 vpbroadcastq %x#$R0,$RR0
3412 vpbroadcastq %x#$R1,$RR1
3413
3414 vpxorq $D0lo,$D0lo,$D0lo
3415 vpmadd52luq $RR2,$S1,$D0lo
3416 vpxorq $D0hi,$D0hi,$D0hi
3417 vpmadd52huq $RR2,$S1,$D0hi
3418 vpxorq $D1lo,$D1lo,$D1lo
3419 vpmadd52luq $RR2,$S2,$D1lo
3420 vpxorq $D1hi,$D1hi,$D1hi
3421 vpmadd52huq $RR2,$S2,$D1hi
3422 vpxorq $D2lo,$D2lo,$D2lo
3423 vpmadd52luq $RR2,$R0,$D2lo
3424 vpxorq $D2hi,$D2hi,$D2hi
3425 vpmadd52huq $RR2,$R0,$D2hi
3426
3427 vpmadd52luq $RR0,$R0,$D0lo
3428 vpmadd52huq $RR0,$R0,$D0hi
3429 vpmadd52luq $RR0,$R1,$D1lo
3430 vpmadd52huq $RR0,$R1,$D1hi
3431 vpmadd52luq $RR0,$R2,$D2lo
3432 vpmadd52huq $RR0,$R2,$D2hi
3433
3434 vpmadd52luq $RR1,$S2,$D0lo
3435 vpmadd52huq $RR1,$S2,$D0hi
3436 vpmadd52luq $RR1,$R0,$D1lo
3437 vpmadd52huq $RR1,$R0,$D1hi
3438 vpmadd52luq $RR1,$R1,$D2lo
3439 vpmadd52huq $RR1,$R1,$D2hi
3440
3441 ################################################################
3442 # partial reduction
3443 vpsrlq \$44,$D0lo,$tmp
3444 vpsllq \$8,$D0hi,$D0hi
3445 vpandq $mask44,$D0lo,$RR0
3446 vpaddq $tmp,$D0hi,$D0hi
3447
3448 vpaddq $D0hi,$D1lo,$D1lo
3449
3450 vpsrlq \$44,$D1lo,$tmp
3451 vpsllq \$8,$D1hi,$D1hi
3452 vpandq $mask44,$D1lo,$RR1
3453 vpaddq $tmp,$D1hi,$D1hi
3454
3455 vpaddq $D1hi,$D2lo,$D2lo
3456
3457 vpsrlq \$42,$D2lo,$tmp
3458 vpsllq \$10,$D2hi,$D2hi
3459 vpandq $mask42,$D2lo,$RR2
3460 vpaddq $tmp,$D2hi,$D2hi
3461
3462 vpaddq $D2hi,$RR0,$RR0
3463 vpsllq \$2,$D2hi,$D2hi
3464
3465 vpaddq $D2hi,$RR0,$RR0
3466
3467 vpsrlq \$44,$RR0,$tmp # additional step
3468 vpandq $mask44,$RR0,$RR0
3469
3470 vpaddq $tmp,$RR1,$RR1
3471
3472 ################################################################
3473 # At this point Rx holds 1324 powers, RRx - 5768, and the goal
3474 # is 15263748, which reflects how data is loaded...
3475
3476 vpunpcklqdq $R2,$RR2,$T2 # 3748
3477 vpunpckhqdq $R2,$RR2,$R2 # 1526
3478 vpunpcklqdq $R0,$RR0,$T0
3479 vpunpckhqdq $R0,$RR0,$R0
3480 vpunpcklqdq $R1,$RR1,$T1
3481 vpunpckhqdq $R1,$RR1,$R1
3482___
3483######## switch to %zmm
3484map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3485map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3486map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3487map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3488
3489$code.=<<___;
3490 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3491 vshufi64x2 \$0x44,$R0,$T0,$RR0
3492 vshufi64x2 \$0x44,$R1,$T1,$RR1
3493
3494 vmovdqu64 16*0($inp),$T2 # load data
3495 vmovdqu64 16*4($inp),$T3
3496 lea 16*8($inp),$inp
3497
3498 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3499 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3500 vpaddq $RR2,$SS2,$SS2
3501 vpaddq $RR1,$SS1,$SS1
3502 vpsllq \$2,$SS2,$SS2
3503 vpsllq \$2,$SS1,$SS1
3504
3505 vpbroadcastq $padbit,$PAD
3506 vpbroadcastq %x#$mask44,$mask44
3507 vpbroadcastq %x#$mask42,$mask42
3508
3509 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3510 vpbroadcastq %x#$SS2,$S2
3511 vpbroadcastq %x#$RR0,$R0
3512 vpbroadcastq %x#$RR1,$R1
3513 vpbroadcastq %x#$RR2,$R2
3514
3515 vpunpcklqdq $T3,$T2,$T1 # transpose data
3516 vpunpckhqdq $T3,$T2,$T3
3517
3518 # at this point 64-bit lanes are ordered as 73625140
3519
3520 vpsrlq \$24,$T3,$T2 # splat the data
3521 vporq $PAD,$T2,$T2
3522 vpaddq $T2,$H2,$H2 # accumulate input
3523 vpandq $mask44,$T1,$T0
3524 vpsrlq \$44,$T1,$T1
3525 vpsllq \$20,$T3,$T3
3526 vporq $T3,$T1,$T1
3527 vpandq $mask44,$T1,$T1
3528
3529 sub \$8,$len
3530 jz .Ltail_vpmadd52_8x
3531 jmp .Loop_vpmadd52_8x
3532
3533.align 32
3534.Loop_vpmadd52_8x:
3535 #vpaddq $T2,$H2,$H2 # accumulate input
3536 vpaddq $T0,$H0,$H0
3537 vpaddq $T1,$H1,$H1
3538
3539 vpxorq $D0lo,$D0lo,$D0lo
3540 vpmadd52luq $H2,$S1,$D0lo
3541 vpxorq $D0hi,$D0hi,$D0hi
3542 vpmadd52huq $H2,$S1,$D0hi
3543 vpxorq $D1lo,$D1lo,$D1lo
3544 vpmadd52luq $H2,$S2,$D1lo
3545 vpxorq $D1hi,$D1hi,$D1hi
3546 vpmadd52huq $H2,$S2,$D1hi
3547 vpxorq $D2lo,$D2lo,$D2lo
3548 vpmadd52luq $H2,$R0,$D2lo
3549 vpxorq $D2hi,$D2hi,$D2hi
3550 vpmadd52huq $H2,$R0,$D2hi
3551
3552 vmovdqu64 16*0($inp),$T2 # load data
3553 vmovdqu64 16*4($inp),$T3
3554 lea 16*8($inp),$inp
3555 vpmadd52luq $H0,$R0,$D0lo
3556 vpmadd52huq $H0,$R0,$D0hi
3557 vpmadd52luq $H0,$R1,$D1lo
3558 vpmadd52huq $H0,$R1,$D1hi
3559 vpmadd52luq $H0,$R2,$D2lo
3560 vpmadd52huq $H0,$R2,$D2hi
3561
3562 vpunpcklqdq $T3,$T2,$T1 # transpose data
3563 vpunpckhqdq $T3,$T2,$T3
3564 vpmadd52luq $H1,$S2,$D0lo
3565 vpmadd52huq $H1,$S2,$D0hi
3566 vpmadd52luq $H1,$R0,$D1lo
3567 vpmadd52huq $H1,$R0,$D1hi
3568 vpmadd52luq $H1,$R1,$D2lo
3569 vpmadd52huq $H1,$R1,$D2hi
3570
3571 ################################################################
3572 # partial reduction (interleaved with data splat)
3573 vpsrlq \$44,$D0lo,$tmp
3574 vpsllq \$8,$D0hi,$D0hi
3575 vpandq $mask44,$D0lo,$H0
3576 vpaddq $tmp,$D0hi,$D0hi
3577
3578 vpsrlq \$24,$T3,$T2
3579 vporq $PAD,$T2,$T2
3580 vpaddq $D0hi,$D1lo,$D1lo
3581
3582 vpsrlq \$44,$D1lo,$tmp
3583 vpsllq \$8,$D1hi,$D1hi
3584 vpandq $mask44,$D1lo,$H1
3585 vpaddq $tmp,$D1hi,$D1hi
3586
3587 vpandq $mask44,$T1,$T0
3588 vpsrlq \$44,$T1,$T1
3589 vpsllq \$20,$T3,$T3
3590 vpaddq $D1hi,$D2lo,$D2lo
3591
3592 vpsrlq \$42,$D2lo,$tmp
3593 vpsllq \$10,$D2hi,$D2hi
3594 vpandq $mask42,$D2lo,$H2
3595 vpaddq $tmp,$D2hi,$D2hi
3596
3597 vpaddq $T2,$H2,$H2 # accumulate input
3598 vpaddq $D2hi,$H0,$H0
3599 vpsllq \$2,$D2hi,$D2hi
3600
3601 vpaddq $D2hi,$H0,$H0
3602 vporq $T3,$T1,$T1
3603 vpandq $mask44,$T1,$T1
3604
3605 vpsrlq \$44,$H0,$tmp # additional step
3606 vpandq $mask44,$H0,$H0
3607
3608 vpaddq $tmp,$H1,$H1
3609
3610 sub \$8,$len # len-=128
3611 jnz .Loop_vpmadd52_8x
3612
3613.Ltail_vpmadd52_8x:
3614 #vpaddq $T2,$H2,$H2 # accumulate input
3615 vpaddq $T0,$H0,$H0
3616 vpaddq $T1,$H1,$H1
3617
3618 vpxorq $D0lo,$D0lo,$D0lo
3619 vpmadd52luq $H2,$SS1,$D0lo
3620 vpxorq $D0hi,$D0hi,$D0hi
3621 vpmadd52huq $H2,$SS1,$D0hi
3622 vpxorq $D1lo,$D1lo,$D1lo
3623 vpmadd52luq $H2,$SS2,$D1lo
3624 vpxorq $D1hi,$D1hi,$D1hi
3625 vpmadd52huq $H2,$SS2,$D1hi
3626 vpxorq $D2lo,$D2lo,$D2lo
3627 vpmadd52luq $H2,$RR0,$D2lo
3628 vpxorq $D2hi,$D2hi,$D2hi
3629 vpmadd52huq $H2,$RR0,$D2hi
3630
3631 vpmadd52luq $H0,$RR0,$D0lo
3632 vpmadd52huq $H0,$RR0,$D0hi
3633 vpmadd52luq $H0,$RR1,$D1lo
3634 vpmadd52huq $H0,$RR1,$D1hi
3635 vpmadd52luq $H0,$RR2,$D2lo
3636 vpmadd52huq $H0,$RR2,$D2hi
3637
3638 vpmadd52luq $H1,$SS2,$D0lo
3639 vpmadd52huq $H1,$SS2,$D0hi
3640 vpmadd52luq $H1,$RR0,$D1lo
3641 vpmadd52huq $H1,$RR0,$D1hi
3642 vpmadd52luq $H1,$RR1,$D2lo
3643 vpmadd52huq $H1,$RR1,$D2hi
3644
3645 ################################################################
3646 # horizontal addition
3647
3648 mov \$1,%eax
3649 kmovw %eax,%k1
3650 vpsrldq \$8,$D0lo,$T0
3651 vpsrldq \$8,$D0hi,$H0
3652 vpsrldq \$8,$D1lo,$T1
3653 vpsrldq \$8,$D1hi,$H1
3654 vpaddq $T0,$D0lo,$D0lo
3655 vpaddq $H0,$D0hi,$D0hi
3656 vpsrldq \$8,$D2lo,$T2
3657 vpsrldq \$8,$D2hi,$H2
3658 vpaddq $T1,$D1lo,$D1lo
3659 vpaddq $H1,$D1hi,$D1hi
3660 vpermq \$0x2,$D0lo,$T0
3661 vpermq \$0x2,$D0hi,$H0
3662 vpaddq $T2,$D2lo,$D2lo
3663 vpaddq $H2,$D2hi,$D2hi
3664
3665 vpermq \$0x2,$D1lo,$T1
3666 vpermq \$0x2,$D1hi,$H1
3667 vpaddq $T0,$D0lo,$D0lo
3668 vpaddq $H0,$D0hi,$D0hi
3669 vpermq \$0x2,$D2lo,$T2
3670 vpermq \$0x2,$D2hi,$H2
3671 vpaddq $T1,$D1lo,$D1lo
3672 vpaddq $H1,$D1hi,$D1hi
3673 vextracti64x4 \$1,$D0lo,%y#$T0
3674 vextracti64x4 \$1,$D0hi,%y#$H0
3675 vpaddq $T2,$D2lo,$D2lo
3676 vpaddq $H2,$D2hi,$D2hi
3677
3678 vextracti64x4 \$1,$D1lo,%y#$T1
3679 vextracti64x4 \$1,$D1hi,%y#$H1
3680 vextracti64x4 \$1,$D2lo,%y#$T2
3681 vextracti64x4 \$1,$D2hi,%y#$H2
3682___
3683######## switch back to %ymm
3684map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3685map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3686map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3687
3688$code.=<<___;
3689 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3690 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3691 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3692 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3693 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3694 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3695
3696 ################################################################
3697 # partial reduction
3698 vpsrlq \$44,$D0lo,$tmp
3699 vpsllq \$8,$D0hi,$D0hi
3700 vpandq $mask44,$D0lo,$H0
3701 vpaddq $tmp,$D0hi,$D0hi
3702
3703 vpaddq $D0hi,$D1lo,$D1lo
3704
3705 vpsrlq \$44,$D1lo,$tmp
3706 vpsllq \$8,$D1hi,$D1hi
3707 vpandq $mask44,$D1lo,$H1
3708 vpaddq $tmp,$D1hi,$D1hi
3709
3710 vpaddq $D1hi,$D2lo,$D2lo
3711
3712 vpsrlq \$42,$D2lo,$tmp
3713 vpsllq \$10,$D2hi,$D2hi
3714 vpandq $mask42,$D2lo,$H2
3715 vpaddq $tmp,$D2hi,$D2hi
3716
3717 vpaddq $D2hi,$H0,$H0
3718 vpsllq \$2,$D2hi,$D2hi
3719
3720 vpaddq $D2hi,$H0,$H0
3721
3722 vpsrlq \$44,$H0,$tmp # additional step
3723 vpandq $mask44,$H0,$H0
3724
3725 vpaddq $tmp,$H1,$H1
3726
3727 ################################################################
3728
3729 vmovq %x#$H0,0($ctx)
3730 vmovq %x#$H1,8($ctx)
3731 vmovq %x#$H2,16($ctx)
3732 vzeroall
3733
3734.Lno_data_vpmadd52_8x:
3735 ret
3736.cfi_endproc
3737.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3738___
3739}
3740$code.=<<___;
3741.type poly1305_emit_base2_44,\@function,3
3742.align 32
3743poly1305_emit_base2_44:
3744.cfi_startproc
3745	endbranch
3746 mov 0($ctx),%r8 # load hash value
3747 mov 8($ctx),%r9
3748 mov 16($ctx),%r10
3749
3750 mov %r9,%rax
3751 shr \$20,%r9
3752 shl \$44,%rax
3753 mov %r10,%rcx
3754 shr \$40,%r10
3755 shl \$24,%rcx
3756
3757 add %rax,%r8
3758 adc %rcx,%r9
3759 adc \$0,%r10
3760
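	# (added note: the 44/44/42-bit limbs are repacked into two 64-bit
	# words as lo = h0 | h1<<44 and hi = h1>>20 | h2<<24, with carries
	# propagated by the add/adc chain above.)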
3761 mov %r8,%rax
3762 add \$5,%r8 # compare to modulus
3763 mov %r9,%rcx
3764 adc \$0,%r9
3765 adc \$0,%r10
3766	shr \$2,%r10 # did 130-bit value overflow?
3767 cmovnz %r8,%rax
3768 cmovnz %r9,%rcx
3769
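	# (added note: the final reduction computes h+5 and tests bit 130; if
	# it is set then h >= 2^130-5, so the value to keep is h+5 mod 2^130,
	# i.e. h-(2^130-5), otherwise h is already fully reduced.)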
3770 add 0($nonce),%rax # accumulate nonce
3771 adc 8($nonce),%rcx
3772 mov %rax,0($mac) # write result
3773 mov %rcx,8($mac)
3774
3775 ret
3776.cfi_endproc
3777.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3778___
3779} } }
3780$code.=<<___;
3781.align 64
3782.Lconst:
3783.Lmask24:
3784.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3785.L129:
3786.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3787.Lmask26:
3788.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3789.Lpermd_avx2:
3790.long 2,2,2,3,2,0,2,1
3791.Lpermd_avx512:
3792.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3793
3794.L2_44_inp_permd:
3795.long 0,1,1,2,2,3,7,7
3796.L2_44_inp_shift:
3797.quad 0,12,24,64
3798.L2_44_mask:
3799.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3800.L2_44_shift_rgt:
3801.quad 44,44,42,64
3802.L2_44_shift_lft:
3803.quad 8,8,10,64
3804
3805.align 64
3806.Lx_mask44:
3807.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3808.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3809.Lx_mask42:
3810.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3811.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3812___
3813}
3814$code.=<<___;
3815.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3816.align 16
3817___
3818
3819{ # chacha20-poly1305 helpers
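# Added editorial note (an inference from the code below, not an original
# comment): these helpers appear to XOR the message with the keystream kept at
# $otp, write the result to $out, and leave a zero-padded copy of the
# ciphertext in the $otp buffer, presumably so it can be fed straight to
# Poly1305.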
3820my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3821 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
3822$code.=<<___;
3823.globl xor128_encrypt_n_pad
3824.type xor128_encrypt_n_pad,\@abi-omnipotent
3825.align 16
3826xor128_encrypt_n_pad:
3827.cfi_startproc
3828 sub $otp,$inp
3829 sub $otp,$out
3830 mov $len,%r10 # put len aside
3831 shr \$4,$len # len / 16
3832 jz .Ltail_enc
3833 nop
3834.Loop_enc_xmm:
3835 movdqu ($inp,$otp),%xmm0
3836 pxor ($otp),%xmm0
3837 movdqu %xmm0,($out,$otp)
3838 movdqa %xmm0,($otp)
3839 lea 16($otp),$otp
3840 dec $len
3841 jnz .Loop_enc_xmm
3842
3843 and \$15,%r10 # len % 16
3844 jz .Ldone_enc
3845
3846.Ltail_enc:
3847 mov \$16,$len
3848 sub %r10,$len
3849 xor %eax,%eax
3850.Loop_enc_byte:
3851 mov ($inp,$otp),%al
3852 xor ($otp),%al
3853 mov %al,($out,$otp)
3854 mov %al,($otp)
3855 lea 1($otp),$otp
3856 dec %r10
3857 jnz .Loop_enc_byte
3858
3859 xor %eax,%eax
3860.Loop_enc_pad:
3861 mov %al,($otp)
3862 lea 1($otp),$otp
3863 dec $len
3864 jnz .Loop_enc_pad
3865
3866.Ldone_enc:
3867 mov $otp,%rax
3868 ret
3869.cfi_endproc
3870.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3871
3872.globl xor128_decrypt_n_pad
3873.type xor128_decrypt_n_pad,\@abi-omnipotent
3874.align 16
3875xor128_decrypt_n_pad:
3876.cfi_startproc
3877 sub $otp,$inp
3878 sub $otp,$out
3879 mov $len,%r10 # put len aside
3880 shr \$4,$len # len / 16
3881 jz .Ltail_dec
3882 nop
3883.Loop_dec_xmm:
3884 movdqu ($inp,$otp),%xmm0
3885 movdqa ($otp),%xmm1
3886 pxor %xmm0,%xmm1
3887 movdqu %xmm1,($out,$otp)
3888 movdqa %xmm0,($otp)
3889 lea 16($otp),$otp
3890 dec $len
3891 jnz .Loop_dec_xmm
3892
3893 pxor %xmm1,%xmm1
3894 and \$15,%r10 # len % 16
3895 jz .Ldone_dec
3896
3897.Ltail_dec:
3898 mov \$16,$len
3899 sub %r10,$len
3900 xor %eax,%eax
3901 xor %r11,%r11
3902.Loop_dec_byte:
3903 mov ($inp,$otp),%r11b
3904 mov ($otp),%al
3905 xor %r11b,%al
3906 mov %al,($out,$otp)
3907 mov %r11b,($otp)
3908 lea 1($otp),$otp
3909 dec %r10
3910 jnz .Loop_dec_byte
3911
3912 xor %eax,%eax
3913.Loop_dec_pad:
3914 mov %al,($otp)
3915 lea 1($otp),$otp
3916 dec $len
3917 jnz .Loop_dec_pad
3918
3919.Ldone_dec:
3920 mov $otp,%rax
3921 ret
3922.cfi_endproc
3923.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3924___
3925}
3926
3927# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3928# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3929if ($win64) {
3930$rec="%rcx";
3931$frame="%rdx";
3932$context="%r8";
3933$disp="%r9";
3934
3935$code.=<<___;
3936.extern __imp_RtlVirtualUnwind
3937.type se_handler,\@abi-omnipotent
3938.align 16
3939se_handler:
3940 push %rsi
3941 push %rdi
3942 push %rbx
3943 push %rbp
3944 push %r12
3945 push %r13
3946 push %r14
3947 push %r15
3948 pushfq
3949 sub \$64,%rsp
3950
3951 mov 120($context),%rax # pull context->Rax
3952 mov 248($context),%rbx # pull context->Rip
3953
3954 mov 8($disp),%rsi # disp->ImageBase
3955 mov 56($disp),%r11 # disp->HandlerData
3956
3957 mov 0(%r11),%r10d # HandlerData[0]
3958 lea (%rsi,%r10),%r10 # prologue label
3959 cmp %r10,%rbx # context->Rip<.Lprologue
3960 jb .Lcommon_seh_tail
3961
3962 mov 152($context),%rax # pull context->Rsp
3963
3964 mov 4(%r11),%r10d # HandlerData[1]
3965 lea (%rsi,%r10),%r10 # epilogue label
3966 cmp %r10,%rbx # context->Rip>=.Lepilogue
3967 jae .Lcommon_seh_tail
3968
3969 lea 48(%rax),%rax
3970
3971 mov -8(%rax),%rbx
3972 mov -16(%rax),%rbp
3973 mov -24(%rax),%r12
3974 mov -32(%rax),%r13
3975 mov -40(%rax),%r14
3976 mov -48(%rax),%r15
3977 mov %rbx,144($context) # restore context->Rbx
3978 mov %rbp,160($context) # restore context->Rbp
3979 mov %r12,216($context) # restore context->R12
3980 mov %r13,224($context) # restore context->R13
3981 mov %r14,232($context) # restore context->R14
3982	mov %r15,240($context) # restore context->R15
3983
3984 jmp .Lcommon_seh_tail
3985.size se_handler,.-se_handler
3986
.type	avx_handler,\@abi-omnipotent
.align	16
avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	0x50(%rax),%rsi
	lea	0xf8(%rax),%rax
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

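# Common tail: propagate the recovered Rsp/Rsi/Rdi into the CONTEXT, copy
# it back over disp->ContextRecord, let RtlVirtualUnwind carry on with the
# unwind and return ExceptionContinueSearch.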
.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	avx_handler,.-avx_handler

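# SEH tables: .pdata holds one begin/end/unwind-info RVA triplet
# (a RUNTIME_FUNCTION entry) per covered code range, and .xdata below holds
# the UNWIND_INFO records those triplets point at.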
.section	.pdata
.align	4
	.rva	.LSEH_begin_poly1305_init
	.rva	.LSEH_end_poly1305_init
	.rva	.LSEH_info_poly1305_init

	.rva	.LSEH_begin_poly1305_blocks
	.rva	.LSEH_end_poly1305_blocks
	.rva	.LSEH_info_poly1305_blocks

	.rva	.LSEH_begin_poly1305_emit
	.rva	.LSEH_end_poly1305_emit
	.rva	.LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.Lbase2_64_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1

	.rva	.Lbase2_64_avx
	.rva	.Leven_avx
	.rva	.LSEH_info_poly1305_blocks_avx_2

	.rva	.Leven_avx
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1

	.rva	.Lbase2_64_avx2
	.rva	.Leven_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2

	.rva	.Leven_avx2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_poly1305_blocks_avx512
	.rva	.LSEH_end_poly1305_blocks_avx512
	.rva	.LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section	.xdata
.align	8
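# Each UNWIND_INFO record below encodes, if the standard layout is read
# correctly, version 1 with UNW_FLAG_EHANDLER set (the leading
# .byte 9,0,0,0: flags<<3|version, no prologue codes), followed by the RVA
# of the language-specific handler and by HandlerData[], i.e. the
# prologue/epilogue label pair that handler consults.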
.LSEH_info_poly1305_init:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init

.LSEH_info_poly1305_blocks:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
___
}

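# Final pass over the generated text: evaluate any backtick-quoted Perl
# expressions, rewrite the %reg#d shorthand as the corresponding 32-bit
# register names, and resolve the %x#%y-style register-width annotations
# down to a single xmm/ymm/zmm prefix before the code is printed.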
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";