# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
# (https://github.com/intel/intel-ipsec-mb).
# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
#
# References:
# [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
#     Intel Architecture Processors. August, 2010.
# [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
#     Intel Architecture Processors. October, 2012.
# [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its
#     Usage for Computing the GCM Mode. May, 2010.
#
#
# December 2021
#
# Initial release.
#
# The GCM128_CONTEXT structure has storage for only 16 hkeys, but this
# implementation can use up to 48 of them. To avoid extending the context
# size, only the first 16 hkeys are precomputed and stored in the context;
# the remaining ones are computed on demand and kept in the local frame.
#
#======================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$avx512vaes = 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  $avx512vaes = ($1 >= 2.30);
}

if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}

if (!$avx512vaes && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  $avx512vaes = ($2 >= 7.0);
}

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;

#======================================================================
if ($avx512vaes>0) { #<<<

$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
        mov OPENSSL_ia32cap_P+8(%rip), %rcx
        # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
        mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
        xor %eax,%eax
        and %rdx,%rcx
        cmp %rdx,%rcx
        cmove %rcx,%rax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___

# ; Mapping key length -> AES rounds count
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements
my $HKEYS_STORAGE_CAPACITY = 48;
my $LOCAL_STORAGE_CAPACITY = 48;
my $HKEYS_CONTEXT_CAPACITY = 16;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary

my $GP_STORAGE  = $win64 ? 8 * 8 : 8 * 6;    # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE = $win64 ? (10 * 16) : 0;    # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for up to 48 AES blocks

my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);

# ; This implementation follows the convention that non-leaf functions (those
# ; that must call PROLOG) use %rbp as a frame pointer; it has a fixed offset
# ; from the function entry: $GP_STORAGE + [8 alignment bytes (Windows only)].
# ; This simplifies writing SEH handlers.
#
# ; Leaf functions here do not use more than 4 input arguments.
if ($win64) {
  $arg1  = "%rcx";
  $arg2  = "%rdx";
  $arg3  = "%r8";
  $arg4  = "%r9";
  $arg5  = "`$GP_STORAGE + 8 + 8*5`(%rbp)";    # +8 - alignment bytes
  $arg6  = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  $arg7  = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
} else {
  $arg1  = "%rdi";
  $arg2  = "%rsi";
  $arg3  = "%rdx";
  $arg4  = "%rcx";
  $arg5  = "%r8";
  $arg6  = "%r9";
  $arg7  = "`$GP_STORAGE + 8*1`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8*2`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8*3`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
}
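# ; E.g. on Linux $arg7 resolves to `$GP_STORAGE + 8*1`(%rbp) = 56(%rbp):
# ; 48 bytes of saved GPRs plus the pushed return address sit between the
# ; frame pointer and the first stack argument.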

# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
my $CTX_OFFSET_CurCount  = (16 * 0);          # ; (Yi) Current counter for generation of encryption key
my $CTX_OFFSET_PEncBlock = (16 * 1);          # ; (repurposed EKi field) Partial block buffer
my $CTX_OFFSET_EK0       = (16 * 2);          # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
my $CTX_OFFSET_AadLen    = (16 * 3);          # ; (len.u[0]) Length of Hash which has been input
my $CTX_OFFSET_InLen     = ((16 * 3) + 8);    # ; (len.u[1]) Length of input data which will be encrypted or decrypted
my $CTX_OFFSET_AadHash   = (16 * 4);          # ; (Xi) Current hash
my $CTX_OFFSET_HTable    = (16 * 6);          # ; (Htable) Precomputed table (allows 16 values)

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Helper functions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; Generates "random" local labels
sub random_string() {
  my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  my $length = 15;
  my $str;
  # ; draw from the full character set (rand(33) would only cover a subset)
  map { $str .= $chars[rand(@chars)] } 1 .. $length;
  return $str;
}

sub BYTE {
  my ($reg) = @_;
  if ($reg =~ /%r[abcd]x/i) {
    $reg =~ s/%r([abcd])x/%${1}l/i;
  } elsif ($reg =~ /%r[sdb][ip]/i) {
    $reg =~ s/%r([sdb][ip])/%${1}l/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  } else {
    die "BYTE: unknown register: $reg\n";
  }
  return $reg;
}

sub WORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  } else {
    die "WORD: unknown register: $reg\n";
  }
  return $reg;
}

sub DWORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  } else {
    die "DWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub XWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%xmm/i;
  } else {
    die "XWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub YWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%ymm/i;
  } else {
    die "YWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub ZWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%zmm/i;
  } else {
    die "ZWORD: unknown register: $reg\n";
  }
  return $reg;
}
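# ; The six helpers above rewrite a register name to the requested width,
# ; e.g. BYTE("%rax") -> "%al", DWORD("%r10") -> "%r10d",
# ; XWORD("%zmm4") -> "%xmm4", ZWORD("%xmm4") -> "%zmm4".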

# ; Helper function to construct effective address based on two kinds of
# ; offsets: numerical or located in the register
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement = 0 if (!$displacement);

  if ($offset =~ /^\d+\z/) {    # numerical offset
    return "`$offset + $displacement`($base)";
  } else {                      # offset resides in register
    return "$displacement($base,$offset,1)";
  }
}
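# ; E.g. EffectiveAddress("%r10", 16, 32) returns "`16 + 32`(%r10)" (the
# ; backticked sum is evaluated later by the perlasm translator), while
# ; EffectiveAddress("%r10", "%r12", 32) returns "32(%r10,%r12,1)".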

# ; Provides memory location of corresponding HashKey power
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $base_str = ($base eq "%rsp") ? "frame" : "context";

  my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  return "$offset($base)";
}

# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  my $offset_base;
  my $offset_idx;
  if ($base eq "frame") {    # frame storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    $offset_base = $STACK_HKEYS_OFFSET;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  } else {                   # context storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
    $offset_base = $CTX_OFFSET_HTable;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  }
  return $offset_base + $offset_idx;
}
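# ; Higher powers are stored at lower addresses: in the frame HashKey^48 is
# ; at 0(%rsp) and HashKey^1 at 752(%rsp); in the context HashKey^16 is at
# ; Htable+0 and HashKey^1 at Htable+240. A 64-byte load at
# ; HashKeyByIdx(8, ...) therefore fetches HashKey^8..HashKey^5 at once.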

# ; Creates a local frame and backs up the non-volatile registers.
# ; Holds stack unwinding directives.
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;

  my $DYNAMIC_STACK_ALLOC_SIZE            = 0;
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  $code .= <<___;
        push %rbx
.cfi_push %rbx
.L${func_name}_seh_push_rbx:
        push %rbp
.cfi_push %rbp
.L${func_name}_seh_push_rbp:
        push %r12
.cfi_push %r12
.L${func_name}_seh_push_r12:
        push %r13
.cfi_push %r13
.L${func_name}_seh_push_r13:
        push %r14
.cfi_push %r14
.L${func_name}_seh_push_r14:
        push %r15
.cfi_push %r15
.L${func_name}_seh_push_r15:
___

  if ($win64) {
    $code .= <<___;
        push %rdi
.L${func_name}_seh_push_rdi:
        push %rsi
.L${func_name}_seh_push_rsi:

        sub \$`$XMM_STORAGE+8`,%rsp    # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }
  $code .= <<___;
        # ; %rbp contains the stack pointer right after the GP registers were
        # ; pushed [+ 8 bytes of alignment (Windows only)]. It serves as a
        # ; frame pointer in SEH handlers. The requirement for a frame pointer
        # ; is that its offset from RSP shall be a multiple of 16 and shall
        # ; not exceed 240 bytes. The frame pointer is reasonable to use here,
        # ; because the later 64-byte stack alignment gives non-deterministic
        # ; offsets and complicates writing SEH handlers.
        #
        # ; It also serves as an anchor for retrieving stack arguments on both
        # ; Linux and Windows.
        lea `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
# Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    $code .= <<___;
        sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
        and \$(-64),%rsp
___
  }
}
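# ; E.g. PROLOG(1, 1, $func) on Linux pushes six GPRs, sets %rbp, then
# ; subtracts 768 (hkeys) + 768 (local) + 52 (alignment space) bytes and
# ; rounds %rsp down to a 64-byte boundary.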

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Restore register content for the caller.
# ;;; And cleanup stack.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub EPILOG {
  my ($hkeys_storage_on_stack, $payload_len) = @_;

  my $rndsuffix = &random_string();

  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {

    # ; There is no need for hkeys cleanup if the payload length was small,
    # ; i.e. no hkeys were stored in the local frame storage
    $code .= <<___;
        cmpq \$`16*16`,$payload_len
        jbe .Lskip_hkeys_cleanup_${rndsuffix}
        vpxor %xmm0,%xmm0,%xmm0
___
    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
      $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
    }
    $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
  }

  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }

  if ($win64) {

    # ; restore xmm15:xmm6
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
___
    }
  }

  if ($win64) {

    # Forming valid epilog for SEH with use of frame pointer.
    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
    $code .= "lea 8(%rbp),%rsp\n";
  } else {
    $code .= "lea (%rbp),%rsp\n";
    $code .= ".cfi_def_cfa_register %rsp\n";
  }

  if ($win64) {
    $code .= <<___;
        pop %rsi
.cfi_pop %rsi
        pop %rdi
.cfi_pop %rdi
___
  }
  $code .= <<___;
        pop %r15
.cfi_pop %r15
        pop %r14
.cfi_pop %r14
        pop %r13
.cfi_pop %r13
        pop %r12
.cfi_pop %r12
        pop %rbp
.cfi_pop %rbp
        pop %rbx
.cfi_pop %rbx
___
}

# ; Clears all scratch ZMM registers
# ;
# ; It should be called before restoring the XMM registers
# ; for Windows (XMM6-XMM15).
# ;
sub clear_scratch_zmms_asm {

  # ; On Linux, all ZMM registers are scratch registers
  if (!$win64) {
    $code .= "vzeroall\n";
  } else {
    foreach my $i (0 .. 5) {
      $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
    }
  }
  foreach my $i (16 .. 31) {
    $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  }
}

# Clears all scratch GP registers
sub clear_scratch_gps_asm {
  foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
    $code .= "xor $reg,$reg\n";
  }
  if (!$win64) {
    foreach my $reg ("%rsi", "%rdi") {
      $code .= "xor $reg,$reg\n";
    }
  }
}

sub precompute_hkeys_on_stack {
  my $GCM128_CTX  = $_[0];
  my $HKEYS_READY = $_[1];
  my $ZTMP0       = $_[2];
  my $ZTMP1       = $_[3];
  my $ZTMP2       = $_[4];
  my $ZTMP3       = $_[5];
  my $ZTMP4       = $_[6];
  my $ZTMP5       = $_[7];
  my $ZTMP6       = $_[8];
  my $HKEYS_RANGE = $_[9];    # ; "first16", "mid16", "all", "first32", "last32"

  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
    if ($HKEYS_RANGE ne "first16"
    && $HKEYS_RANGE ne "mid16"
    && $HKEYS_RANGE ne "all"
    && $HKEYS_RANGE ne "first32"
    && $HKEYS_RANGE ne "last32");

  my $rndsuffix = &random_string();

  $code .= <<___;
        test $HKEYS_READY,$HKEYS_READY
        jnz .L_skip_hkeys_precomputation_${rndsuffix}
___

  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {

    # ; Fill the stack with the first 16 hkeys from the context
    $code .= <<___;
        # ; Move 16 hkeys from the context to stack
        vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
        vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
        vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
        vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
        vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
___
  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
        vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
___

  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=17..32
    my $i = 20;
    foreach (1 .. int((32 - 16) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
    my $i = 36;
    foreach (1 .. int((48 - 32) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
}
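# ; Multiplying a ZMM holding 4 consecutive hkey powers by broadcast
# ; HashKey^8 advances all 4 powers by 8 at once; e.g. HashKey^12..HashKey^9
# ; in $ZTMP2 times HashKey^8 yields HashKey^20..HashKey^17, stored via
# ; HashKeyByIdx(20, "%rsp").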

# ;; =============================================================================
# ;; Generic macro to produce code that executes $OPCODE instruction
# ;; on a selected number of AES blocks (16 bytes long) between 0 and 16.
# ;; All three operands of the instruction come from registers.
# ;; Note: if 3 blocks are left at the end, an instruction is produced that
# ;; operates on all 4 blocks (full width of ZMM)
sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  my $NUM_BLOCKS = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OPCODE     = $_[1];    # [in] instruction name
  my @DST;
  $DST[0] = $_[2];           # [out] destination ZMM register
  $DST[1] = $_[3];           # [out] destination ZMM register
  $DST[2] = $_[4];           # [out] destination ZMM register
  $DST[3] = $_[5];           # [out] destination ZMM register
  my @SRC1;
  $SRC1[0] = $_[6];          # [in] source 1 ZMM register
  $SRC1[1] = $_[7];          # [in] source 1 ZMM register
  $SRC1[2] = $_[8];          # [in] source 1 ZMM register
  $SRC1[3] = $_[9];          # [in] source 1 ZMM register
  my @SRC2;
  $SRC2[0] = $_[10];         # [in] source 2 ZMM register
  $SRC2[1] = $_[11];         # [in] source 2 ZMM register
  $SRC2[2] = $_[12];         # [in] source 2 ZMM register
  $SRC2[3] = $_[13];         # [in] source 2 ZMM register

  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $reg_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  foreach (1 .. ($NUM_BLOCKS / 4)) {
    $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
    $reg_idx++;
    $blocks_left -= 4;
  }

  my $DSTREG  = $DST[$reg_idx];
  my $SRC1REG = $SRC1[$reg_idx];
  my $SRC2REG = $SRC2[$reg_idx];

  if ($blocks_left == 1) {
    $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 2) {
    $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 3) {
    $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  }
}
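# ;; E.g. NUM_BLOCKS = 6 emits one full-width ZMM op for blocks 0-3 and one
# ;; YMM-width op for blocks 4-5, while NUM_BLOCKS = 7 uses a full ZMM op
# ;; for the trailing 3 blocks (see the note above).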

# ;; =============================================================================
# ;; Loads specified number of AES blocks into ZMM registers using mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Loads take place at 1 byte granularity.
sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $INP         = $_[1];    # [in] input data pointer to read from
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @DST;
  $DST[0] = $_[3];            # [out] ZMM register with loaded data
  $DST[1] = $_[4];            # [out] ZMM register with loaded data
  $DST[2] = $_[5];            # [out] ZMM register with loaded data
  $DST[3] = $_[6];            # [out] ZMM register with loaded data
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $src_offset  = 0;
  my $dst_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
      $src_offset += 64;
      $dst_idx++;
      $blocks_left -= 4;
    }
  }

  my $DSTREG = $DST[$dst_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif (($blocks_left == 3 || $blocks_left == 4)) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  }
}
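# ;; E.g. NUM_BLOCKS = 5 emits one unmasked 64-byte load into $DST[0] and a
# ;; masked XMM load into $DST[1]; the caller prepares $MASK to cover the
# ;; valid bytes of the last (possibly partial) block.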

# ;; =============================================================================
# ;; Stores specified number of AES blocks from ZMM registers with mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Stores take place at 1 byte granularity.
sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OUTP        = $_[1];    # [in] output data pointer to write to
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @SRC;
  $SRC[0] = $_[3];            # [in] ZMM register with data to store
  $SRC[1] = $_[4];            # [in] ZMM register with data to store
  $SRC[2] = $_[5];            # [in] ZMM register with data to store
  $SRC[3] = $_[6];            # [in] ZMM register with data to store
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $dst_offset  = 0;
  my $src_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
      $dst_offset += 64;
      $src_idx++;
      $blocks_left -= 4;
    }
  }

  my $SRCREG = $SRC[$src_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  }
}

# ;;; ===========================================================================
# ;;; Handles AES encryption rounds
# ;;; It handles special cases: the last and first rounds
# ;;; Optionally, it performs XOR with data after the last AES round.
# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  my $L0B0_3   = $_[0];     # [in/out] zmm; blocks 0 to 3
  my $L0B4_7   = $_[1];     # [in/out] zmm; blocks 4 to 7
  my $L0B8_11  = $_[2];     # [in/out] zmm; blocks 8 to 11
  my $L0B12_15 = $_[3];     # [in/out] zmm; blocks 12 to 15
  my $KEY      = $_[4];     # [in] zmm containing round key
  my $ROUND    = $_[5];     # [in] round number
  my $D0_3     = $_[6];     # [in] zmm or no_data; plain/cipher text blocks 0-3
  my $D4_7     = $_[7];     # [in] zmm or no_data; plain/cipher text blocks 4-7
  my $D8_11    = $_[8];     # [in] zmm or no_data; plain/cipher text blocks 8-11
  my $D12_15   = $_[9];     # [in] zmm or no_data; plain/cipher text blocks 12-15
  my $NUMBL    = $_[10];    # [in] number of blocks; numerical value
  my $NROUNDS  = $_[11];    # [in] number of rounds; numerical value

  # ;;; === first AES round
  if ($ROUND < 1) {

    # ;; round 0
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === middle AES rounds
  if ($ROUND >= 1 && $ROUND <= $NROUNDS) {

    # ;; rounds 1 to 9/11/13
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === last AES round
  if ($ROUND > $NROUNDS) {

    # ;; the last round - mix enclast with text xor's
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);

    # ;;; === XOR with data
    if (   ($D0_3 ne "no_data")
        && ($D4_7 ne "no_data")
        && ($D8_11 ne "no_data")
        && ($D12_15 ne "no_data"))
    {
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
        $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
    }
  }
}

# ;;; Horizontal XOR - 4 x 128bits xored together
sub VHPXORI4x128 {
  my $REG = $_[0];    # [in/out] ZMM with 4x128bits to xor; 128bit output
  my $TMP = $_[1];    # [clobbered] ZMM temporary register
  $code .= <<___;
        vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
        vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
        vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
        vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
___
}
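# ;;; I.e. for $REG = [d:c:b:a] (four 128-bit lanes) the low lane ends up
# ;;; holding a xor b xor c xor d: the upper 256 bits are folded onto the
# ;;; lower 256 bits, then the upper 128 bits of that onto the low 128 bits.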

# ;;; AVX512 reduction macro
sub VCLMUL_REDUCE {
  my $OUT   = $_[0];    # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  my $POLY  = $_[1];    # [in] zmm/ymm/xmm: polynomial
  my $HI128 = $_[2];    # [in] zmm/ymm/xmm: high 128b of hash to reduce
  my $LO128 = $_[3];    # [in] zmm/ymm/xmm: low 128b of hash to reduce
  my $TMP0  = $_[4];    # [in] zmm/ymm/xmm: temporary register
  my $TMP1  = $_[5];    # [in] zmm/ymm/xmm: temporary register

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; first phase of the reduction
        vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
        vpslldq \$8,$TMP0,$TMP0           # ; shift-L 2 DWs
        vpxorq $TMP0,$LO128,$TMP0         # ; first phase of the reduction complete
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; second phase of the reduction
        vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
        vpsrldq \$4,$TMP1,$TMP1           # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
        vpslldq \$4,$OUT,$OUT             # ; shift-L 1-DW to obtain result with no shifts
        vpternlogq \$0x96,$HI128,$TMP1,$OUT    # ; OUT/GHASH = OUT xor TMP1 xor HI128
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}

# ;; ===========================================================================
# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
# ;; - it is assumed that data read from $INPTR is already shuffled and
# ;;   $INPTR address is 64 byte aligned
# ;; - there is an option to pass ready blocks through ZMM registers too;
# ;;   in that case 4 extra parameters need to be passed and the 21st
# ;;   ($ZTMP9) argument can be empty
sub GHASH_16 {
  my $TYPE  = $_[0];     # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
                         # end_reduce (end with reduction), start_reduce
  my $GH    = $_[1];     # [in/out] ZMM ghash sum: high 128-bits
  my $GM    = $_[2];     # [in/out] ZMM ghash sum: middle 128-bits
  my $GL    = $_[3];     # [in/out] ZMM ghash sum: low 128-bits
  my $INPTR = $_[4];     # [in] data input pointer
  my $INOFF = $_[5];     # [in] data input offset
  my $INDIS = $_[6];     # [in] data input displacement
  my $HKPTR = $_[7];     # [in] hash key pointer
  my $HKOFF = $_[8];     # [in] hash key offset (can be either numerical offset, or register containing offset)
  my $HKDIS = $_[9];     # [in] hash key displacement
  my $HASH  = $_[10];    # [in/out] ZMM hash value in/out
  my $ZTMP0 = $_[11];    # [clobbered] temporary ZMM
  my $ZTMP1 = $_[12];    # [clobbered] temporary ZMM
  my $ZTMP2 = $_[13];    # [clobbered] temporary ZMM
  my $ZTMP3 = $_[14];    # [clobbered] temporary ZMM
  my $ZTMP4 = $_[15];    # [clobbered] temporary ZMM
  my $ZTMP5 = $_[16];    # [clobbered] temporary ZMM
  my $ZTMP6 = $_[17];    # [clobbered] temporary ZMM
  my $ZTMP7 = $_[18];    # [clobbered] temporary ZMM
  my $ZTMP8 = $_[19];    # [clobbered] temporary ZMM
  my $ZTMP9 = $_[20];    # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
  my $DAT0  = $_[21];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT1  = $_[22];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT2  = $_[23];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT3  = $_[24];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)

  my $start_ghash  = 0;
  my $do_reduction = 0;
  if ($TYPE eq "start") {
    $start_ghash = 1;
  }

  if ($TYPE eq "start_reduce") {
    $start_ghash  = 1;
    $do_reduction = 1;
  }

  if ($TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

  # ;; ghash blocks 0-3
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT0;
  }

  if ($start_ghash != 0) {
    $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0    # ; T0H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1    # ; T0L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2    # ; T0M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3    # ; T0M2 = a0*b1
___

  # ;; ghash blocks 4-7
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT1;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4    # ; T1H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5    # ; T1L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6    # ; T1M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7    # ; T1M2 = a0*b1
___

  # ;; update sums
  if ($start_ghash != 0) {
    $code .= <<___;
        vpxorq $ZTMP6,$ZTMP2,$GM                 # ; GM = T0M1 + T1M1
        vpxorq $ZTMP4,$ZTMP0,$GH                 # ; GH = T0H + T1H
        vpxorq $ZTMP5,$ZTMP1,$GL                 # ; GL = T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM      # ; GM += T0M2 + T1M2
___
  } else {    # ;; mid, end, end_reduce
    $code .= <<___;
        vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM      # ; GM += T0M1 + T1M1
        vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH      # ; GH += T0H + T1H
        vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL      # ; GL += T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM      # ; GM += T0M2 + T1M2
___
  }

  # ;; ghash blocks 8-11
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT2;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0    # ; T0H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1    # ; T0L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2    # ; T0M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3    # ; T0M2 = a0*b1
___

  # ;; ghash blocks 12-15
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT3;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4    # ; T1H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5    # ; T1L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6    # ; T1M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7    # ; T1M2 = a0*b1
        # ;; update sums
        vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM      # ; GM += T0M1 + T1M1
        vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH      # ; GH += T0H + T1H
        vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL      # ; GL += T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM      # ; GM += T0M2 + T1M2
___
  if ($do_reduction != 0) {
    $code .= <<___;
        # ;; integrate GM into GH and GL
        vpsrldq \$8,$GM,$ZTMP0
        vpslldq \$8,$GM,$ZTMP1
        vpxorq $ZTMP0,$GH,$GH
        vpxorq $ZTMP1,$GL,$GL
___

    # ;; add GH and GL 128-bit words horizontally
    &VHPXORI4x128($GH, $ZTMP0);
    &VHPXORI4x128($GL, $ZTMP1);

    # ;; reduction
    $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
    &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
  }
}
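# ;; A typical multi-pass caller chains GHASH_16 as "start" (xor the current
# ;; hash in and begin the GH/GM/GL sums), one or more "mid" passes
# ;; (accumulate), and finally "end_reduce" (accumulate and fold the sums
# ;; back into a single 128-bit hash) - see CALC_AAD_HASH below.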

# ;; ===========================================================================
# ;; GHASH 1 to 16 blocks of cipher text
# ;; - performs reduction at the end
# ;; - it doesn't load the data; it is assumed the data is already loaded and shuffled
sub GHASH_1_TO_16 {
  my $GCM128_CTX  = $_[0];     # [in] pointer to expanded keys
  my $GHASH       = $_[1];     # [out] ghash output
  my $T0H         = $_[2];     # [clobbered] temporary ZMM
  my $T0L         = $_[3];     # [clobbered] temporary ZMM
  my $T0M1        = $_[4];     # [clobbered] temporary ZMM
  my $T0M2        = $_[5];     # [clobbered] temporary ZMM
  my $T1H         = $_[6];     # [clobbered] temporary ZMM
  my $T1L         = $_[7];     # [clobbered] temporary ZMM
  my $T1M1        = $_[8];     # [clobbered] temporary ZMM
  my $T1M2        = $_[9];     # [clobbered] temporary ZMM
  my $HK          = $_[10];    # [clobbered] temporary ZMM
  my $AAD_HASH_IN = $_[11];    # [in] input hash value
  my @CIPHER_IN;
  $CIPHER_IN[0] = $_[12];      # [in] ZMM with cipher text blocks 0-3
  $CIPHER_IN[1] = $_[13];      # [in] ZMM with cipher text blocks 4-7
  $CIPHER_IN[2] = $_[14];      # [in] ZMM with cipher text blocks 8-11
  $CIPHER_IN[3] = $_[15];      # [in] ZMM with cipher text blocks 12-15
  my $NUM_BLOCKS = $_[16];     # [in] numerical value, number of blocks
  my $GH         = $_[17];     # [in] ZMM with hi product part
  my $GM         = $_[18];     # [in] ZMM with mid product part
  my $GL         = $_[19];     # [in] ZMM with lo product part

  die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  if (scalar(@_) == 17) {
    $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
  }

  if ($NUM_BLOCKS == 16) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; L = a0*b0
        vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H
        vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; M2 = a0*b1
        vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
        vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2    # ; M2 = a0*b1
        vpxorq $T1H,$T0H,$T1H
        vpxorq $T1L,$T0L,$T1L
        vpxorq $T1M1,$T0M1,$T1M1
        vpxorq $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 12) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; L = a0*b0
        vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H
        vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; M2 = a0*b1
        vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
        vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
___
  } elsif ($NUM_BLOCKS >= 8) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vpxorq $T1H,$T0H,$T1H
        vpxorq $T1L,$T0L,$T1L
        vpxorq $T1M1,$T0M1,$T1M1
        vpxorq $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 4) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2    # ; M2 = a0*b1
___
  }

  # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
  my $blocks_left = ($NUM_BLOCKS % 4);
  if ($blocks_left > 0) {

    # ;; =====================================================
    # ;; There are 1, 2 or 3 blocks left to process.
    # ;; It may also be that they are the only blocks to process.

    # ;; Set hash key and register index position for the remaining 1 to 3 blocks
    my $reg_idx = ($NUM_BLOCKS / 4);
    my $REG_IN  = $CIPHER_IN[$reg_idx];

    if ($blocks_left == 1) {
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
        vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]}    # ; M1 = a1*b0
        vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]}    # ; M2 = a0*b1
        vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]}     # ; H = a1*b1
        vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]}     # ; L = a0*b0
___
    } elsif ($blocks_left == 2) {
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]}    # ; M1 = a1*b0
        vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]}    # ; M2 = a0*b1
        vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]}     # ; H = a1*b1
        vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]}     # ; L = a0*b0
___
    } else {    # ; blocks_left == 3
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
        vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2    # ; M2 = a0*b1
        vpclmulqdq \$0x11,$HK,$REG_IN,$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$REG_IN,$T0L     # ; L = a0*b0
___
    }

    if (scalar(@_) == 20) {

      # ;; *** GH/GM/GL passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq $T1M1,$T0M1,$T0M1
        vpternlogq \$0x96,$T1M2,$GM,$T0M2
        vpternlogq \$0x96,$T1H,$GH,$T0H
        vpternlogq \$0x96,$T1L,$GL,$T0L
___
      } else {
        $code .= <<___;
        vpxorq $GM,$T0M1,$T0M1
        vpxorq $GH,$T0H,$T0H
        vpxorq $GL,$T0L,$T0L
___
      }
    } else {

      # ;; *** GH/GM/GL NOT passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq $T1M1,$T0M1,$T0M1
        vpxorq $T1M2,$T0M2,$T0M2
        vpxorq $T1H,$T0H,$T0H
        vpxorq $T1L,$T0L,$T0L
___
      }
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq $T0M2,$T0M1,$T0M1
        vpsrldq \$8,$T0M1,$T1M1
        vpslldq \$8,$T0M1,$T1M2
        vpxorq $T1M1,$T0H,$T0H
        vpxorq $T1M2,$T0L,$T0L
___
  } else {

    # ;; =====================================================
    # ;; number of blocks is 4, 8, 12 or 16
    # ;; T1H/L/M1/M2 hold the product sums (not T0H/L/M1/M2)
    if (scalar(@_) == 20) {
      $code .= <<___;
        # ;; *** GH/GM/GL passed as arguments
        vpxorq $GM,$T1M1,$T1M1
        vpxorq $GH,$T1H,$T1H
        vpxorq $GL,$T1L,$T1L
___
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq $T1M2,$T1M1,$T1M1
        vpsrldq \$8,$T1M1,$T0M1
        vpslldq \$8,$T1M1,$T0M2
        vpxorq $T0M1,$T1H,$T0H
        vpxorq $T0M2,$T1L,$T0L
___
  }

  # ;; add TH and TL 128-bit words horizontally
  &VHPXORI4x128($T0H, $T1M1);
  &VHPXORI4x128($T0L, $T1M2);

  # ;; reduction
  $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n";
  &VCLMUL_REDUCE(
    @{[XWORD($GHASH)]},
    @{[XWORD($HK)]},
    @{[XWORD($T0H)]},
    @{[XWORD($T0L)]},
    @{[XWORD($T0M1)]},
    @{[XWORD($T0M2)]});
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 + x^121 + 1)
# ;; Input: A and B (128-bits each, bit-reflected)
# ;; Output: C = A*B*x mod poly, (i.e. >>1 )
# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
# ;;
# ;; Refer to [3] for more details.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GHASH_MUL {
  my $GH = $_[0];    #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
  my $HK = $_[1];    #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
  my $T1 = $_[2];    #; [clobbered] xmm/ymm/zmm
  my $T2 = $_[3];    #; [clobbered] xmm/ymm/zmm
  my $T3 = $_[4];    #; [clobbered] xmm/ymm/zmm

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpclmulqdq \$0x11,$HK,$GH,$T1    # ; $T1 = a1*b1
        vpclmulqdq \$0x00,$HK,$GH,$T2    # ; $T2 = a0*b0
        vpclmulqdq \$0x01,$HK,$GH,$T3    # ; $T3 = a1*b0
        vpclmulqdq \$0x10,$HK,$GH,$GH    # ; $GH = a0*b1
        vpxorq $T3,$GH,$GH

        vpsrldq \$8,$GH,$T3    # ; shift-R $GH 2 DWs
        vpslldq \$8,$GH,$GH    # ; shift-L $GH 2 DWs
        vpxorq $T3,$T1,$T1
        vpxorq $T2,$GH,$GH

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;first phase of the reduction
        vmovdqu64 POLY2(%rip),$T3

        vpclmulqdq \$0x01,$GH,$T3,$T2
        vpslldq \$8,$T2,$T2    # ; shift-L $T2 2 DWs
        vpxorq $T2,$GH,$GH     # ; first phase of the reduction complete

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;second phase of the reduction
        vpclmulqdq \$0x00,$GH,$T3,$T2
        vpsrldq \$4,$T2,$T2    # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$GH,$T3,$GH
        vpslldq \$4,$GH,$GH    # ; Shift-L 1-DW to obtain result with no shifts
        # ; second phase of the reduction complete, the result is in $GH
        vpternlogq \$0x96,$T2,$T1,$GH    # ; GH = GH xor T1 xor T2
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}
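# ;; The same sub serves 1, 2 or 4 independent 128-bit multiplies depending
# ;; on whether xmm, ymm or zmm arguments are passed; PRECOMPUTE below uses
# ;; this to derive two (ymm) and then four (zmm) hash key powers per call.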

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; PRECOMPUTE computes HashKey_i
sub PRECOMPUTE {
  my $GCM128_CTX = $_[0];    #; [in/out] context pointer, hkeys content updated
  my $HK         = $_[1];    #; [in] xmm, hash key
  my $T1         = $_[2];    #; [clobbered] xmm
  my $T2         = $_[3];    #; [clobbered] xmm
  my $T3         = $_[4];    #; [clobbered] xmm
  my $T4         = $_[5];    #; [clobbered] xmm
  my $T5         = $_[6];    #; [clobbered] xmm
  my $T6         = $_[7];    #; [clobbered] xmm

  my $ZT1 = &ZWORD($T1);
  my $ZT2 = &ZWORD($T2);
  my $ZT3 = &ZWORD($T3);
  my $ZT4 = &ZWORD($T4);
  my $ZT5 = &ZWORD($T5);
  my $ZT6 = &ZWORD($T6);

  my $YT1 = &YWORD($T1);
  my $YT2 = &YWORD($T2);
  my $YT3 = &YWORD($T3);
  my $YT4 = &YWORD($T4);
  my $YT5 = &YWORD($T5);
  my $YT6 = &YWORD($T6);

  $code .= <<___;
        vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
        vmovdqa $YT5,$YT4
___

  # ;; calculate HashKey^2<<1 mod poly
  &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);

  $code .= <<___;
        vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
        vinserti64x2 \$1,$HK,$YT4,$YT5
        vmovdqa64 $YT5,$YT6    # ;; YT6 = HashKey | HashKey^2
___

  # ;; use 2x128-bit computation
  # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
  &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3);    # ;; YT5 = HashKey^3 | HashKey^4

  $code .= <<___;
        vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}

        vinserti64x4 \$1,$YT6,$ZT5,$ZT5    # ;; ZT5 = YT6 | YT5

        # ;; switch to 4x128-bit computations now
        vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4    # ;; broadcast HashKey^4 across all ZT4
        vmovdqa64 $ZT5,$ZT6                 # ;; save HashKey^4 to HashKey^1 in ZT6
___

  # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= <<___;
        vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]}    # ;; HashKey^8 to HashKey^5 in ZT5 now
        vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4                   # ;; broadcast HashKey^8 across all ZT4
___

  # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
  # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution

  # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
  &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";

  # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";

  # ; HKeys 17..48 are precomputed elsewhere, as the context can hold only 16 hkeys
}
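# ; After PRECOMPUTE the context Htable holds HashKey^16..HashKey^1 with the
# ; highest power at Htable+0 (see HashKeyOffsetByIdx above).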

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; READ_SMALL_DATA_INPUT
# ;; Packs an xmm register with data when the data input is less than or equal to 16 bytes
# ;; Returns 0 if data has length 0
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub READ_SMALL_DATA_INPUT {
  my $OUTPUT = $_[0];    # [out] xmm register
  my $INPUT  = $_[1];    # [in] buffer pointer to read from
  my $LENGTH = $_[2];    # [in] number of bytes to read
  my $TMP1   = $_[3];    # [clobbered]
  my $TMP2   = $_[4];    # [clobbered]
  my $MASK   = $_[5];    # [out] k1 to k7 register to store the partial block mask

  $code .= <<___;
        mov \$16,@{[DWORD($TMP2)]}
        lea byte_len_to_mask_table(%rip),$TMP1
        cmp $TMP2,$LENGTH
        cmovc $LENGTH,$TMP2
___
  if ($win64) {
    $code .= <<___;
        add $TMP2,$TMP1
        add $TMP2,$TMP1
        kmovw ($TMP1),$MASK
___
  } else {
    $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n";
  }
  $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n";
}
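# ;; E.g. for $LENGTH = 5 the byte_len_to_mask_table lookup is expected to
# ;; yield $MASK = 0x001f, so only the low 5 bytes at ($INPUT) are read and
# ;; the rest of $OUTPUT is zeroed by the {z} masking.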

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
# Output: The hash of the data (AAD_HASH).
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1381sub CALC_AAD_HASH {
1382 my $A_IN = $_[0]; # [in] AAD text pointer
1383 my $A_LEN = $_[1]; # [in] AAD length
1384 my $AAD_HASH = $_[2]; # [in/out] xmm ghash value
1385 my $GCM128_CTX = $_[3]; # [in] pointer to context
1386 my $ZT0 = $_[4]; # [clobbered] ZMM register
1387 my $ZT1 = $_[5]; # [clobbered] ZMM register
1388 my $ZT2 = $_[6]; # [clobbered] ZMM register
1389 my $ZT3 = $_[7]; # [clobbered] ZMM register
1390 my $ZT4 = $_[8]; # [clobbered] ZMM register
1391 my $ZT5 = $_[9]; # [clobbered] ZMM register
1392 my $ZT6 = $_[10]; # [clobbered] ZMM register
1393 my $ZT7 = $_[11]; # [clobbered] ZMM register
1394 my $ZT8 = $_[12]; # [clobbered] ZMM register
1395 my $ZT9 = $_[13]; # [clobbered] ZMM register
1396 my $ZT10 = $_[14]; # [clobbered] ZMM register
1397 my $ZT11 = $_[15]; # [clobbered] ZMM register
1398 my $ZT12 = $_[16]; # [clobbered] ZMM register
1399 my $ZT13 = $_[17]; # [clobbered] ZMM register
1400 my $ZT14 = $_[18]; # [clobbered] ZMM register
1401 my $ZT15 = $_[19]; # [clobbered] ZMM register
1402 my $ZT16 = $_[20]; # [clobbered] ZMM register
1403 my $T1 = $_[21]; # [clobbered] GP register
1404 my $T2 = $_[22]; # [clobbered] GP register
1405 my $T3 = $_[23]; # [clobbered] GP register
1406 my $MASKREG = $_[24]; # [clobbered] mask register
1407
1408 my $HKEYS_READY = "%rbx";
1409
1410 my $SHFMSK = $ZT13;
1411
1412 my $rndsuffix = &random_string();
1413
1414 $code .= <<___;
1415 mov $A_IN,$T1 # ; T1 = AAD
1416 mov $A_LEN,$T2 # ; T2 = aadLen
1417 or $T2,$T2
1418 jz .L_CALC_AAD_done_${rndsuffix}
1419
1420 xor $HKEYS_READY,$HKEYS_READY
1421 vmovdqa64 SHUF_MASK(%rip),$SHFMSK
1422
1423.L_get_AAD_loop48x16_${rndsuffix}:
1424 cmp \$`(48*16)`,$T2
1425 jl .L_exit_AAD_loop48x16_${rndsuffix}
1426___
1427
1428 $code .= <<___;
1429 vmovdqu64 `64*0`($T1),$ZT1 # ; Blocks 0-3
1430 vmovdqu64 `64*1`($T1),$ZT2 # ; Blocks 4-7
1431 vmovdqu64 `64*2`($T1),$ZT3 # ; Blocks 8-11
1432 vmovdqu64 `64*3`($T1),$ZT4 # ; Blocks 12-15
1433 vpshufb $SHFMSK,$ZT1,$ZT1
1434 vpshufb $SHFMSK,$ZT2,$ZT2
1435 vpshufb $SHFMSK,$ZT3,$ZT3
1436 vpshufb $SHFMSK,$ZT4,$ZT4
1437___
1438
1439 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
1440 $code .= "mov \$1,$HKEYS_READY\n";
1441
1442 &GHASH_16(
1443 "start", $ZT5, $ZT6, $ZT7,
1444 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1445 &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
1446 $ZT8, $ZT9, $ZT10, $ZT11,
1447 $ZT12, $ZT14, $ZT15, $ZT16,
1448 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1449 $ZT4);
1450
1451 $code .= <<___;
1452 vmovdqu64 `16*16 + 64*0`($T1),$ZT1 # ; Blocks 16-19
1453 vmovdqu64 `16*16 + 64*1`($T1),$ZT2 # ; Blocks 20-23
1454 vmovdqu64 `16*16 + 64*2`($T1),$ZT3 # ; Blocks 24-27
1455 vmovdqu64 `16*16 + 64*3`($T1),$ZT4 # ; Blocks 28-31
1456 vpshufb $SHFMSK,$ZT1,$ZT1
1457 vpshufb $SHFMSK,$ZT2,$ZT2
1458 vpshufb $SHFMSK,$ZT3,$ZT3
1459 vpshufb $SHFMSK,$ZT4,$ZT4
1460___
1461
1462 &GHASH_16(
1463 "mid", $ZT5, $ZT6, $ZT7,
1464 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1465 &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
1466 $ZT8, $ZT9, $ZT10, $ZT11,
1467 $ZT12, $ZT14, $ZT15, $ZT16,
1468 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1469 $ZT4);
1470
1471 $code .= <<___;
1472 vmovdqu64 `32*16 + 64*0`($T1),$ZT1 # ; Blocks 32-35
1473 vmovdqu64 `32*16 + 64*1`($T1),$ZT2 # ; Blocks 36-39
1474 vmovdqu64 `32*16 + 64*2`($T1),$ZT3 # ; Blocks 40-43
1475 vmovdqu64 `32*16 + 64*3`($T1),$ZT4 # ; Blocks 44-47
1476 vpshufb $SHFMSK,$ZT1,$ZT1
1477 vpshufb $SHFMSK,$ZT2,$ZT2
1478 vpshufb $SHFMSK,$ZT3,$ZT3
1479 vpshufb $SHFMSK,$ZT4,$ZT4
1480___
1481
1482 &GHASH_16(
1483 "end_reduce", $ZT5, $ZT6, $ZT7,
1484 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1485 &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1486 $ZT8, $ZT9, $ZT10, $ZT11,
1487 $ZT12, $ZT14, $ZT15, $ZT16,
1488 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1489 $ZT4);
1490
1491 $code .= <<___;
1492 sub \$`(48*16)`,$T2
1493 je .L_CALC_AAD_done_${rndsuffix}
1494
1495 add \$`(48*16)`,$T1
1496 jmp .L_get_AAD_loop48x16_${rndsuffix}
1497
1498.L_exit_AAD_loop48x16_${rndsuffix}:
1499 # ; Less than 48x16 bytes remaining
1500 cmp \$`(32*16)`,$T2
1501 jl .L_less_than_32x16_${rndsuffix}
1502___
1503
1504 $code .= <<___;
1505 # ; Get next 16 blocks
1506 vmovdqu64 `64*0`($T1),$ZT1
1507 vmovdqu64 `64*1`($T1),$ZT2
1508 vmovdqu64 `64*2`($T1),$ZT3
1509 vmovdqu64 `64*3`($T1),$ZT4
1510 vpshufb $SHFMSK,$ZT1,$ZT1
1511 vpshufb $SHFMSK,$ZT2,$ZT2
1512 vpshufb $SHFMSK,$ZT3,$ZT3
1513 vpshufb $SHFMSK,$ZT4,$ZT4
1514___
1515
1516 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
1517 $code .= "mov \$1,$HKEYS_READY\n";
1518
1519 &GHASH_16(
1520 "start", $ZT5, $ZT6, $ZT7,
1521 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1522 &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1523 $ZT8, $ZT9, $ZT10, $ZT11,
1524 $ZT12, $ZT14, $ZT15, $ZT16,
1525 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1526 $ZT4);
1527
1528 $code .= <<___;
1529 vmovdqu64 `16*16 + 64*0`($T1),$ZT1
1530 vmovdqu64 `16*16 + 64*1`($T1),$ZT2
1531 vmovdqu64 `16*16 + 64*2`($T1),$ZT3
1532 vmovdqu64 `16*16 + 64*3`($T1),$ZT4
1533 vpshufb $SHFMSK,$ZT1,$ZT1
1534 vpshufb $SHFMSK,$ZT2,$ZT2
1535 vpshufb $SHFMSK,$ZT3,$ZT3
1536 vpshufb $SHFMSK,$ZT4,$ZT4
1537___
1538
1539 &GHASH_16(
1540 "end_reduce", $ZT5, $ZT6, $ZT7,
1541 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1542 &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1543 $ZT8, $ZT9, $ZT10, $ZT11,
1544 $ZT12, $ZT14, $ZT15, $ZT16,
1545 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1546 $ZT4);
1547
1548 $code .= <<___;
1549 sub \$`(32*16)`,$T2
1550 je .L_CALC_AAD_done_${rndsuffix}
1551
1552 add \$`(32*16)`,$T1
1553 jmp .L_less_than_16x16_${rndsuffix}
1554
1555.L_less_than_32x16_${rndsuffix}:
1556 cmp \$`(16*16)`,$T2
1557 jl .L_less_than_16x16_${rndsuffix}
1558 # ; Get next 16 blocks
1559 vmovdqu64 `64*0`($T1),$ZT1
1560 vmovdqu64 `64*1`($T1),$ZT2
1561 vmovdqu64 `64*2`($T1),$ZT3
1562 vmovdqu64 `64*3`($T1),$ZT4
1563 vpshufb $SHFMSK,$ZT1,$ZT1
1564 vpshufb $SHFMSK,$ZT2,$ZT2
1565 vpshufb $SHFMSK,$ZT3,$ZT3
1566 vpshufb $SHFMSK,$ZT4,$ZT4
1567___
1568
1569 # ; This code path does not use more than 16 hkeys, so they can be taken from the context
1570 # ; (not from the stack storage)
1571 &GHASH_16(
1572 "start_reduce", $ZT5, $ZT6, $ZT7,
1573 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
1574 &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
1575 $ZT8, $ZT9, $ZT10, $ZT11,
1576 $ZT12, $ZT14, $ZT15, $ZT16,
1577 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1578 $ZT4);
1579
1580 $code .= <<___;
1581 sub \$`(16*16)`,$T2
1582 je .L_CALC_AAD_done_${rndsuffix}
1583
1584 add \$`(16*16)`,$T1
1585 # ; Less than 16x16 bytes remaining
1586.L_less_than_16x16_${rndsuffix}:
1587 # ;; prep mask source address
1588 lea byte64_len_to_mask_table(%rip),$T3
1589 lea ($T3,$T2,8),$T3
1590
1591 # ;; calculate number of blocks to ghash (including partial bytes)
1592 add \$15,@{[DWORD($T2)]}
1593 shr \$4,@{[DWORD($T2)]}
1594 cmp \$2,@{[DWORD($T2)]}
1595 jb .L_AAD_blocks_1_${rndsuffix}
1596 je .L_AAD_blocks_2_${rndsuffix}
1597 cmp \$4,@{[DWORD($T2)]}
1598 jb .L_AAD_blocks_3_${rndsuffix}
1599 je .L_AAD_blocks_4_${rndsuffix}
1600 cmp \$6,@{[DWORD($T2)]}
1601 jb .L_AAD_blocks_5_${rndsuffix}
1602 je .L_AAD_blocks_6_${rndsuffix}
1603 cmp \$8,@{[DWORD($T2)]}
1604 jb .L_AAD_blocks_7_${rndsuffix}
1605 je .L_AAD_blocks_8_${rndsuffix}
1606 cmp \$10,@{[DWORD($T2)]}
1607 jb .L_AAD_blocks_9_${rndsuffix}
1608 je .L_AAD_blocks_10_${rndsuffix}
1609 cmp \$12,@{[DWORD($T2)]}
1610 jb .L_AAD_blocks_11_${rndsuffix}
1611 je .L_AAD_blocks_12_${rndsuffix}
1612 cmp \$14,@{[DWORD($T2)]}
1613 jb .L_AAD_blocks_13_${rndsuffix}
1614 je .L_AAD_blocks_14_${rndsuffix}
1615 cmp \$15,@{[DWORD($T2)]}
1616 je .L_AAD_blocks_15_${rndsuffix}
1617___
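  # ;; Note: the compare ladder above settles two block counts per cmp
  # ;; (jb and je branches); the 16 block case needs no compare and is
  # ;; the fall-through below.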
1618
1619 # ;; fall through for 16 blocks
1620
1621 # ;; The flow of each of these cases is identical:
  # ;; - load plaintext blocks
  # ;; - shuffle loaded blocks
  # ;; - xor the current hash value into block 0
  # ;; - perform GHASH multiplications with the hash keys
1626 # ;; - jump to reduction code
1627
1628 for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
1629 $code .= ".L_AAD_blocks_${aad_blocks}_${rndsuffix}:\n";
1630 if ($aad_blocks > 12) {
1631 $code .= "sub \$`12*16*8`, $T3\n";
1632 } elsif ($aad_blocks > 8) {
1633 $code .= "sub \$`8*16*8`, $T3\n";
1634 } elsif ($aad_blocks > 4) {
1635 $code .= "sub \$`4*16*8`, $T3\n";
1636 }
1637 $code .= "kmovq ($T3),$MASKREG\n";
1638
1639 &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);
1640
1641 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
1642 $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
1643
1644 &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
1645 $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);
1646
1647 if ($aad_blocks > 1) {
1648
1649 # ;; fall through to CALC_AAD_done in 1 block case
1650 $code .= "jmp .L_CALC_AAD_done_${rndsuffix}\n";
1651 }
1652
1653 }
1654 $code .= ".L_CALC_AAD_done_${rndsuffix}:\n";
1655
1656 # ;; result in AAD_HASH
1657}
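# ;; Note on the AAD hashing above: GHASH is defined by
# ;;   X_i = (X_(i-1) xor A_i) * H   in GF(2^128)
# ;; which unrolls to
# ;;   X_n = A_1*H^n xor A_2*H^(n-1) xor ... xor A_n*H
# ;; so a batch of up to 48 blocks can be multiplied by precomputed powers
# ;; of H independently and summed, with a single reduction at the end.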
1658
1659# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1660# ;; PARTIAL_BLOCK
1661# ;; Handles encryption/decryption and the tag partial blocks between
1662# ;; update calls.
1663# ;; Requires the input data be at least 1 byte long.
1664# ;; Output:
1665# ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
1666# ;; AAD_HASH and updated GCM128_CTX
1667# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1668sub PARTIAL_BLOCK {
1669 my $GCM128_CTX = $_[0]; # [in] key pointer
1670 my $PBLOCK_LEN = $_[1]; # [in] partial block length
1671 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
1672 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
1673 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
1674 my $DATA_OFFSET = $_[5]; # [out] data offset (gets set)
1675 my $AAD_HASH = $_[6]; # [out] updated GHASH value
1676 my $ENC_DEC = $_[7]; # [in] cipher direction
1677 my $GPTMP0 = $_[8]; # [clobbered] GP temporary register
1678 my $GPTMP1 = $_[9]; # [clobbered] GP temporary register
1679 my $GPTMP2 = $_[10]; # [clobbered] GP temporary register
1680 my $ZTMP0 = $_[11]; # [clobbered] ZMM temporary register
1681 my $ZTMP1 = $_[12]; # [clobbered] ZMM temporary register
1682 my $ZTMP2 = $_[13]; # [clobbered] ZMM temporary register
1683 my $ZTMP3 = $_[14]; # [clobbered] ZMM temporary register
1684 my $ZTMP4 = $_[15]; # [clobbered] ZMM temporary register
1685 my $ZTMP5 = $_[16]; # [clobbered] ZMM temporary register
1686 my $ZTMP6 = $_[17]; # [clobbered] ZMM temporary register
1687 my $ZTMP7 = $_[18]; # [clobbered] ZMM temporary register
1688 my $MASKREG = $_[19]; # [clobbered] mask temporary register
1689
1690 my $XTMP0 = &XWORD($ZTMP0);
1691 my $XTMP1 = &XWORD($ZTMP1);
1692 my $XTMP2 = &XWORD($ZTMP2);
1693 my $XTMP3 = &XWORD($ZTMP3);
1694 my $XTMP4 = &XWORD($ZTMP4);
1695 my $XTMP5 = &XWORD($ZTMP5);
1696 my $XTMP6 = &XWORD($ZTMP6);
1697 my $XTMP7 = &XWORD($ZTMP7);
1698
1699 my $LENGTH = $DATA_OFFSET;
1700 my $IA0 = $GPTMP1;
1701 my $IA1 = $GPTMP2;
1702 my $IA2 = $GPTMP0;
1703
1704 my $rndsuffix = &random_string();
1705
1706 $code .= <<___;
1707 # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero
1708 mov ($PBLOCK_LEN),$LENGTH
1709 or $LENGTH,$LENGTH
 je .L_partial_block_done_${rndsuffix} # ; leave macro if no partial block present
1711___
1712
1713 &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);
1714
1715 $code .= <<___;
 # ;; XTMP1 = partial block encryption key saved in the context (PEncBlock)
1717 vmovdqu64 $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1
1718 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2
1719
1720 # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes
 # ;; ((16 - $LENGTH) is the number of bytes in plaintext mod 16)
1722 lea SHIFT_MASK(%rip),$IA0
1723 add $LENGTH,$IA0
1724 vmovdqu64 ($IA0),$XTMP3 # ; shift right shuffle mask
1725 vpshufb $XTMP3,$XTMP1,$XTMP1
1726___
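  # ;; Note: loading the shuffle mask at SHIFT_MASK + $LENGTH selects a
  # ;; vpshufb control that, in effect, shifts the saved encrypted counter
  # ;; block right by $LENGTH bytes, so the still-unused keystream bytes
  # ;; line up with the start of the new input data.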
1727
1728 if ($ENC_DEC eq "DEC") {
1729 $code .= <<___;
1730 # ;; keep copy of cipher text in $XTMP4
1731 vmovdqa64 $XTMP0,$XTMP4
1732___
1733 }
1734 $code .= <<___;
1735 vpxorq $XTMP0,$XTMP1,$XTMP1 # ; Ciphertext XOR E(K, Yn)
1736 # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block
1737 # ;; Determine if partial block is not being filled and shift mask accordingly
1738___
1739 if ($win64) {
1740 $code .= <<___;
1741 mov $PLAIN_CIPH_LEN,$IA1
1742 add $LENGTH,$IA1
1743___
1744 } else {
1745 $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n";
1746 }
1747 $code .= <<___;
1748 sub \$16,$IA1
1749 jge .L_no_extra_mask_${rndsuffix}
1750 sub $IA1,$IA0
1751.L_no_extra_mask_${rndsuffix}:
 # ;; get the appropriate mask to mask out the bottom $LENGTH bytes of $XTMP1
 # ;; - the +16 displacement below skips over the shift mask itself
 # ;; (sizeof(SHIFT_MASK) == 16 bytes)
1755 vmovdqu64 16($IA0),$XTMP0
1756 vpand $XTMP0,$XTMP1,$XTMP1
1757___
1758
1759 if ($ENC_DEC eq "DEC") {
1760 $code .= <<___;
1761 vpand $XTMP0,$XTMP4,$XTMP4
1762 vpshufb SHUF_MASK(%rip),$XTMP4,$XTMP4
1763 vpshufb $XTMP3,$XTMP4,$XTMP4
1764 vpxorq $XTMP4,$AAD_HASH,$AAD_HASH
1765___
1766 } else {
1767 $code .= <<___;
1768 vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1
1769 vpshufb $XTMP3,$XTMP1,$XTMP1
1770 vpxorq $XTMP1,$AAD_HASH,$AAD_HASH
1771___
1772 }
1773 $code .= <<___;
1774 cmp \$0,$IA1
1775 jl .L_partial_incomplete_${rndsuffix}
1776___
1777
1778 # ;; GHASH computation for the last <16 Byte block
1779 &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);
1780
1781 $code .= <<___;
1782 movq \$0, ($PBLOCK_LEN)
1783 # ;; Set $LENGTH to be the number of bytes to write out
1784 mov $LENGTH,$IA0
1785 mov \$16,$LENGTH
1786 sub $IA0,$LENGTH
1787 jmp .L_enc_dec_done_${rndsuffix}
1788
1789.L_partial_incomplete_${rndsuffix}:
1790___
1791 if ($win64) {
1792 $code .= <<___;
1793 mov $PLAIN_CIPH_LEN,$IA0
1794 add $IA0,($PBLOCK_LEN)
1795___
1796 } else {
1797 $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n";
1798 }
1799 $code .= <<___;
1800 mov $PLAIN_CIPH_LEN,$LENGTH
1801
1802.L_enc_dec_done_${rndsuffix}:
1803 # ;; output encrypted Bytes
1804
1805 lea byte_len_to_mask_table(%rip),$IA0
1806 kmovw ($IA0,$LENGTH,2),$MASKREG
1807 vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)
1808___
1809
1810 if ($ENC_DEC eq "ENC") {
1811 $code .= <<___;
1812 # ;; shuffle XTMP1 back to output as ciphertext
1813 vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1
1814 vpshufb $XTMP3,$XTMP1,$XTMP1
1815___
1816 }
1817 $code .= <<___;
1818 mov $CIPH_PLAIN_OUT,$IA0
1819 vmovdqu8 $XTMP1,($IA0){$MASKREG}
1820.L_partial_block_done_${rndsuffix}:
1821___
1822}
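# ;; A rough sketch of the partial block handling above (pseudocode, not
# ;; emitted code):
# ;;   len = ctx->pblocklen;  if (len == 0) return;
# ;;   ks  = ctx->penc_block shifted right by len    # unused keystream bytes
# ;;   out = in xor ks, masked to the valid bytes
# ;;   hash ^= masked ciphertext bytes (shuffled)
# ;;   if (len + inlen >= 16) { ghash_mul(hash, H); ctx->pblocklen = 0; }
# ;;   else                   { ctx->pblocklen += inlen; }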
1823
1824# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1825# ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation
1826sub INITIAL_BLOCKS_PARTIAL_CIPHER {
1827 my $AES_KEYS = $_[0]; # [in] key pointer
1828 my $GCM128_CTX = $_[1]; # [in] context pointer
1829 my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer
1830 my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer
1831 my $LENGTH = $_[4]; # [in/clobbered] length in bytes
1832 my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated)
1833 my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
1834 my $CTR = $_[7]; # [in/out] current counter value
1835 my $ENC_DEC = $_[8]; # [in] cipher direction (ENC/DEC)
1836 my $DAT0 = $_[9]; # [out] ZMM with cipher text shuffled for GHASH
1837 my $DAT1 = $_[10]; # [out] ZMM with cipher text shuffled for GHASH
1838 my $DAT2 = $_[11]; # [out] ZMM with cipher text shuffled for GHASH
1839 my $DAT3 = $_[12]; # [out] ZMM with cipher text shuffled for GHASH
1840 my $LAST_CIPHER_BLK = $_[13]; # [out] XMM to put ciphered counter block partially xor'ed with text
1841 my $LAST_GHASH_BLK = $_[14]; # [out] XMM to put last cipher text block shuffled for GHASH
1842 my $CTR0 = $_[15]; # [clobbered] ZMM temporary
1843 my $CTR1 = $_[16]; # [clobbered] ZMM temporary
1844 my $CTR2 = $_[17]; # [clobbered] ZMM temporary
1845 my $CTR3 = $_[18]; # [clobbered] ZMM temporary
1846 my $ZT1 = $_[19]; # [clobbered] ZMM temporary
1847 my $IA0 = $_[20]; # [clobbered] GP temporary
1848 my $IA1 = $_[21]; # [clobbered] GP temporary
1849 my $MASKREG = $_[22]; # [clobbered] mask register
1850 my $SHUFMASK = $_[23]; # [out] ZMM loaded with BE/LE shuffle mask
1851
1852 if ($NUM_BLOCKS == 1) {
1853 $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n";
1854 } elsif ($NUM_BLOCKS == 2) {
1855 $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n";
1856 } else {
1857 $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n";
1858 }
1859
1860 # ;; prepare AES counter blocks
1861 if ($NUM_BLOCKS == 1) {
1862 $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n";
1863 } elsif ($NUM_BLOCKS == 2) {
1864 $code .= <<___;
1865 vshufi64x2 \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]}
1866 vpaddd ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]}
1867___
1868 } else {
1869 $code .= <<___;
1870 vshufi64x2 \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]}
1871 vpaddd ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0
1872___
1873 if ($NUM_BLOCKS > 4) {
1874 $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n";
1875 }
1876 if ($NUM_BLOCKS > 8) {
1877 $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n";
1878 }
1879 if ($NUM_BLOCKS > 12) {
1880 $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n";
1881 }
1882 }
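  # ;; Note: with the ddq_add_* tables the counters come out as
  # ;;   CTR0 = {CTR+1..CTR+4},  CTR1 = {CTR+5..CTR+8},
  # ;;   CTR2 = CTR0 + {8,8,8,8},  CTR3 = CTR1 + {8,8,8,8}
  # ;; i.e. up to 16 consecutive counter blocks built from one input counter.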
1883
1884 # ;; get load/store mask
1885 $code .= <<___;
1886 lea byte64_len_to_mask_table(%rip),$IA0
1887 mov $LENGTH,$IA1
1888___
1889 if ($NUM_BLOCKS > 12) {
1890 $code .= "sub \$`3*64`,$IA1\n";
1891 } elsif ($NUM_BLOCKS > 8) {
1892 $code .= "sub \$`2*64`,$IA1\n";
1893 } elsif ($NUM_BLOCKS > 4) {
1894 $code .= "sub \$`1*64`,$IA1\n";
1895 }
1896 $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
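  # ;; Note: byte64_len_to_mask_table appears to hold one 8-byte kmask per
  # ;; residual length 0..64; after subtracting the full 64-byte chunks
  # ;; above, the lookup yields a 64-bit byte mask covering only the valid
  # ;; input bytes of the last ZMM load/store.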
1897
1898 # ;; extract new counter value
1899 # ;; shuffle the counters for AES rounds
1900 if ($NUM_BLOCKS <= 4) {
1901 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n";
1902 } elsif ($NUM_BLOCKS <= 8) {
1903 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n";
1904 } elsif ($NUM_BLOCKS <= 12) {
1905 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n";
1906 } else {
1907 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n";
1908 }
1909 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
1910 $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0,
1911 $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
1912
1913 # ;; load plain/cipher text
1914 &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG);
1915
1916 # ;; AES rounds and XOR with plain/cipher text
1917 foreach my $j (0 .. ($NROUNDS + 1)) {
1918 $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n";
1919 &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j,
1920 $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS);
1921 }
1922
1923 # ;; retrieve the last cipher counter block (partially XOR'ed with text)
1924 # ;; - this is needed for partial block cases
1925 if ($NUM_BLOCKS <= 4) {
1926 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n";
1927 } elsif ($NUM_BLOCKS <= 8) {
1928 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n";
1929 } elsif ($NUM_BLOCKS <= 12) {
1930 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n";
1931 } else {
1932 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n";
1933 }
1934
  # ;; write cipher/plain text back to output
1936 $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
1937 &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG);
1938
1939 # ;; zero bytes outside the mask before hashing
1940 if ($NUM_BLOCKS <= 4) {
1941 $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n";
1942 } elsif ($NUM_BLOCKS <= 8) {
1943 $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n";
1944 } elsif ($NUM_BLOCKS <= 12) {
1945 $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n";
1946 } else {
1947 $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n";
1948 }
1949
1950 # ;; Shuffle the cipher text blocks for hashing part
1951 # ;; ZT5 and ZT6 are expected outputs with blocks for hashing
1952 if ($ENC_DEC eq "DEC") {
1953
1954 # ;; Decrypt case
1955 # ;; - cipher blocks are in ZT5 & ZT6
1956 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
1957 $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0,
1958 $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
1959 } else {
1960
1961 # ;; Encrypt case
1962 # ;; - cipher blocks are in CTR0-CTR3
1963 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
1964 $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0,
1965 $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
1966 }
1967
1968 # ;; Extract the last block for partials and multi_call cases
1969 if ($NUM_BLOCKS <= 4) {
1970 $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n";
1971 } elsif ($NUM_BLOCKS <= 8) {
1972 $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n";
1973 } elsif ($NUM_BLOCKS <= 12) {
1974 $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n";
1975 } else {
1976 $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n";
1977 }
1978
1979}
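# ;; Note: GHASH always runs over the ciphertext, which is why the DEC path
# ;; above shuffles the loaded input blocks (DAT0-DAT3) while the ENC path
# ;; shuffles the freshly encrypted output blocks (CTR0-CTR3).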
1980
1981# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1982# ;; Computes GHASH on 1 to 16 blocks
1983sub INITIAL_BLOCKS_PARTIAL_GHASH {
1984 my $AES_KEYS = $_[0]; # [in] key pointer
1985 my $GCM128_CTX = $_[1]; # [in] context pointer
1986 my $LENGTH = $_[2]; # [in/clobbered] length in bytes
1987 my $NUM_BLOCKS = $_[3]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
1988 my $HASH_IN_OUT = $_[4]; # [in/out] XMM ghash in/out value
1989 my $ENC_DEC = $_[5]; # [in] cipher direction (ENC/DEC)
1990 my $DAT0 = $_[6]; # [in] ZMM with cipher text shuffled for GHASH
1991 my $DAT1 = $_[7]; # [in] ZMM with cipher text shuffled for GHASH
1992 my $DAT2 = $_[8]; # [in] ZMM with cipher text shuffled for GHASH
1993 my $DAT3 = $_[9]; # [in] ZMM with cipher text shuffled for GHASH
1994 my $LAST_CIPHER_BLK = $_[10]; # [in] XMM with ciphered counter block partially xor'ed with text
1995 my $LAST_GHASH_BLK = $_[11]; # [in] XMM with last cipher text block shuffled for GHASH
1996 my $ZT0 = $_[12]; # [clobbered] ZMM temporary
1997 my $ZT1 = $_[13]; # [clobbered] ZMM temporary
1998 my $ZT2 = $_[14]; # [clobbered] ZMM temporary
1999 my $ZT3 = $_[15]; # [clobbered] ZMM temporary
2000 my $ZT4 = $_[16]; # [clobbered] ZMM temporary
2001 my $ZT5 = $_[17]; # [clobbered] ZMM temporary
2002 my $ZT6 = $_[18]; # [clobbered] ZMM temporary
2003 my $ZT7 = $_[19]; # [clobbered] ZMM temporary
2004 my $ZT8 = $_[20]; # [clobbered] ZMM temporary
2005 my $PBLOCK_LEN = $_[21]; # [in] partial block length
2006 my $GH = $_[22]; # [in] ZMM with hi product part
  my $GM = $_[23]; # [in] ZMM with mid product part
2008 my $GL = $_[24]; # [in] ZMM with lo product part
2009
2010 my $rndsuffix = &random_string();
2011
2012 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2013 # ;;; - Hash all but the last partial block of data
2014 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2015
2016 # ;; update data offset
2017 if ($NUM_BLOCKS > 1) {
2018
2019 # ;; The final block of data may be <16B
2020 $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n";
2021 }
2022
2023 if ($NUM_BLOCKS < 16) {
2024 $code .= <<___;
  # ;; NOTE: the 'jl' would always be taken for num_initial_blocks = 16, hence the check is skipped in that case.
  # ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
2027 cmp \$16,$LENGTH
2028 jl .L_small_initial_partial_block_${rndsuffix}
2029
2030 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2031 # ;;; Handle a full length final block - encrypt and hash all blocks
2032 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2033
2034 sub \$16,$LENGTH
2035 movq \$0,($PBLOCK_LEN)
2036___
2037
2038 # ;; Hash all of the data
2039 if (scalar(@_) == 22) {
2040
2041 # ;; start GHASH compute
2042 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2043 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS);
2044 } elsif (scalar(@_) == 25) {
2045
2046 # ;; continue GHASH compute
2047 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2048 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL);
2049 }
2050 $code .= "jmp .L_small_initial_compute_done_${rndsuffix}\n";
2051 }
2052
2053 $code .= <<___;
2054.L_small_initial_partial_block_${rndsuffix}:
2055
2056 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2057 # ;;; Handle ghash for a <16B final block
2058 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2059
  # ;; Since this is an init / update / finalize series, we need to hold back
  # ;; the last block when it is less than a full block of data.
2062
2063 mov $LENGTH,($PBLOCK_LEN)
2064 vmovdqu64 $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)
2065___
2066
2067 my $k = ($NUM_BLOCKS - 1);
2068 my $last_block_to_hash = 1;
2069 if (($NUM_BLOCKS > $last_block_to_hash)) {
2070
    # ;; $ZT0-$ZT8 - temporary registers
2072 if (scalar(@_) == 22) {
2073
2074 # ;; start GHASH compute
2075 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2076 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k);
2077 } elsif (scalar(@_) == 25) {
2078
2079 # ;; continue GHASH compute
2080 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2081 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL);
2082 }
2083
2084 # ;; just fall through no jmp needed
2085 } else {
2086
2087 if (scalar(@_) == 25) {
2088 $code .= <<___;
2089 # ;; Reduction is required in this case.
2090 # ;; Integrate GM into GH and GL.
2091 vpsrldq \$8,$GM,$ZT0
2092 vpslldq \$8,$GM,$ZT1
2093 vpxorq $ZT0,$GH,$GH
2094 vpxorq $ZT1,$GL,$GL
2095___
2096
2097 # ;; Add GH and GL 128-bit words horizontally
2098 &VHPXORI4x128($GH, $ZT0);
2099 &VHPXORI4x128($GL, $ZT1);
2100
2101 # ;; 256-bit to 128-bit reduction
2102 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n";
2103 &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
2104 }
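    # ;; Note on GH/GM/GL above: each 128x128 carry-less multiply is kept as
    # ;; three partial sums - hi (a1*b1), mid (a1*b0 xor a0*b1) and lo (a0*b0).
    # ;; Splitting GM and folding it in as
    # ;;   GH ^= GM >> 64,  GL ^= GM << 64
    # ;; leaves a plain 256-bit product (GH:GL) that VCLMUL_REDUCE folds back
    # ;; to 128 bits modulo the GHASH polynomial.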
2105 $code .= <<___;
 # ;; No reduction is needed here: no new hashes are computed
 # ;; because there is only one initial block and it is < 16B in
 # ;; length. We only need to check whether a reduction is needed
 # ;; when initial_blocks == 1 and init/update/final is being used;
 # ;; in that case we may just have a partial block, and that gets
 # ;; hashed in finalize.
2113
2114 # ;; The hash should end up in HASH_IN_OUT.
2115 # ;; The only way we should get here is if there is
2116 # ;; a partial block of data, so xor that into the hash.
2117 vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT
2118 # ;; The result is in $HASH_IN_OUT
2119 jmp .L_after_reduction_${rndsuffix}
2120___
2121 }
2122
2123 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2124 # ;;; After GHASH reduction
2125 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2126
2127 $code .= ".L_small_initial_compute_done_${rndsuffix}:\n";
2128
2129 # ;; If using init/update/finalize, we need to xor any partial block data
2130 # ;; into the hash.
2131 if ($NUM_BLOCKS > 1) {
2132
    # ;; NOTE: for $NUM_BLOCKS = 1 the xor never takes place (that case is handled above)
2134 if ($NUM_BLOCKS != 16) {
2135 $code .= <<___;
 # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH (stored in [PBlockLen]) is never zero
2137 or $LENGTH,$LENGTH
2138 je .L_after_reduction_${rndsuffix}
2139___
2140 }
2141 $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n";
2142 }
2143
2144 $code .= ".L_after_reduction_${rndsuffix}:\n";
2145
2146 # ;; Final hash is now in HASH_IN_OUT
2147}
2148
2149# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2150# ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
2151# ;; It may look similar to INITIAL_BLOCKS but its usage is different:
2152# ;; - first encrypts/decrypts required number of blocks and then
2153# ;; ghashes these blocks
2154# ;; - Small packets or left over data chunks (<256 bytes)
2155# ;; - Remaining data chunks below 256 bytes (multi buffer code)
2156# ;;
2157# ;; num_initial_blocks is expected to include the partial final block
2158# ;; in the count.
2159sub INITIAL_BLOCKS_PARTIAL {
2160 my $AES_KEYS = $_[0]; # [in] key pointer
2161 my $GCM128_CTX = $_[1]; # [in] context pointer
2162 my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer
2163 my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer
2164 my $LENGTH = $_[4]; # [in/clobbered] length in bytes
2165 my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated)
2166 my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
2167 my $CTR = $_[7]; # [in/out] current counter value
2168 my $HASH_IN_OUT = $_[8]; # [in/out] XMM ghash in/out value
2169 my $ENC_DEC = $_[9]; # [in] cipher direction (ENC/DEC)
2170 my $CTR0 = $_[10]; # [clobbered] ZMM temporary
2171 my $CTR1 = $_[11]; # [clobbered] ZMM temporary
2172 my $CTR2 = $_[12]; # [clobbered] ZMM temporary
2173 my $CTR3 = $_[13]; # [clobbered] ZMM temporary
2174 my $DAT0 = $_[14]; # [clobbered] ZMM temporary
2175 my $DAT1 = $_[15]; # [clobbered] ZMM temporary
2176 my $DAT2 = $_[16]; # [clobbered] ZMM temporary
2177 my $DAT3 = $_[17]; # [clobbered] ZMM temporary
2178 my $LAST_CIPHER_BLK = $_[18]; # [clobbered] ZMM temporary
2179 my $LAST_GHASH_BLK = $_[19]; # [clobbered] ZMM temporary
2180 my $ZT0 = $_[20]; # [clobbered] ZMM temporary
2181 my $ZT1 = $_[21]; # [clobbered] ZMM temporary
2182 my $ZT2 = $_[22]; # [clobbered] ZMM temporary
2183 my $ZT3 = $_[23]; # [clobbered] ZMM temporary
2184 my $ZT4 = $_[24]; # [clobbered] ZMM temporary
2185 my $IA0 = $_[25]; # [clobbered] GP temporary
2186 my $IA1 = $_[26]; # [clobbered] GP temporary
2187 my $MASKREG = $_[27]; # [clobbered] mask register
2188 my $SHUFMASK = $_[28]; # [clobbered] ZMM for BE/LE shuffle mask
2189 my $PBLOCK_LEN = $_[29]; # [in] partial block length
2190
2191 &INITIAL_BLOCKS_PARTIAL_CIPHER(
2192 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
2193 $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR,
2194 $ENC_DEC, $DAT0, $DAT1, $DAT2,
2195 $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0,
2196 $CTR1, $CTR2, $CTR3, $ZT0,
2197 $IA0, $IA1, $MASKREG, $SHUFMASK);
2198
2199 &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0,
2200 $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK),
2201 &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN);
2202}
2203
2204# ;; ===========================================================================
2205# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
# ;; followed by GHASH of the N blocks.
2207sub GHASH_16_ENCRYPT_N_GHASH_N {
2208 my $AES_KEYS = $_[0]; # [in] key pointer
2209 my $GCM128_CTX = $_[1]; # [in] context pointer
2210 my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
2211 my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
2212 my $DATA_OFFSET = $_[4]; # [in] data offset
2213 my $LENGTH = $_[5]; # [in] data length
2214 my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
2215 my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
2216 my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
2217 # (can be in form of register or numerical value)
  my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in the stack frame
2219 my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
2220 my $B00_03 = $_[11]; # [clobbered] temporary ZMM
2221 my $B04_07 = $_[12]; # [clobbered] temporary ZMM
2222 my $B08_11 = $_[13]; # [clobbered] temporary ZMM
2223 my $B12_15 = $_[14]; # [clobbered] temporary ZMM
2224 my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM
2225 my $GH1L = $_[16]; # [clobbered] temporary ZMM
2226 my $GH1M = $_[17]; # [clobbered] temporary ZMM
2227 my $GH1T = $_[18]; # [clobbered] temporary ZMM
2228 my $GH2H = $_[19]; # [clobbered] temporary ZMM
2229 my $GH2L = $_[20]; # [clobbered] temporary ZMM
2230 my $GH2M = $_[21]; # [clobbered] temporary ZMM
2231 my $GH2T = $_[22]; # [clobbered] temporary ZMM
2232 my $GH3H = $_[23]; # [clobbered] temporary ZMM
2233 my $GH3L = $_[24]; # [clobbered] temporary ZMM
2234 my $GH3M = $_[25]; # [clobbered] temporary ZMM
2235 my $GH3T = $_[26]; # [clobbered] temporary ZMM
2236 my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM
2237 my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM
2238 my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM
2239 my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM
2240 my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM
2241 my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM
2242 my $ZT01 = $_[33]; # [clobbered] temporary ZMM
  my $ADDBE_4x4 = $_[34]; # [in] ZMM with the value 4 in each of its four 128-bit lanes (big-endian)
  my $ADDBE_1234 = $_[35]; # [in] ZMM with 1, 2, 3 and 4 in its four 128-bit lanes (big-endian)
2245 my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
2246 my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
2247 my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
2248 my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
2249 my $ENC_DEC = $_[40]; # [in] cipher direction
2250 my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
2251 my $IA0 = $_[42]; # [clobbered] GP temporary
2252 my $IA1 = $_[43]; # [clobbered] GP temporary
2253 my $MASKREG = $_[44]; # [clobbered] mask register
2254 my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
2255 my $PBLOCK_LEN = $_[46]; # [in] partial block length
2256
2257 die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 1);
2259
2260 my $rndsuffix = &random_string();
2261
  # ; aliasing GH1H to HASH_IN_OUT avoids an additional move in the do_reduction case
  my $GH1H = $HASH_IN_OUT;

2266 my $LAST_GHASH_BLK = $GH1L;
2267 my $LAST_CIPHER_BLK = $GH1T;
2268
2269 my $RED_POLY = $GH2T;
2270 my $RED_P1 = $GH2L;
2271 my $RED_T1 = $GH2H;
2272 my $RED_T2 = $GH2M;
2273
2274 my $DATA1 = $GH3H;
2275 my $DATA2 = $GH3L;
2276 my $DATA3 = $GH3M;
2277 my $DATA4 = $GH3T;
2278
2279 # ;; do reduction after the 16 blocks ?
2280 my $do_reduction = 0;
2281
2282 # ;; is 16 block chunk a start?
2283 my $is_start = 0;
2284
2285 if ($GHASH_TYPE eq "start_reduce") {
2286 $is_start = 1;
2287 $do_reduction = 1;
2288 }
2289
2290 if ($GHASH_TYPE eq "start") {
2291 $is_start = 1;
2292 }
2293
2294 if ($GHASH_TYPE eq "end_reduce") {
2295 $do_reduction = 1;
2296 }
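  # ;; In short: "start*" begins fresh H/M/L partial sums (folding in the
  # ;; current hash value), "mid" accumulates into existing sums, and the
  # ;; "*_reduce" variants additionally reduce the sums to a single 128-bit
  # ;; hash at the end of the 16 blocks.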
2297
2298 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2299 # ;; - get load/store mask
2300 # ;; - load plain/cipher text
2301 # ;; get load/store mask
2302 $code .= <<___;
2303 lea byte64_len_to_mask_table(%rip),$IA0
2304 mov $LENGTH,$IA1
2305___
2306 if ($NUM_BLOCKS > 12) {
2307 $code .= "sub \$`3*64`,$IA1\n";
2308 } elsif ($NUM_BLOCKS > 8) {
2309 $code .= "sub \$`2*64`,$IA1\n";
2310 } elsif ($NUM_BLOCKS > 4) {
2311 $code .= "sub \$`1*64`,$IA1\n";
2312 }
2313 $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
2314
2315 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2316 # ;; prepare counter blocks
2317
2318 $code .= <<___;
2319 cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
2320 jae .L_16_blocks_overflow_${rndsuffix}
2321___
2322
2323 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2324 $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
2325 $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
2326 $code .= <<___;
2327 jmp .L_16_blocks_ok_${rndsuffix}
2328
2329.L_16_blocks_overflow_${rndsuffix}:
2330 vpshufb $SHFMSK,$CTR_BE,$CTR_BE
2331 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
2332___
2333 if ($NUM_BLOCKS > 4) {
2334 $code .= <<___;
2335 vmovdqa64 ddq_add_4444(%rip),$B12_15
2336 vpaddd $B12_15,$B00_03,$B04_07
2337___
2338 }
2339 if ($NUM_BLOCKS > 8) {
2340 $code .= "vpaddd $B12_15,$B04_07,$B08_11\n";
2341 }
2342 if ($NUM_BLOCKS > 12) {
2343 $code .= "vpaddd $B12_15,$B08_11,$B12_15\n";
2344 }
2345 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2346 $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2347 $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
2348 $code .= <<___;
2349.L_16_blocks_ok_${rndsuffix}:
2350
2351 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2352 # ;; - pre-load constants
2353 # ;; - add current hash into the 1st block
2354 vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1
2355___
2356 if ($is_start != 0) {
2357 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
2358 } else {
2359 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
2360 }
2361
2362 $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";
2363
2364 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2365 # ;; save counter for the next round
2366 # ;; increment counter overflow check register
2367 if ($NUM_BLOCKS <= 4) {
2368 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
2369 } elsif ($NUM_BLOCKS <= 8) {
2370 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
2371 } elsif ($NUM_BLOCKS <= 12) {
2372 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
2373 } else {
2374 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
2375 }
2376 $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";
2377
2378 $code .= <<___;
2379 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2380 # ;; pre-load constants
2381 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
2382 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
2383 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
2384___
2385
2386 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2387 # ;; stitch AES rounds with GHASH
2388
2389 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2390 # ;; AES round 0 - ARK
2391
2392 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2393 $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2394 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2395 $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
2396
2397 $code .= <<___;
2398 # ;;==================================================
2399 # ;; GHASH 4 blocks (15 to 12)
2400 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
2401 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
2402 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
2403 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
2404 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
2405 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
2406___
2407
2408 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2409 # ;; AES round 1
2410 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2411 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2412 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2413 $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
2414
2415 $code .= <<___;
2416 # ;; =================================================
2417 # ;; GHASH 4 blocks (11 to 8)
2418 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
2419 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
2420 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
2421 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
2422 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
2423 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
2424___
2425
2426 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2427 # ;; AES round 2
2428 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2429 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2430 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2431 $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n";
2432
2433 $code .= <<___;
2434 # ;; =================================================
2435 # ;; GHASH 4 blocks (7 to 4)
2436 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
2437 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
2438 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
2439 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
2440___
2441
2442 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 3
2444 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2445 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2446 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2447 $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n";
2448
2449 $code .= <<___;
2450 # ;; =================================================
2451 # ;; Gather (XOR) GHASH for 12 blocks
2452 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
2453 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
2454 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
2455 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
2456___
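  # ;; Note: immediate 0x96 is the three-way XOR (parity) truth table, so
  # ;; each vpternlogq above computes dst = dst xor src1 xor src2 and
  # ;; replaces two vpxorq instructions.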
2457
2458 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 4
2460 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2461 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2462 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2463 $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n";
2464
2465 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2466 # ;; load plain/cipher text
2467 &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);
2468
2469 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 5
2471 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2472 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2473 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2474 $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n";
2475
2476 $code .= <<___;
2477 # ;; =================================================
2478 # ;; GHASH 4 blocks (3 to 0)
2479 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
2480 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
2481 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
2482 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
2483___
2484
2485 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2486 # ;; AES round 6
2487 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2488 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2489 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2490 $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n";
2491
2492 # ;; =================================================
2493 # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
2494 # ;; - add GH2[MTLH] to GH1[MTLH]
2495 $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n";
2496 if ($do_reduction != 0) {
2497
2498 if ($is_start != 0) {
2499 $code .= "vpxorq $GH2M,$GH1M,$GH1M\n";
2500 } else {
2501 $code .= <<___;
2502 vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
2503 vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
2504 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
2505___
2506 }
2507
2508 } else {
2509
2510 # ;; Update H/M/L hash sums if not carrying reduction
2511 if ($is_start != 0) {
2512 $code .= <<___;
2513 vpxorq $GH2H,$GH1H,$TO_REDUCE_H
2514 vpxorq $GH2L,$GH1L,$TO_REDUCE_L
2515 vpxorq $GH2M,$GH1M,$TO_REDUCE_M
2516___
2517 } else {
2518 $code .= <<___;
2519 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
2520 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
2521 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
2522___
2523 }
2524
2525 }
2526
2527 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2528 # ;; AES round 7
2529 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2530 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2531 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2532 $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n";
2533
2534 # ;; =================================================
2535 # ;; prepare mid sum for adding to high & low
2536 # ;; load polynomial constant for reduction
2537 if ($do_reduction != 0) {
2538 $code .= <<___;
2539 vpsrldq \$8,$GH1M,$GH2M
2540 vpslldq \$8,$GH1M,$GH1M
2541
2542 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
2543___
2544 }
2545
2546 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2547 # ;; AES round 8
2548 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2549 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2550 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2551 $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n";
2552
2553 # ;; =================================================
2554 # ;; Add mid product to high and low
2555 if ($do_reduction != 0) {
2556 if ($is_start != 0) {
2557 $code .= <<___;
2558 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
2559 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
2560___
2561 } else {
2562 $code .= <<___;
2563 vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64
2564 vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64
2565___
2566 }
2567 }
2568
2569 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2570 # ;; AES round 9
2571 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2572 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2573 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2574
2575 # ;; =================================================
2576 # ;; horizontal xor of low and high 4x128
2577 if ($do_reduction != 0) {
2578 &VHPXORI4x128($GH1H, $GH2H);
2579 &VHPXORI4x128($GH1L, $GH2L);
2580 }
2581
2582 if (($NROUNDS >= 11)) {
2583 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
2584 }
2585
2586 # ;; =================================================
2587 # ;; first phase of reduction
2588 if ($do_reduction != 0) {
2589 $code .= <<___;
2590 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
2591 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
 vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
2593___
2594 }
2595
2596 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2597 # ;; AES rounds up to 11 (AES192) or 13 (AES256)
2598 # ;; AES128 is done
2599 if (($NROUNDS >= 11)) {
2600 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2601 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2602 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2603 $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n";
2604
2605 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2606 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2607 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2608 if (($NROUNDS == 13)) {
2609 $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n";
2610
2611 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2612 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2613 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2614 $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n";
2615
2616 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2617 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2618 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2619 }
2620 }
2621
2622 # ;; =================================================
2623 # ;; second phase of the reduction
2624 if ($do_reduction != 0) {
2625 $code .= <<___;
2626 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
2627 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
2628 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
2629 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
2630 # ;; GH1H = GH1H + RED_T1 + RED_T2
2631 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
2632___
2633 }
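  # ;; Note on the two phases above: the 256-bit product (GH1H:GH1L) is
  # ;; reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 (in its
  # ;; bit-reflected representation, hence the POLY2 constant): phase one
  # ;; folds the low 128 bits, phase two folds the remainder into GH1H,
  # ;; which ends up holding the reduced hash.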
2634
2635 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2636 # ;; the last AES round
2637 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2638 $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2639 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2640
2641 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2642 # ;; XOR against plain/cipher text
2643 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2644 $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2645 $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4);
2646
2647 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2648 # ;; retrieve the last cipher counter block (partially XOR'ed with text)
2649 # ;; - this is needed for partial block cases
2650 if ($NUM_BLOCKS <= 4) {
2651 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2652 } elsif ($NUM_BLOCKS <= 8) {
2653 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2654 } elsif ($NUM_BLOCKS <= 12) {
2655 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2656 } else {
2657 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2658 }
2659
2660 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2661 # ;; store cipher/plain text
2662 $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
2663 &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);
2664
2665 # ;; =================================================
2666 # ;; shuffle cipher text blocks for GHASH computation
2667 if ($ENC_DEC eq "ENC") {
2668
2669 # ;; zero bytes outside the mask before hashing
2670 if ($NUM_BLOCKS <= 4) {
2671 $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n";
2672 } elsif ($NUM_BLOCKS <= 8) {
2673 $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n";
2674 } elsif ($NUM_BLOCKS <= 12) {
2675 $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n";
2676 } else {
2677 $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n";
2678 }
2679
2680 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2681 $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03,
2682 $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
2683 } else {
2684
2685 # ;; zero bytes outside the mask before hashing
2686 if ($NUM_BLOCKS <= 4) {
2687 $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n";
2688 } elsif ($NUM_BLOCKS <= 8) {
2689 $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n";
2690 } elsif ($NUM_BLOCKS <= 12) {
2691 $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n";
2692 } else {
2693 $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n";
2694 }
2695
2696 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2697 $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1,
2698 $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
2699 }
2700
2701 # ;; =================================================
2702 # ;; Extract the last block for partial / multi_call cases
2703 if ($NUM_BLOCKS <= 4) {
2704 $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
2705 } elsif ($NUM_BLOCKS <= 8) {
2706 $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
2707 } elsif ($NUM_BLOCKS <= 12) {
2708 $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
2709 } else {
2710 $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
2711 }
2712
2713 if ($do_reduction != 0) {
2714
2715 # ;; GH1H holds reduced hash value
2716 # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
2717 # ;; - register rename trick obsoletes the above move
2718 }
2719
2720 # ;; =================================================
2721 # ;; GHASH last N blocks
2722 # ;; - current hash value in HASH_IN_OUT or
2723 # ;; product parts in TO_REDUCE_H/M/L
2724 # ;; - DATA1-DATA4 include blocks for GHASH
2725
2726 if ($do_reduction == 0) {
2727 &INITIAL_BLOCKS_PARTIAL_GHASH(
2728 $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
2729 &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
2730 $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
2731 $B00_03, $B04_07, $B08_11, $B12_15,
2732 $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
2733 $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M,
2734 $TO_REDUCE_L);
2735 } else {
2736 &INITIAL_BLOCKS_PARTIAL_GHASH(
2737 $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
2738 &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
2739 $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
2740 $B00_03, $B04_07, $B08_11, $B12_15,
2741 $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
2742 $GHKEY1, $PBLOCK_LEN);
2743 }
2744}
2745
2746# ;; ===========================================================================
2747# ;; ===========================================================================
# ;; Handles the last (up to 16) blocks of a message: computes the number of
# ;; remaining blocks and dispatches to the matching GHASH_16_ENCRYPT_N_GHASH_N
# ;; variant (or to GHASH-only code when no blocks remain).
2750sub GCM_ENC_DEC_LAST {
2751 my $AES_KEYS = $_[0]; # [in] key pointer
2752 my $GCM128_CTX = $_[1]; # [in] context pointer
2753 my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
2754 my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
2755 my $DATA_OFFSET = $_[4]; # [in] data offset
2756 my $LENGTH = $_[5]; # [in/clobbered] data length
2757 my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
2758 my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
2759 my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
2760 # (can be register or numerical offset)
  my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in the stack frame
2762 my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
2763 my $ZT00 = $_[11]; # [clobbered] temporary ZMM
2764 my $ZT01 = $_[12]; # [clobbered] temporary ZMM
2765 my $ZT02 = $_[13]; # [clobbered] temporary ZMM
2766 my $ZT03 = $_[14]; # [clobbered] temporary ZMM
2767 my $ZT04 = $_[15]; # [clobbered] temporary ZMM
2768 my $ZT05 = $_[16]; # [clobbered] temporary ZMM
2769 my $ZT06 = $_[17]; # [clobbered] temporary ZMM
2770 my $ZT07 = $_[18]; # [clobbered] temporary ZMM
2771 my $ZT08 = $_[19]; # [clobbered] temporary ZMM
2772 my $ZT09 = $_[20]; # [clobbered] temporary ZMM
2773 my $ZT10 = $_[21]; # [clobbered] temporary ZMM
2774 my $ZT11 = $_[22]; # [clobbered] temporary ZMM
2775 my $ZT12 = $_[23]; # [clobbered] temporary ZMM
2776 my $ZT13 = $_[24]; # [clobbered] temporary ZMM
2777 my $ZT14 = $_[25]; # [clobbered] temporary ZMM
2778 my $ZT15 = $_[26]; # [clobbered] temporary ZMM
2779 my $ZT16 = $_[27]; # [clobbered] temporary ZMM
2780 my $ZT17 = $_[28]; # [clobbered] temporary ZMM
2781 my $ZT18 = $_[29]; # [clobbered] temporary ZMM
2782 my $ZT19 = $_[30]; # [clobbered] temporary ZMM
2783 my $ZT20 = $_[31]; # [clobbered] temporary ZMM
2784 my $ZT21 = $_[32]; # [clobbered] temporary ZMM
2785 my $ZT22 = $_[33]; # [clobbered] temporary ZMM
  my $ADDBE_4x4 = $_[34]; # [in] ZMM with the value 4 in each of its four 128-bit lanes (big-endian)
  my $ADDBE_1234 = $_[35]; # [in] ZMM with 1, 2, 3 and 4 in its four 128-bit lanes (big-endian)
2788 my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
2789 my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
2790 my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
2791 my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
2792 my $ENC_DEC = $_[40]; # [in] cipher direction
2793 my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
2794 my $IA0 = $_[42]; # [clobbered] GP temporary
2795 my $IA1 = $_[43]; # [clobbered] GP temporary
2796 my $MASKREG = $_[44]; # [clobbered] mask register
2797 my $PBLOCK_LEN = $_[45]; # [in] partial block length
2798
2799 my $rndsuffix = &random_string();
2800
2801 $code .= <<___;
2802 mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
2803 add \$15,@{[DWORD($IA0)]}
2804 shr \$4,@{[DWORD($IA0)]}
2805 je .L_last_num_blocks_is_0_${rndsuffix}
2806
2807 cmp \$8,@{[DWORD($IA0)]}
2808 je .L_last_num_blocks_is_8_${rndsuffix}
2809 jb .L_last_num_blocks_is_7_1_${rndsuffix}
2810
2811
2812 cmp \$12,@{[DWORD($IA0)]}
2813 je .L_last_num_blocks_is_12_${rndsuffix}
2814 jb .L_last_num_blocks_is_11_9_${rndsuffix}
2815
2816 # ;; 16, 15, 14 or 13
2817 cmp \$15,@{[DWORD($IA0)]}
2818 je .L_last_num_blocks_is_15_${rndsuffix}
2819 ja .L_last_num_blocks_is_16_${rndsuffix}
2820 cmp \$14,@{[DWORD($IA0)]}
2821 je .L_last_num_blocks_is_14_${rndsuffix}
2822 jmp .L_last_num_blocks_is_13_${rndsuffix}
2823
2824.L_last_num_blocks_is_11_9_${rndsuffix}:
2825 # ;; 11, 10 or 9
2826 cmp \$10,@{[DWORD($IA0)]}
2827 je .L_last_num_blocks_is_10_${rndsuffix}
2828 ja .L_last_num_blocks_is_11_${rndsuffix}
2829 jmp .L_last_num_blocks_is_9_${rndsuffix}
2830
2831.L_last_num_blocks_is_7_1_${rndsuffix}:
2832 cmp \$4,@{[DWORD($IA0)]}
2833 je .L_last_num_blocks_is_4_${rndsuffix}
2834 jb .L_last_num_blocks_is_3_1_${rndsuffix}
2835 # ;; 7, 6 or 5
2836 cmp \$6,@{[DWORD($IA0)]}
2837 ja .L_last_num_blocks_is_7_${rndsuffix}
2838 je .L_last_num_blocks_is_6_${rndsuffix}
2839 jmp .L_last_num_blocks_is_5_${rndsuffix}
2840
2841.L_last_num_blocks_is_3_1_${rndsuffix}:
2842 # ;; 3, 2 or 1
2843 cmp \$2,@{[DWORD($IA0)]}
2844 ja .L_last_num_blocks_is_3_${rndsuffix}
2845 je .L_last_num_blocks_is_2_${rndsuffix}
2846___
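  # ;; Note: the branch tree above resolves the remaining block count
  # ;; (1 to 16) in at most four compares - the first cmp splits the range
  # ;; at 8, the next at 12 or 4, and the final compares settle the
  # ;; individual counts.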
2847
2848 # ;; fall through for `jmp .L_last_num_blocks_is_1`
2849
  # ;; Use a loop to generate the different block count variants
  # ;; - the 1-block variant has to come first (it is the fall-through target)
2852 for my $num_blocks (1 .. 16) {
2853 $code .= ".L_last_num_blocks_is_${num_blocks}_${rndsuffix}:\n";
2854 &GHASH_16_ENCRYPT_N_GHASH_N(
2855 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET,
2856 $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET,
2857 $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03,
2858 $ZT04, $ZT05, $ZT06, $ZT07, $ZT08,
2859 $ZT09, $ZT10, $ZT11, $ZT12, $ZT13,
2860 $ZT14, $ZT15, $ZT16, $ZT17, $ZT18,
2861 $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4,
2862 $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M,
2863 $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG,
2864 $num_blocks, $PBLOCK_LEN);
2865
2866 $code .= "jmp .L_last_blocks_done_${rndsuffix}\n";
2867 }
2868
2869 $code .= ".L_last_num_blocks_is_0_${rndsuffix}:\n";
2870
2871 # ;; if there is 0 blocks to cipher then there are only 16 blocks for ghash and reduction
2872 # ;; - convert mid into end_reduce
2873 # ;; - convert start into start_reduce
2874 if ($GHASH_TYPE eq "mid") {
2875 $GHASH_TYPE = "end_reduce";
2876 }
2877 if ($GHASH_TYPE eq "start") {
2878 $GHASH_TYPE = "start_reduce";
2879 }
2880
2881 &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp",
2882 $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01,
2883 $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09);
2884
2885 $code .= ".L_last_blocks_done_${rndsuffix}:\n";
2886}
2887
2888# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2889# ;; Main GCM macro stitching cipher with GHASH
2890# ;; - operates on single stream
2891# ;; - encrypts 16 blocks at a time
# ;; - ghashes the 16 previously encrypted ciphertext blocks
2893# ;; - no partial block or multi_call handling here
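# ;; Interleaving the vaesenc rounds with the GHASH vpclmulqdq work keeps
# ;; independent execution units busy and hides the multiply latency - this
# ;; is the "stitching" referred to above.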
2894sub GHASH_16_ENCRYPT_16_PARALLEL {
2895 my $AES_KEYS = $_[0]; # [in] key pointer
2896 my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer
2897 my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer
2898 my $DATA_OFFSET = $_[3]; # [in] data offset
2899 my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian
2900 my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check
2901 my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value)
2902 my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out
2903 my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in
2904 my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb
2905 my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher)
2906 my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher)
2907 my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher)
2908 my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher)
2909 my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
2910 my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher)
2911 my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher)
2912 my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher)
2913 my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher)
2914 my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash)
2915 my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash)
2916 my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash)
2917 my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash)
2918 my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash)
2919 my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash)
2920 my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash)
2921 my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash)
2922 my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash)
2923 my $ZT19 = $_[28]; # [clobbered] temporary ZMM
2924 my $ZT20 = $_[29]; # [clobbered] temporary ZMM
2925 my $ZT21 = $_[30]; # [clobbered] temporary ZMM
2926 my $ZT22 = $_[31]; # [clobbered] temporary ZMM
2927 my $ZT23 = $_[32]; # [clobbered] temporary ZMM
2928 my $ADDBE_4x4 = $_[33]; # [in] ZMM with 4x128bits 4 in big-endian
2929 my $ADDBE_1234 = $_[34]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
2930 my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum
2931 my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum
2932 my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum
2933 my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time"
2934 my $ENC_DEC = $_[39]; # [in] cipher direction
2935 my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
2936 my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
2937 my $IA0 = $_[42]; # [clobbered] temporary GPR
2938
2939 my $B00_03 = $ZT1;
2940 my $B04_07 = $ZT2;
2941 my $B08_11 = $ZT3;
2942 my $B12_15 = $ZT4;
2943
2944 my $GH1H = $ZT5;
2945
2946 # ; @note: do not change this mapping
2947 my $GH1L = $ZT6;
2948 my $GH1M = $ZT7;
2949 my $GH1T = $ZT8;
2950
2951 my $GH2H = $ZT9;
2952 my $GH2L = $ZT10;
2953 my $GH2M = $ZT11;
2954 my $GH2T = $ZT12;
2955
2956 my $RED_POLY = $GH2T;
2957 my $RED_P1 = $GH2L;
2958 my $RED_T1 = $GH2H;
2959 my $RED_T2 = $GH2M;
2960
2961 my $GH3H = $ZT13;
2962 my $GH3L = $ZT14;
2963 my $GH3M = $ZT15;
2964 my $GH3T = $ZT16;
2965
2966 my $DATA1 = $ZT13;
2967 my $DATA2 = $ZT14;
2968 my $DATA3 = $ZT15;
2969 my $DATA4 = $ZT16;
2970
2971 my $AESKEY1 = $ZT17;
2972 my $AESKEY2 = $ZT18;
2973
2974 my $GHKEY1 = $ZT19;
2975 my $GHKEY2 = $ZT20;
2976 my $GHDAT1 = $ZT21;
2977 my $GHDAT2 = $ZT22;
2978
2979 my $rndsuffix = &random_string();
2980
2981 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2982 # ;; prepare counter blocks
2983
2984 $code .= <<___;
2985 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
2986 jae .L_16_blocks_overflow_${rndsuffix}
2987 vpaddd $ADDBE_1234,$CTR_BE,$B00_03
2988 vpaddd $ADDBE_4x4,$B00_03,$B04_07
2989 vpaddd $ADDBE_4x4,$B04_07,$B08_11
2990 vpaddd $ADDBE_4x4,$B08_11,$B12_15
2991 jmp .L_16_blocks_ok_${rndsuffix}
2992.L_16_blocks_overflow_${rndsuffix}:
2993 vpshufb $SHFMSK,$CTR_BE,$CTR_BE
2994 vmovdqa64 ddq_add_4444(%rip),$B12_15
2995 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
2996 vpaddd $B12_15,$B00_03,$B04_07
2997 vpaddd $B12_15,$B04_07,$B08_11
2998 vpaddd $B12_15,$B08_11,$B12_15
2999 vpshufb $SHFMSK,$B00_03,$B00_03
3000 vpshufb $SHFMSK,$B04_07,$B04_07
3001 vpshufb $SHFMSK,$B08_11,$B08_11
3002 vpshufb $SHFMSK,$B12_15,$B12_15
3003.L_16_blocks_ok_${rndsuffix}:
3004___
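  # ;; Counter blocks are kept byte-reflected (big-endian) so they can feed
  # ;; the cipher without a shuffle.  vpaddd with the ddq_addbe_* constants is
  # ;; only valid while the low counter byte does not wrap, since a carry out
  # ;; of that byte cannot propagate in the reflected layout.  $CTR_CHECK
  # ;; tracks that byte: e.g. a value of 244 (>= 240) means one of the next
  # ;; 16 increments would wrap, so the overflow path above shuffles to
  # ;; little-endian, adds, and shuffles back.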
3005
3006 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3007 # ;; pre-load constants
3008 $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
3009 if ($GHASH_IN ne "no_ghash_in") {
3010 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
3011 } else {
3012 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
3013 }
3014
3015 $code .= <<___;
3016 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1
3017
3018 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3019 # ;; save counter for the next round
3020 # ;; increment counter overflow check register
3021 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
3022 addb \$16,@{[BYTE($CTR_CHECK)]}
3023 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3024 # ;; pre-load constants
3025 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
3026 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
3027 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
3028
3029 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3030 # ;; stitch AES rounds with GHASH
3031
3032 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3033 # ;; AES round 0 - ARK
3034
3035 vpxorq $AESKEY1,$B00_03,$B00_03
3036 vpxorq $AESKEY1,$B04_07,$B04_07
3037 vpxorq $AESKEY1,$B08_11,$B08_11
3038 vpxorq $AESKEY1,$B12_15,$B12_15
3039 vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1
3040
3041 # ;;==================================================
3042 # ;; GHASH 4 blocks (15 to 12)
3043 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
3044 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
3045 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
3046 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
3047 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
3048 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
3049
3050 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3051 # ;; AES round 1
3052 vaesenc $AESKEY2,$B00_03,$B00_03
3053 vaesenc $AESKEY2,$B04_07,$B04_07
3054 vaesenc $AESKEY2,$B08_11,$B08_11
3055 vaesenc $AESKEY2,$B12_15,$B12_15
3056 vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2
3057
3058 # ;; =================================================
3059 # ;; GHASH 4 blocks (11 to 8)
3060 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
3061 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
3062 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
3063 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
3064 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
3065 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
3066
3067 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3068 # ;; AES round 2
3069 vaesenc $AESKEY1,$B00_03,$B00_03
3070 vaesenc $AESKEY1,$B04_07,$B04_07
3071 vaesenc $AESKEY1,$B08_11,$B08_11
3072 vaesenc $AESKEY1,$B12_15,$B12_15
3073 vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1
3074
3075 # ;; =================================================
3076 # ;; GHASH 4 blocks (7 to 4)
3077 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
3078 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
3079 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
3080 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
3081 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 3
3083 vaesenc $AESKEY2,$B00_03,$B00_03
3084 vaesenc $AESKEY2,$B04_07,$B04_07
3085 vaesenc $AESKEY2,$B08_11,$B08_11
3086 vaesenc $AESKEY2,$B12_15,$B12_15
3087 vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2
3088
3089 # ;; =================================================
3090 # ;; Gather (XOR) GHASH for 12 blocks
3091 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
3092 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
3093 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
3094 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
3095
3096 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 4
3098 vaesenc $AESKEY1,$B00_03,$B00_03
3099 vaesenc $AESKEY1,$B04_07,$B04_07
3100 vaesenc $AESKEY1,$B08_11,$B08_11
3101 vaesenc $AESKEY1,$B12_15,$B12_15
3102 vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1
3103
3104 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3105 # ;; load plain/cipher text (recycle GH3xx registers)
3106 vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
3107 vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
3108 vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
3109 vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4
3110
3111 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 5
3113 vaesenc $AESKEY2,$B00_03,$B00_03
3114 vaesenc $AESKEY2,$B04_07,$B04_07
3115 vaesenc $AESKEY2,$B08_11,$B08_11
3116 vaesenc $AESKEY2,$B12_15,$B12_15
3117 vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2
3118
3119 # ;; =================================================
3120 # ;; GHASH 4 blocks (3 to 0)
3121 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
3122 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
3123 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
3124 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
3125 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3126 # ;; AES round 6
3127 vaesenc $AESKEY1,$B00_03,$B00_03
3128 vaesenc $AESKEY1,$B04_07,$B04_07
3129 vaesenc $AESKEY1,$B08_11,$B08_11
3130 vaesenc $AESKEY1,$B12_15,$B12_15
3131 vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1
3132___
3133
3134 # ;; =================================================
3135 # ;; gather GHASH in GH1L (low) and GH1H (high)
3136 if ($DO_REDUCTION eq "first_time") {
3137 $code .= <<___;
3138 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3139 vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM
3140 vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH
3141 vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL
3142___
3143 }
3144 if ($DO_REDUCTION eq "no_reduction") {
3145 $code .= <<___;
3146 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3147 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM
3148 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH
3149 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL
3150___
3151 }
3152 if ($DO_REDUCTION eq "final_reduction") {
3153 $code .= <<___;
3154 # ;; phase 1: add mid products together
3155 # ;; also load polynomial constant for reduction
3156 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3157 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
3158
3159 vpsrldq \$8,$GH1M,$GH2M
3160 vpslldq \$8,$GH1M,$GH1M
3161
3162 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
3163___
3164 }
3165
3166 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3167 # ;; AES round 7
3168 $code .= <<___;
3169 vaesenc $AESKEY2,$B00_03,$B00_03
3170 vaesenc $AESKEY2,$B04_07,$B04_07
3171 vaesenc $AESKEY2,$B08_11,$B08_11
3172 vaesenc $AESKEY2,$B12_15,$B12_15
3173 vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2
3174___
3175
3176 # ;; =================================================
3177 # ;; Add mid product to high and low
3178 if ($DO_REDUCTION eq "final_reduction") {
3179 $code .= <<___;
3180 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
3181 vpxorq $TO_REDUCE_H,$GH1H,$GH1H
3182 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
3183 vpxorq $TO_REDUCE_L,$GH1L,$GH1L
3184___
3185 }
3186
3187 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3188 # ;; AES round 8
3189 $code .= <<___;
3190 vaesenc $AESKEY1,$B00_03,$B00_03
3191 vaesenc $AESKEY1,$B04_07,$B04_07
3192 vaesenc $AESKEY1,$B08_11,$B08_11
3193 vaesenc $AESKEY1,$B12_15,$B12_15
3194 vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1
3195___
3196
3197 # ;; =================================================
3198 # ;; horizontal xor of low and high 4x128
3199 if ($DO_REDUCTION eq "final_reduction") {
3200 &VHPXORI4x128($GH1H, $GH2H);
3201 &VHPXORI4x128($GH1L, $GH2L);
3202 }
3203
3204 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3205 # ;; AES round 9
3206 $code .= <<___;
3207 vaesenc $AESKEY2,$B00_03,$B00_03
3208 vaesenc $AESKEY2,$B04_07,$B04_07
3209 vaesenc $AESKEY2,$B08_11,$B08_11
3210 vaesenc $AESKEY2,$B12_15,$B12_15
3211___
3212 if (($NROUNDS >= 11)) {
3213 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
3214 }
3215
3216 # ;; =================================================
3217 # ;; first phase of reduction
3218 if ($DO_REDUCTION eq "final_reduction") {
3219 $code .= <<___;
3220 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
3221 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
  vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
3223___
3224 }
3225
3226 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3227 # ;; AES rounds up to 11 (AES192) or 13 (AES256)
3228 # ;; AES128 is done
3229 if (($NROUNDS >= 11)) {
3230 $code .= <<___;
3231 vaesenc $AESKEY1,$B00_03,$B00_03
3232 vaesenc $AESKEY1,$B04_07,$B04_07
3233 vaesenc $AESKEY1,$B08_11,$B08_11
3234 vaesenc $AESKEY1,$B12_15,$B12_15
3235 vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1
3236
3237 vaesenc $AESKEY2,$B00_03,$B00_03
3238 vaesenc $AESKEY2,$B04_07,$B04_07
3239 vaesenc $AESKEY2,$B08_11,$B08_11
3240 vaesenc $AESKEY2,$B12_15,$B12_15
3241___
3242 if (($NROUNDS == 13)) {
3243 $code .= <<___;
3244 vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2
3245
3246 vaesenc $AESKEY1,$B00_03,$B00_03
3247 vaesenc $AESKEY1,$B04_07,$B04_07
3248 vaesenc $AESKEY1,$B08_11,$B08_11
3249 vaesenc $AESKEY1,$B12_15,$B12_15
3250 vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1
3251
3252 vaesenc $AESKEY2,$B00_03,$B00_03
3253 vaesenc $AESKEY2,$B04_07,$B04_07
3254 vaesenc $AESKEY2,$B08_11,$B08_11
3255 vaesenc $AESKEY2,$B12_15,$B12_15
3256___
3257 }
3258 }
3259
3260 # ;; =================================================
3261 # ;; second phase of the reduction
3262 if ($DO_REDUCTION eq "final_reduction") {
3263 $code .= <<___;
3264 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
3265 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
3266 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
3267 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
  # ;; GH1H = GH1H xor RED_T1 xor RED_T2
3269 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
3270___
3271 }
3272
3273 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3274 # ;; the last AES round
3275 $code .= <<___;
3276 vaesenclast $AESKEY1,$B00_03,$B00_03
3277 vaesenclast $AESKEY1,$B04_07,$B04_07
3278 vaesenclast $AESKEY1,$B08_11,$B08_11
3279 vaesenclast $AESKEY1,$B12_15,$B12_15
3280
3281 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3282 # ;; XOR against plain/cipher text
3283 vpxorq $DATA1,$B00_03,$B00_03
3284 vpxorq $DATA2,$B04_07,$B04_07
3285 vpxorq $DATA3,$B08_11,$B08_11
3286 vpxorq $DATA4,$B12_15,$B12_15
3287
3288 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3289 # ;; store cipher/plain text
3290 mov $CIPH_PLAIN_OUT,$IA0
3291 vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
3292 vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
3293 vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
3294 vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
3295___
3296
3297 # ;; =================================================
3298 # ;; shuffle cipher text blocks for GHASH computation
3299 if ($ENC_DEC eq "ENC") {
3300 $code .= <<___;
3301 vpshufb $SHFMSK,$B00_03,$B00_03
3302 vpshufb $SHFMSK,$B04_07,$B04_07
3303 vpshufb $SHFMSK,$B08_11,$B08_11
3304 vpshufb $SHFMSK,$B12_15,$B12_15
3305___
3306 } else {
3307 $code .= <<___;
3308 vpshufb $SHFMSK,$DATA1,$B00_03
3309 vpshufb $SHFMSK,$DATA2,$B04_07
3310 vpshufb $SHFMSK,$DATA3,$B08_11
3311 vpshufb $SHFMSK,$DATA4,$B12_15
3312___
3313 }
3314
3315 # ;; =================================================
3316 # ;; store shuffled cipher text for ghashing
3317 $code .= <<___;
3318 vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
3319 vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
3320 vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
3321 vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
3322___
3323}
3324
3325# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3326# ;;; Encryption of a single block
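# ;;; The dispatch below expects the rounds field of the AES_KEY structure to
# ;;; hold 9/11/13 for AES-128/192/256, i.e. the count of middle (vaesenc)
# ;;; rounds only: it performs the ARK with round key 0, NR vaesenc rounds,
# ;;; then vaesenclast with round key NR+1.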
3327sub ENCRYPT_SINGLE_BLOCK {
3328 my $AES_KEY = $_[0]; # ; [in]
3329 my $XMM0 = $_[1]; # ; [in/out]
3330 my $GPR1 = $_[2]; # ; [clobbered]
3331
3332 my $rndsuffix = &random_string();
3333
3334 $code .= <<___;
3335 # ; load number of rounds from AES_KEY structure (offset in bytes is
3336 # ; size of the |rd_key| buffer)
3337 mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]}
3338 cmp \$9,@{[DWORD($GPR1)]}
3339 je .Laes_128_${rndsuffix}
3340 cmp \$11,@{[DWORD($GPR1)]}
3341 je .Laes_192_${rndsuffix}
3342 cmp \$13,@{[DWORD($GPR1)]}
3343 je .Laes_256_${rndsuffix}
3344 jmp .Lexit_aes_${rndsuffix}
3345___
3346 for my $keylen (sort keys %aes_rounds) {
3347 my $nr = $aes_rounds{$keylen};
3348 $code .= <<___;
3349.align 32
3350.Laes_${keylen}_${rndsuffix}:
3351___
3352 $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n";
3353 for (my $i = 1; $i <= $nr; $i++) {
3354 $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n";
3355 }
3356 $code .= <<___;
3357 vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0
3358 jmp .Lexit_aes_${rndsuffix}
3359___
3360 }
3361 $code .= ".Lexit_aes_${rndsuffix}:\n\n";
3362}
3363
3364sub CALC_J0 {
3365 my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context
3366 my $IV = $_[1]; #; [in] Pointer to IV
3367 my $IV_LEN = $_[2]; #; [in] IV length
3368 my $J0 = $_[3]; #; [out] XMM reg to contain J0
3369 my $ZT0 = $_[4]; #; [clobbered] ZMM register
3370 my $ZT1 = $_[5]; #; [clobbered] ZMM register
3371 my $ZT2 = $_[6]; #; [clobbered] ZMM register
3372 my $ZT3 = $_[7]; #; [clobbered] ZMM register
3373 my $ZT4 = $_[8]; #; [clobbered] ZMM register
3374 my $ZT5 = $_[9]; #; [clobbered] ZMM register
3375 my $ZT6 = $_[10]; #; [clobbered] ZMM register
3376 my $ZT7 = $_[11]; #; [clobbered] ZMM register
3377 my $ZT8 = $_[12]; #; [clobbered] ZMM register
3378 my $ZT9 = $_[13]; #; [clobbered] ZMM register
3379 my $ZT10 = $_[14]; #; [clobbered] ZMM register
3380 my $ZT11 = $_[15]; #; [clobbered] ZMM register
3381 my $ZT12 = $_[16]; #; [clobbered] ZMM register
3382 my $ZT13 = $_[17]; #; [clobbered] ZMM register
3383 my $ZT14 = $_[18]; #; [clobbered] ZMM register
3384 my $ZT15 = $_[19]; #; [clobbered] ZMM register
3385 my $ZT16 = $_[20]; #; [clobbered] ZMM register
3386 my $T1 = $_[21]; #; [clobbered] GP register
3387 my $T2 = $_[22]; #; [clobbered] GP register
3388 my $T3 = $_[23]; #; [clobbered] GP register
3389 my $MASKREG = $_[24]; #; [clobbered] mask register
3390
  # ;; J0 = GHASH(IV || 0^s || 0^64 || [len(IV)*8]_64)
  # ;; where s = 16 * RoundUp(len(IV)/16) - len(IV) zero bytes pad the IV to a full block
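  # ;; Worked example (illustrative): for a 20-byte IV, s = 16*2 - 20 = 12
  # ;; zero bytes, so J0 = GHASH(IV || 0^12 || 0^8 || [160]_64), i.e. three
  # ;; 16-byte blocks, 160 being the IV length in bits.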
3393
3394 # ;; Calculate GHASH of (IV || 0s)
3395 $code .= "vpxor $J0,$J0,$J0\n";
3396 &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
3397 $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG);
3398
  # ;; Calculate GHASH of last 16-byte block (0^64 || [len(IV)*8]_64)
3400 $code .= <<___;
3401 mov $IV_LEN,$T1
3402 shl \$3,$T1 # ; IV length in bits
3403 vmovq $T1,@{[XWORD($ZT2)]}
3404
3405 # ;; Might need shuffle of ZT2
3406 vpxorq $J0,@{[XWORD($ZT2)]},$J0
3407
3408 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]}
3409___
3410 &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]});
3411
  $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16-byte swap\n";
3413}
3414
3415# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3416# ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for
3417# ;;; encoding/decoding.
3418# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3419sub GCM_INIT_IV {
3420 my $AES_KEYS = $_[0]; # [in] AES key schedule
3421 my $GCM128_CTX = $_[1]; # [in/out] GCM context
3422 my $IV = $_[2]; # [in] IV pointer
3423 my $IV_LEN = $_[3]; # [in] IV length
3424 my $GPR1 = $_[4]; # [clobbered] GP register
3425 my $GPR2 = $_[5]; # [clobbered] GP register
3426 my $GPR3 = $_[6]; # [clobbered] GP register
3427 my $MASKREG = $_[7]; # [clobbered] mask register
3428 my $CUR_COUNT = $_[8]; # [out] XMM with current counter
3429 my $ZT0 = $_[9]; # [clobbered] ZMM register
3430 my $ZT1 = $_[10]; # [clobbered] ZMM register
3431 my $ZT2 = $_[11]; # [clobbered] ZMM register
3432 my $ZT3 = $_[12]; # [clobbered] ZMM register
3433 my $ZT4 = $_[13]; # [clobbered] ZMM register
3434 my $ZT5 = $_[14]; # [clobbered] ZMM register
3435 my $ZT6 = $_[15]; # [clobbered] ZMM register
3436 my $ZT7 = $_[16]; # [clobbered] ZMM register
3437 my $ZT8 = $_[17]; # [clobbered] ZMM register
3438 my $ZT9 = $_[18]; # [clobbered] ZMM register
3439 my $ZT10 = $_[19]; # [clobbered] ZMM register
3440 my $ZT11 = $_[20]; # [clobbered] ZMM register
3441 my $ZT12 = $_[21]; # [clobbered] ZMM register
3442 my $ZT13 = $_[22]; # [clobbered] ZMM register
3443 my $ZT14 = $_[23]; # [clobbered] ZMM register
3444 my $ZT15 = $_[24]; # [clobbered] ZMM register
3445 my $ZT16 = $_[25]; # [clobbered] ZMM register
3446
3447 my $ZT0x = $ZT0;
3448 $ZT0x =~ s/zmm/xmm/;
3449
3450 $code .= <<___;
3451 cmp \$12,$IV_LEN
3452 je iv_len_12_init_IV
3453___
3454
  # ;; IV length is not 12 bytes
3456 &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7,
3457 $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
3458 $code .= <<___;
3459 jmp skip_iv_len_12_init_IV
3460iv_len_12_init_IV: # ;; IV is 12 bytes
3461 # ;; read 12 IV bytes and pad with 0x00000001
3462 vmovdqu8 ONEf(%rip),$CUR_COUNT
3463 mov $IV,$GPR2
3464 mov \$0x0000000000000fff,@{[DWORD($GPR1)]}
3465 kmovq $GPR1,$MASKREG
3466 vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1
3467skip_iv_len_12_init_IV:
3468 vmovdqu $CUR_COUNT,$ZT0x
3469___
3470 &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0)
3471 $code .= <<___;
3472 vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage
3473
3474 # ;; store IV as counter in LE format
3475 vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT
3476 vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi
3477___
3478}
3479
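# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; GCM_UPDATE_AAD hashes additional authenticated data into the current
# ;;; hash value: AAD_HASH = (AAD_HASH xor A_i) * H in GF(2^128) for each
# ;;; 16-byte AAD block A_i; the updated hash is written back to the context.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;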
3480sub GCM_UPDATE_AAD {
3481 my $GCM128_CTX = $_[0]; # [in] GCM context pointer
3482 my $A_IN = $_[1]; # [in] AAD pointer
3483 my $A_LEN = $_[2]; # [in] AAD length in bytes
3484 my $GPR1 = $_[3]; # [clobbered] GP register
3485 my $GPR2 = $_[4]; # [clobbered] GP register
3486 my $GPR3 = $_[5]; # [clobbered] GP register
3487 my $MASKREG = $_[6]; # [clobbered] mask register
3488 my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value
3489 my $ZT0 = $_[8]; # [clobbered] ZMM register
3490 my $ZT1 = $_[9]; # [clobbered] ZMM register
3491 my $ZT2 = $_[10]; # [clobbered] ZMM register
3492 my $ZT3 = $_[11]; # [clobbered] ZMM register
3493 my $ZT4 = $_[12]; # [clobbered] ZMM register
3494 my $ZT5 = $_[13]; # [clobbered] ZMM register
3495 my $ZT6 = $_[14]; # [clobbered] ZMM register
3496 my $ZT7 = $_[15]; # [clobbered] ZMM register
3497 my $ZT8 = $_[16]; # [clobbered] ZMM register
3498 my $ZT9 = $_[17]; # [clobbered] ZMM register
3499 my $ZT10 = $_[18]; # [clobbered] ZMM register
3500 my $ZT11 = $_[19]; # [clobbered] ZMM register
3501 my $ZT12 = $_[20]; # [clobbered] ZMM register
3502 my $ZT13 = $_[21]; # [clobbered] ZMM register
3503 my $ZT14 = $_[22]; # [clobbered] ZMM register
3504 my $ZT15 = $_[23]; # [clobbered] ZMM register
3505 my $ZT16 = $_[24]; # [clobbered] ZMM register
3506
3507 # ; load current hash
3508 $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n";
3509
3510 &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2,
3511 $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13,
3512 $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
3513
  # ; store updated hash back in the context
3515 $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n";
3516}
3517
3518# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3519# ;;; Cipher and ghash of payloads shorter than 256 bytes
3520# ;;; - number of blocks in the message comes as argument
3521# ;;; - depending on the number of blocks an optimized variant of
3522# ;;; INITIAL_BLOCKS_PARTIAL is invoked
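# ;;; The dispatch below is a small comparison tree (not a jump table);
# ;;; a sketch of the decisions:
# ;;;   cmp 8          -> exactly 8, or branch to the 1..7 / 9..16 halves
# ;;;   cmp 12 / cmp 4 -> narrow each half to groups of at most three
# ;;;   final cmp/je   -> exact .L_small_initial_num_blocks_is_N variant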
3523sub GCM_ENC_DEC_SMALL {
3524 my $AES_KEYS = $_[0]; # [in] key pointer
3525 my $GCM128_CTX = $_[1]; # [in] context pointer
3526 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
3527 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
3528 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
3529 my $ENC_DEC = $_[5]; # [in] cipher direction
3530 my $DATA_OFFSET = $_[6]; # [in] data offset
3531 my $LENGTH = $_[7]; # [in] data length
  my $NUM_BLOCKS = $_[8];     # [in] number of blocks to process (1 to 16)
3533 my $CTR = $_[9]; # [in/out] XMM counter block
3534 my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value
3535 my $ZTMP0 = $_[11]; # [clobbered] ZMM register
3536 my $ZTMP1 = $_[12]; # [clobbered] ZMM register
3537 my $ZTMP2 = $_[13]; # [clobbered] ZMM register
3538 my $ZTMP3 = $_[14]; # [clobbered] ZMM register
3539 my $ZTMP4 = $_[15]; # [clobbered] ZMM register
3540 my $ZTMP5 = $_[16]; # [clobbered] ZMM register
3541 my $ZTMP6 = $_[17]; # [clobbered] ZMM register
3542 my $ZTMP7 = $_[18]; # [clobbered] ZMM register
3543 my $ZTMP8 = $_[19]; # [clobbered] ZMM register
3544 my $ZTMP9 = $_[20]; # [clobbered] ZMM register
3545 my $ZTMP10 = $_[21]; # [clobbered] ZMM register
3546 my $ZTMP11 = $_[22]; # [clobbered] ZMM register
3547 my $ZTMP12 = $_[23]; # [clobbered] ZMM register
3548 my $ZTMP13 = $_[24]; # [clobbered] ZMM register
3549 my $ZTMP14 = $_[25]; # [clobbered] ZMM register
3550 my $IA0 = $_[26]; # [clobbered] GP register
3551 my $IA1 = $_[27]; # [clobbered] GP register
3552 my $MASKREG = $_[28]; # [clobbered] mask register
3553 my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask
3554 my $PBLOCK_LEN = $_[30]; # [in] partial block length
3555
3556 my $rndsuffix = &random_string();
3557
3558 $code .= <<___;
3559 cmp \$8,$NUM_BLOCKS
3560 je .L_small_initial_num_blocks_is_8_${rndsuffix}
3561 jl .L_small_initial_num_blocks_is_7_1_${rndsuffix}
3562
3563
3564 cmp \$12,$NUM_BLOCKS
3565 je .L_small_initial_num_blocks_is_12_${rndsuffix}
3566 jl .L_small_initial_num_blocks_is_11_9_${rndsuffix}
3567
3568 # ;; 16, 15, 14 or 13
3569 cmp \$16,$NUM_BLOCKS
3570 je .L_small_initial_num_blocks_is_16_${rndsuffix}
3571 cmp \$15,$NUM_BLOCKS
3572 je .L_small_initial_num_blocks_is_15_${rndsuffix}
3573 cmp \$14,$NUM_BLOCKS
3574 je .L_small_initial_num_blocks_is_14_${rndsuffix}
3575 jmp .L_small_initial_num_blocks_is_13_${rndsuffix}
3576
3577.L_small_initial_num_blocks_is_11_9_${rndsuffix}:
3578 # ;; 11, 10 or 9
3579 cmp \$11,$NUM_BLOCKS
3580 je .L_small_initial_num_blocks_is_11_${rndsuffix}
3581 cmp \$10,$NUM_BLOCKS
3582 je .L_small_initial_num_blocks_is_10_${rndsuffix}
3583 jmp .L_small_initial_num_blocks_is_9_${rndsuffix}
3584
3585.L_small_initial_num_blocks_is_7_1_${rndsuffix}:
3586 cmp \$4,$NUM_BLOCKS
3587 je .L_small_initial_num_blocks_is_4_${rndsuffix}
3588 jl .L_small_initial_num_blocks_is_3_1_${rndsuffix}
3589 # ;; 7, 6 or 5
3590 cmp \$7,$NUM_BLOCKS
3591 je .L_small_initial_num_blocks_is_7_${rndsuffix}
3592 cmp \$6,$NUM_BLOCKS
3593 je .L_small_initial_num_blocks_is_6_${rndsuffix}
3594 jmp .L_small_initial_num_blocks_is_5_${rndsuffix}
3595
3596.L_small_initial_num_blocks_is_3_1_${rndsuffix}:
3597 # ;; 3, 2 or 1
3598 cmp \$3,$NUM_BLOCKS
3599 je .L_small_initial_num_blocks_is_3_${rndsuffix}
3600 cmp \$2,$NUM_BLOCKS
3601 je .L_small_initial_num_blocks_is_2_${rndsuffix}
3602
3603 # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed
3604
3605 # ;; Generation of different block size variants
  # ;; - the one-block variant must come first (it is the fall-through target)
3607___
3608
3609 for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) {
3610 $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${rndsuffix}:\n";
3611 &INITIAL_BLOCKS_PARTIAL(
3612 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET,
3613 $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1,
3614 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3615 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3616 $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN);
3617
3618 if ($num_blocks != 16) {
3619 $code .= "jmp .L_small_initial_blocks_encrypted_${rndsuffix}\n";
3620 }
3621 }
3622
3623 $code .= ".L_small_initial_blocks_encrypted_${rndsuffix}:\n";
3624}
3625
3626# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3627# ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context
3628# ; struct has been initialized by GCM_INIT_IV
3629# ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
3630# ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
3631# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3632sub GCM_ENC_DEC {
3633 my $AES_KEYS = $_[0]; # [in] AES Key schedule
3634 my $GCM128_CTX = $_[1]; # [in] context pointer
3635 my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update
3636 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer
3637 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
3638 my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer
3639 my $ENC_DEC = $_[6]; # [in] cipher direction
3640
3641 my $IA0 = "%r10";
3642 my $IA1 = "%r12";
3643 my $IA2 = "%r13";
3644 my $IA3 = "%r15";
3645 my $IA4 = "%r11";
3646 my $IA5 = "%rax";
3647 my $IA6 = "%rbx";
3648 my $IA7 = "%r14";
3649
3650 my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;
3651
3652 my $CTR_CHECK = $IA3;
3653 my $DATA_OFFSET = $IA4;
3654 my $HASHK_PTR = $IA6;
3655
3656 my $HKEYS_READY = $IA7;
3657
3658 my $CTR_BLOCKz = "%zmm2";
3659 my $CTR_BLOCKx = "%xmm2";
3660
3661 # ; hardcoded in GCM_INIT
3662
3663 my $AAD_HASHz = "%zmm14";
3664 my $AAD_HASHx = "%xmm14";
3665
3666 # ; hardcoded in GCM_COMPLETE
3667
3668 my $ZTMP0 = "%zmm0";
3669 my $ZTMP1 = "%zmm3";
3670 my $ZTMP2 = "%zmm4";
3671 my $ZTMP3 = "%zmm5";
3672 my $ZTMP4 = "%zmm6";
3673 my $ZTMP5 = "%zmm7";
3674 my $ZTMP6 = "%zmm10";
3675 my $ZTMP7 = "%zmm11";
3676 my $ZTMP8 = "%zmm12";
3677 my $ZTMP9 = "%zmm13";
3678 my $ZTMP10 = "%zmm15";
3679 my $ZTMP11 = "%zmm16";
3680 my $ZTMP12 = "%zmm17";
3681
3682 my $ZTMP13 = "%zmm19";
3683 my $ZTMP14 = "%zmm20";
3684 my $ZTMP15 = "%zmm21";
3685 my $ZTMP16 = "%zmm30";
3686 my $ZTMP17 = "%zmm31";
3687 my $ZTMP18 = "%zmm1";
3688 my $ZTMP19 = "%zmm18";
3689 my $ZTMP20 = "%zmm8";
3690 my $ZTMP21 = "%zmm22";
3691 my $ZTMP22 = "%zmm23";
3692
3693 my $GH = "%zmm24";
3694 my $GL = "%zmm25";
3695 my $GM = "%zmm26";
3696 my $SHUF_MASK = "%zmm29";
3697
3698 # ; Unused in the small packet path
3699 my $ADDBE_4x4 = "%zmm27";
3700 my $ADDBE_1234 = "%zmm28";
3701
3702 my $MASKREG = "%k1";
3703
3704 my $rndsuffix = &random_string();
3705
3706 # ;; reduction every 48 blocks, depth 32 blocks
3707 # ;; @note 48 blocks is the maximum capacity of the stack frame
3708 my $big_loop_nblocks = 48;
3709 my $big_loop_depth = 32;
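  # ;; i.e. the frame buffers 48 ciphertext blocks in three 16-block regions
  # ;; and GHASH trails the cipher by 32 blocks: while blocks i..i+15 are
  # ;; being encrypted, blocks i-32..i-17 are being hashed, with one
  # ;; reduction per 48-block iteration.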
3710
3711 # ;;; Macro flow depending on packet size
3712 # ;;; - LENGTH <= 16 blocks
3713 # ;;; - cipher followed by hashing (reduction)
3714 # ;;; - 16 blocks < LENGTH < 32 blocks
3715 # ;;; - cipher 16 blocks
3716 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;; - 32 blocks <= LENGTH < 48 blocks
3718 # ;;; - cipher 2 x 16 blocks
3719 # ;;; - hash 16 blocks
3720 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3721 # ;;; - LENGTH >= 48 blocks
3722 # ;;; - cipher 2 x 16 blocks
3723 # ;;; - while (data_to_cipher >= 48 blocks):
3724 # ;;; - cipher 16 blocks & hash 16 blocks
3725 # ;;; - cipher 16 blocks & hash 16 blocks
3726 # ;;; - cipher 16 blocks & hash 16 blocks (reduction)
3727 # ;;; - if (data_to_cipher >= 32 blocks):
3728 # ;;; - cipher 16 blocks & hash 16 blocks
3729 # ;;; - cipher 16 blocks & hash 16 blocks
3730 # ;;; - hash 16 blocks (reduction)
3731 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3732 # ;;; - elif (data_to_cipher >= 16 blocks):
3733 # ;;; - cipher 16 blocks & hash 16 blocks
3734 # ;;; - hash 16 blocks
3735 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3736 # ;;; - else:
3737 # ;;; - hash 16 blocks
3738 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3739
3740 if ($win64) {
3741 $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n";
3742 } else {
3743 $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
3744 }
3745 $code .= "je .L_enc_dec_done_${rndsuffix}\n";
3746
  # Length value from context $CTX_OFFSET_InLen($GCM128_CTX) is updated in
  # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc'
3749
3750 $code .= "xor $HKEYS_READY, $HKEYS_READY\n";
3751 $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";
3752
3753 # ;; Used for the update flow - if there was a previous partial
3754 # ;; block fill the remaining bytes here.
3755 &PARTIAL_BLOCK(
3756 $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
3757 $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1,
3758 $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3,
3759 $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG);
3760
3761 $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";
3762
  # ;; Save the amount of data left to process in $LENGTH
  # ;; NOTE: on Linux $PLAIN_CIPH_LEN already lives in a register, so
  # ;; $LENGTH aliases it and no copy is needed
3765 if ($win64) {
3766 $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n";
3767 }
3768
3769 # ;; There may be no more data if it was consumed in the partial block.
3770 $code .= <<___;
3771 sub $DATA_OFFSET,$LENGTH
3772 je .L_enc_dec_done_${rndsuffix}
3773___
3774
3775 $code .= <<___;
3776 cmp \$`(16 * 16)`,$LENGTH
3777 jbe .L_message_below_equal_16_blocks_${rndsuffix}
3778
3779 vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK
3780 vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
3781 vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
3782
3783 # ;; start the pipeline
3784 # ;; - 32 blocks aes-ctr
3785 # ;; - 16 blocks ghash + aes-ctr
3786
3787 # ;; set up CTR_CHECK
3788 vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
3789 and \$255,@{[DWORD($CTR_CHECK)]}
3790 # ;; in LE format after init, convert to BE
3791 vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
3792 vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
3793___
3794
3795 # ;; ==== AES-CTR - first 16 blocks
3796 my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3797 my $data_in_out_offset = 0;
3798 &INITIAL_BLOCKS_16(
3799 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
3800 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
3801 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
3802 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
3803
3804 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3805 "first16");
3806
3807 $code .= <<___;
3808 cmp \$`(32 * 16)`,$LENGTH
3809 jb .L_message_below_32_blocks_${rndsuffix}
3810___
3811
3812 # ;; ==== AES-CTR - next 16 blocks
3813 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3814 $data_in_out_offset = (16 * 16);
3815 &INITIAL_BLOCKS_16(
3816 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
3817 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
3818 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
3819 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
3820
3821 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3822 "last32");
3823 $code .= "mov \$1,$HKEYS_READY\n";
3824
3825 $code .= <<___;
3826 add \$`(32 * 16)`,$DATA_OFFSET
3827 sub \$`(32 * 16)`,$LENGTH
3828
3829 cmp \$`($big_loop_nblocks * 16)`,$LENGTH
3830 jb .L_no_more_big_nblocks_${rndsuffix}
3831___
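  # ;; Each iteration rotates through the three 16-block stack regions:
  # ;; ciphertext written at offset 32*16 in one chunk is hashed from that
  # ;; region two chunks later, and so on, so AES output and GHASH input
  # ;; never overlap within a chunk.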
3832
3833 # ;; ====
3834 # ;; ==== AES-CTR + GHASH - 48 blocks loop
3835 # ;; ====
3836 $code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n";
3837
3838 # ;; ==== AES-CTR + GHASH - 16 blocks, start
3839 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3840 $data_in_out_offset = (0 * 16);
3841 my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3842 &GHASH_16_ENCRYPT_16_PARALLEL(
3843 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3844 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3845 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3846 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3847 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3848 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3849 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
3850 $IA0);
3851
3852 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
3853 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3854 $data_in_out_offset = (16 * 16);
3855 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3856 &GHASH_16_ENCRYPT_16_PARALLEL(
3857 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3858 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3859 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3860 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3861 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3862 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3863 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3864 $IA0);
3865
3866 # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
3867 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3868 $data_in_out_offset = (32 * 16);
3869 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3870 &GHASH_16_ENCRYPT_16_PARALLEL(
3871 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3872 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3873 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3874 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3875 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3876 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3877 $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3878 $IA0);
3879
  # ;; === save the reduced GHASH value (in ZTMP4) as the current AAD hash
3881 $code .= <<___;
3882 vmovdqa64 $ZTMP4,$AAD_HASHz
3883
3884 add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
3885 sub \$`($big_loop_nblocks * 16)`,$LENGTH
3886 cmp \$`($big_loop_nblocks * 16)`,$LENGTH
3887 jae .L_encrypt_big_nblocks_${rndsuffix}
3888
3889.L_no_more_big_nblocks_${rndsuffix}:
3890
3891 cmp \$`(32 * 16)`,$LENGTH
3892 jae .L_encrypt_32_blocks_${rndsuffix}
3893
3894 cmp \$`(16 * 16)`,$LENGTH
3895 jae .L_encrypt_16_blocks_${rndsuffix}
3896___
3897
3898 # ;; =====================================================
3899 # ;; =====================================================
3900 # ;; ==== GHASH 1 x 16 blocks
3901 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
3902 # ;; ==== then GHASH N blocks
3903 $code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n";
3904
3905 # ;; calculate offset to the right hash key
3906 $code .= <<___;
  mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
  and \$~15,@{[DWORD($IA0)]}
  mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
3911___
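  # ;; Illustrative note (layout per HashKeyOffsetByIdx): higher powers of H
  # ;; sit at lower frame offsets, so subtracting the remaining full-block
  # ;; byte count from the offset of H^32 selects the starting power
  # ;; H^(32+N); the GHASH key sequence then runs down towards H^1 as the
  # ;; remaining blocks are hashed.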
3912
3913 # ;; ==== GHASH 32 blocks and follow with reduction
3914 &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
3915 "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3916
  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
3918 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3919 $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
3920 &GCM_ENC_DEC_LAST(
3921 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
3922 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
3923 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3924 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
3925 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
3926 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
3927 "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
3928 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
3929
3930 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
3931 $code .= "jmp .L_ghash_done_${rndsuffix}\n";
3932
3933 # ;; =====================================================
3934 # ;; =====================================================
3935 # ;; ==== GHASH & encrypt 1 x 16 blocks
3936 # ;; ==== GHASH & encrypt 1 x 16 blocks
3937 # ;; ==== GHASH 1 x 16 blocks (reduction)
3938 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
3939 # ;; ==== then GHASH N blocks
3940 $code .= ".L_encrypt_32_blocks_${rndsuffix}:\n";
3941
3942 # ;; ==== AES-CTR + GHASH - 16 blocks, start
3943 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3944 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3945 $data_in_out_offset = (0 * 16);
3946 &GHASH_16_ENCRYPT_16_PARALLEL(
3947 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3948 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3949 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3950 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3951 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3952 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3953 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
3954 $IA0);
3955
3956 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
3957 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3958 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3959 $data_in_out_offset = (16 * 16);
3960 &GHASH_16_ENCRYPT_16_PARALLEL(
3961 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3962 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3963 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3964 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3965 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3966 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3967 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3968 $IA0);
3969
3970 # ;; ==== GHASH 16 blocks with reduction
3971 &GHASH_16(
3972 "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
3973 "%rsp", &HashKeyOffsetByIdx(16, "frame"),
3974 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3975
  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
3977 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3978 $code .= <<___;
3979 sub \$`(32 * 16)`,$LENGTH
3980 add \$`(32 * 16)`,$DATA_OFFSET
3981___
3982
3983 # ;; calculate offset to the right hash key
3984 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
3985 $code .= <<___;
3986 and \$~15,@{[DWORD($IA0)]}
3987 mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
3988 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
3989___
3990 &GCM_ENC_DEC_LAST(
3991 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
3992 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
3993 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3994 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
3995 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
3996 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
3997 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
3998 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
3999
4000 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4001 $code .= "jmp .L_ghash_done_${rndsuffix}\n";
4002
4003 # ;; =====================================================
4004 # ;; =====================================================
4005 # ;; ==== GHASH & encrypt 16 blocks (done before)
4006 # ;; ==== GHASH 1 x 16 blocks
4007 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
4008 # ;; ==== then GHASH N blocks
4009 $code .= ".L_encrypt_16_blocks_${rndsuffix}:\n";
4010
4011 # ;; ==== AES-CTR + GHASH - 16 blocks, start
4012 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
4013 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
4014 $data_in_out_offset = (0 * 16);
4015 &GHASH_16_ENCRYPT_16_PARALLEL(
4016 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
4017 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
4018 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
4019 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
4020 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
4021 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
4022 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4023 $IA0);
4024
4025 # ;; ==== GHASH 1 x 16 blocks
4026 &GHASH_16(
4027 "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
4028 "%rsp", &HashKeyOffsetByIdx(32, "frame"),
4029 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
4030
  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
4032 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
4033 $code .= <<___;
4034 sub \$`(16 * 16)`,$LENGTH
4035 add \$`(16 * 16)`,$DATA_OFFSET
4036___
4037 &GCM_ENC_DEC_LAST(
4038 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
4039 $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
4040 &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
4041 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
4042 $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
4043 $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4044 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
4045 $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
4046 $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4047 "end_reduce", $GL, $GH, $GM,
4048 $ENC_DEC, $AAD_HASHz, $IA0, $IA5,
4049 $MASKREG, $PBLOCK_LEN);
4050
4051 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4052 $code .= <<___;
4053 jmp .L_ghash_done_${rndsuffix}
4054
4055.L_message_below_32_blocks_${rndsuffix}:
4056 # ;; 32 > number of blocks > 16
4057
4058 sub \$`(16 * 16)`,$LENGTH
4059 add \$`(16 * 16)`,$DATA_OFFSET
4060___
4061 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
4062
4063 # ;; calculate offset to the right hash key
4064 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
4065
4066 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4067 "mid16");
4068 $code .= "mov \$1,$HKEYS_READY\n";
4069
4070 $code .= <<___;
  and \$~15,@{[DWORD($IA0)]}
  mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
4074___
4075
4076 &GCM_ENC_DEC_LAST(
4077 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
4078 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
4079 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4080 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4081 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
4082 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4083 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
4084 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
4085
4086 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4087 $code .= <<___;
4088 jmp .L_ghash_done_${rndsuffix}
4089
4090.L_message_below_equal_16_blocks_${rndsuffix}:
4091 # ;; Determine how many blocks to process
4092 # ;; - process one additional block if there is a partial block
4093 mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
4094 add \$15,@{[DWORD($IA1)]}
 shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 1 to 16
4096___
4097 &GCM_ENC_DEC_SMALL(
4098 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
4099 $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
4100 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4101 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4102 $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
4103 $PBLOCK_LEN);
4104
4105 # ;; fall through to exit
4106
4107 $code .= ".L_ghash_done_${rndsuffix}:\n";
4108
4109 # ;; save the last counter block
4110 $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
4111 $code .= <<___;
4112 vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
4113.L_enc_dec_done_${rndsuffix}:
4114___
4115}
4116
4117# ;;; ===========================================================================
4118# ;;; Encrypt/decrypt the initial 16 blocks
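# ;;; This primes the stitched pipeline: it only runs AES-CTR on 16 blocks
# ;;; and parks the byte-reflected ciphertext on the stack; hashing of these
# ;;; blocks is deferred to the GHASH_16_ENCRYPT_16_PARALLEL loop.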
4119sub INITIAL_BLOCKS_16 {
4120 my $IN = $_[0]; # [in] input buffer
4121 my $OUT = $_[1]; # [in] output buffer
4122 my $AES_KEYS = $_[2]; # [in] pointer to expanded keys
4123 my $DATA_OFFSET = $_[3]; # [in] data offset
4124 my $GHASH = $_[4]; # [in] ZMM with AAD (low 128 bits)
4125 my $CTR = $_[5]; # [in] ZMM with CTR BE blocks 4x128 bits
4126 my $CTR_CHECK = $_[6]; # [in/out] GPR with counter overflow check
4127 my $ADDBE_4x4 = $_[7]; # [in] ZMM 4x128bits with value 4 (big endian)
4128 my $ADDBE_1234 = $_[8]; # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
  my $T0 = $_[9];     # [clobbered] temporary ZMM register
  my $T1 = $_[10];    # [clobbered] temporary ZMM register
  my $T2 = $_[11];    # [clobbered] temporary ZMM register
  my $T3 = $_[12];    # [clobbered] temporary ZMM register
  my $T4 = $_[13];    # [clobbered] temporary ZMM register
  my $T5 = $_[14];    # [clobbered] temporary ZMM register
  my $T6 = $_[15];    # [clobbered] temporary ZMM register
  my $T7 = $_[16];    # [clobbered] temporary ZMM register
  my $T8 = $_[17];    # [clobbered] temporary ZMM register
4138 my $SHUF_MASK = $_[18]; # [in] ZMM with BE/LE shuffle mask
4139 my $ENC_DEC = $_[19]; # [in] ENC (encrypt) or DEC (decrypt) selector
4140 my $BLK_OFFSET = $_[20]; # [in] stack frame offset to ciphered blocks
4141 my $DATA_DISPL = $_[21]; # [in] fixed numerical data displacement/offset
  my $IA0 = $_[22];   # [clobbered] temporary GP register
4143
4144 my $B00_03 = $T5;
4145 my $B04_07 = $T6;
4146 my $B08_11 = $T7;
4147 my $B12_15 = $T8;
4148
4149 my $rndsuffix = &random_string();
4150
4151 my $stack_offset = $BLK_OFFSET;
4152 $code .= <<___;
4153 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4154 # ;; prepare counter blocks
4155
4156 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
4157 jae .L_next_16_overflow_${rndsuffix}
4158 vpaddd $ADDBE_1234,$CTR,$B00_03
4159 vpaddd $ADDBE_4x4,$B00_03,$B04_07
4160 vpaddd $ADDBE_4x4,$B04_07,$B08_11
4161 vpaddd $ADDBE_4x4,$B08_11,$B12_15
4162 jmp .L_next_16_ok_${rndsuffix}
4163.L_next_16_overflow_${rndsuffix}:
4164 vpshufb $SHUF_MASK,$CTR,$CTR
4165 vmovdqa64 ddq_add_4444(%rip),$B12_15
4166 vpaddd ddq_add_1234(%rip),$CTR,$B00_03
4167 vpaddd $B12_15,$B00_03,$B04_07
4168 vpaddd $B12_15,$B04_07,$B08_11
4169 vpaddd $B12_15,$B08_11,$B12_15
4170 vpshufb $SHUF_MASK,$B00_03,$B00_03
4171 vpshufb $SHUF_MASK,$B04_07,$B04_07
4172 vpshufb $SHUF_MASK,$B08_11,$B08_11
4173 vpshufb $SHUF_MASK,$B12_15,$B12_15
4174.L_next_16_ok_${rndsuffix}:
4175 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR
4176 addb \$16,@{[BYTE($CTR_CHECK)]}
4177 # ;; === load 16 blocks of data
4178 vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0
4179 vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1
4180 vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2
4181 vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3
4182
4183 # ;; move to AES encryption rounds
4184 vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4
4185 vpxorq $T4,$B00_03,$B00_03
4186 vpxorq $T4,$B04_07,$B04_07
4187 vpxorq $T4,$B08_11,$B08_11
4188 vpxorq $T4,$B12_15,$B12_15
4189___
4190 foreach (1 .. ($NROUNDS)) {
4191 $code .= <<___;
4192 vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4
4193 vaesenc $T4,$B00_03,$B00_03
4194 vaesenc $T4,$B04_07,$B04_07
4195 vaesenc $T4,$B08_11,$B08_11
4196 vaesenc $T4,$B12_15,$B12_15
4197___
4198 }
4199 $code .= <<___;
4200 vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4
4201 vaesenclast $T4,$B00_03,$B00_03
4202 vaesenclast $T4,$B04_07,$B04_07
4203 vaesenclast $T4,$B08_11,$B08_11
4204 vaesenclast $T4,$B12_15,$B12_15
4205
4206 # ;; xor against text
4207 vpxorq $T0,$B00_03,$B00_03
4208 vpxorq $T1,$B04_07,$B04_07
4209 vpxorq $T2,$B08_11,$B08_11
4210 vpxorq $T3,$B12_15,$B12_15
4211
4212 # ;; store
4213 mov $OUT, $IA0
4214 vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1)
4215 vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1)
4216 vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1)
4217 vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1)
4218___
4219 if ($ENC_DEC eq "DEC") {
4220 $code .= <<___;
4221 # ;; decryption - cipher text needs to go to GHASH phase
4222 vpshufb $SHUF_MASK,$T0,$B00_03
4223 vpshufb $SHUF_MASK,$T1,$B04_07
4224 vpshufb $SHUF_MASK,$T2,$B08_11
4225 vpshufb $SHUF_MASK,$T3,$B12_15
4226___
4227 } else {
4228 $code .= <<___;
4229 # ;; encryption
4230 vpshufb $SHUF_MASK,$B00_03,$B00_03
4231 vpshufb $SHUF_MASK,$B04_07,$B04_07
4232 vpshufb $SHUF_MASK,$B08_11,$B08_11
4233 vpshufb $SHUF_MASK,$B12_15,$B12_15
4234___
4235 }
4236
4237 if ($GHASH ne "no_ghash") {
4238 $code .= <<___;
4239 # ;; === xor cipher block 0 with GHASH for the next GHASH round
4240 vpxorq $GHASH,$B00_03,$B00_03
4241___
4242 }
4243 $code .= <<___;
4244 vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp)
4245 vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp)
4246 vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp)
4247 vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp)
4248___
4249}
4250
4251# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4252# ; GCM_COMPLETE Finishes ghash calculation
4253# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4254sub GCM_COMPLETE {
4255 my $GCM128_CTX = $_[0];
4256 my $PBLOCK_LEN = $_[1];
4257
4258 my $rndsuffix = &random_string();
4259
4260 $code .= <<___;
4261 vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2
4262 vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0)
4263___
4264
4265 $code .= <<___;
4266 vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4
4267
4268 # ;; Process the final partial block.
4269 cmp \$0,$PBLOCK_LEN
4270 je .L_partial_done_${rndsuffix}
4271___
4272
  # ; GHASH computation for the last <16-byte block
4274 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4275
4276 $code .= <<___;
4277.L_partial_done_${rndsuffix}:
4278 vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5
4279 vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C)
4280 vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits
4281
4282 vpxor %xmm5,%xmm4,%xmm4
4283___
4284
4285 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4286
4287 $code .= <<___;
  vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16-byte swap
4289 vpxor %xmm4,%xmm3,%xmm3
4290
4291.L_return_T_${rndsuffix}:
4292 vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX)
4293___
4294}
4295
4296# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4297# ;;; Functions definitions
4298# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4299
4300$code .= ".text\n";
4301{
4302 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4303 # ;void ossl_aes_gcm_init_avx512 /
4304 # ; (const void *aes_keys,
4305 # ; void *gcm128ctx)
4306 # ;
4307 # ; Precomputes hashkey table for GHASH optimization.
4308 # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4309 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4310 $code .= <<___;
4311.globl ossl_aes_gcm_init_avx512
4312.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4313.align 32
4314ossl_aes_gcm_init_avx512:
4315.cfi_startproc
4316 endbranch
4317___
4318 if ($CHECK_FUNCTION_ARGUMENTS) {
4319 $code .= <<___;
4320 # ;; Check aes_keys != NULL
4321 test $arg1,$arg1
4322 jz .Labort_init
4323
4324 # ;; Check gcm128ctx != NULL
4325 test $arg2,$arg2
4326 jz .Labort_init
4327___
4328 }
4329 $code .= "vpxorq %xmm16,%xmm16,%xmm16\n";
4330 &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey
4331 $code .= <<___;
4332 vpshufb SHUF_MASK(%rip),%xmm16,%xmm16
4333 # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
4334 vmovdqa64 %xmm16,%xmm2
4335 vpsllq \$1,%xmm16,%xmm16
4336 vpsrlq \$63,%xmm2,%xmm2
4337 vmovdqa %xmm2,%xmm1
4338 vpslldq \$8,%xmm2,%xmm2
4339 vpsrldq \$8,%xmm1,%xmm1
4340 vporq %xmm2,%xmm16,%xmm16
4341 # ;reduction
4342 vpshufd \$0b00100100,%xmm1,%xmm2
4343 vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
4344 vpand POLY(%rip),%xmm2,%xmm2
4345 vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly
4346 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4347 vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
4348___
4349 &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4350 if ($CLEAR_SCRATCH_REGISTERS) {
4351 &clear_scratch_gps_asm();
4352 &clear_scratch_zmms_asm();
4353 } else {
4354 $code .= "vzeroupper\n";
4355 }
4356 $code .= <<___;
4357.Labort_init:
4358ret
4359.cfi_endproc
4360.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
4361___
4362}
4363
4364# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4365# ;void ossl_aes_gcm_setiv_avx512
4366# ; (const void *aes_keys,
4367# ; void *gcm128ctx,
4368# ; const unsigned char *iv,
4369# ; size_t ivlen)
4370# ;
4371# ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
4372# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4373$code .= <<___;
4374.globl ossl_aes_gcm_setiv_avx512
4375.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
4376.align 32
4377ossl_aes_gcm_setiv_avx512:
4378.cfi_startproc
4379.Lsetiv_seh_begin:
4380 endbranch
4381___
4382if ($CHECK_FUNCTION_ARGUMENTS) {
4383 $code .= <<___;
4384 # ;; Check aes_keys != NULL
4385 test $arg1,$arg1
4386 jz .Labort_setiv
4387
4388 # ;; Check gcm128ctx != NULL
4389 test $arg2,$arg2
4390 jz .Labort_setiv
4391
4392 # ;; Check iv != NULL
4393 test $arg3,$arg3
4394 jz .Labort_setiv
4395
4396 # ;; Check ivlen != 0
4397 test $arg4,$arg4
4398 jz .Labort_setiv
4399___
4400}
4401
4402# ; NOTE: code before PROLOG() must not modify any registers
4403&PROLOG(
4404 1, # allocate stack space for hkeys
4405 0, # do not allocate stack space for AES blocks
4406 "setiv");
4407&GCM_INIT_IV(
4408 "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1",
4409 "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12",
4410 "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
4411&EPILOG(
4412 1, # hkeys were allocated
4413 $arg4);
4414$code .= <<___;
4415.Labort_setiv:
4416ret
4417.Lsetiv_seh_end:
4418.cfi_endproc
4419.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
4420___
4421
4422# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4423# ;void ossl_aes_gcm_update_aad_avx512
4424# ; (unsigned char *gcm128ctx,
4425# ; const unsigned char *aad,
4426# ; size_t aadlen)
4427# ;
4428# ; Updates AAD hash in gcm128_context structure.
4429# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4430$code .= <<___;
4431.globl ossl_aes_gcm_update_aad_avx512
4432.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
4433.align 32
4434ossl_aes_gcm_update_aad_avx512:
4435.cfi_startproc
4436.Lghash_seh_begin:
4437 endbranch
4438___
4439if ($CHECK_FUNCTION_ARGUMENTS) {
4440 $code .= <<___;
4441 # ;; Check gcm128ctx != NULL
4442 test $arg1,$arg1
4443 jz .Lexit_update_aad
4444
4445 # ;; Check aad != NULL
4446 test $arg2,$arg2
4447 jz .Lexit_update_aad
4448
4449 # ;; Check aadlen != 0
4450 test $arg3,$arg3
4451 jz .Lexit_update_aad
4452___
4453}
4454
4455# ; NOTE: code before PROLOG() must not modify any registers
4456&PROLOG(
4457 1, # allocate stack space for hkeys
4458 0, # do not allocate stack space for AES blocks
4459 "ghash");
4460&GCM_UPDATE_AAD(
4461 "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11",
4462 "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13",
4463 "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
4464&EPILOG(
4465 1, # hkeys were allocated
4466 $arg3);
4467$code .= <<___;
4468.Lexit_update_aad:
4469ret
4470.Lghash_seh_end:
4471.cfi_endproc
4472.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
4473___
4474
4475# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4476# ;void ossl_aes_gcm_encrypt_avx512
4477# ; (const void *aes_keys,
4478# ; void *gcm128ctx,
4479# ; unsigned int *pblocklen,
4480# ; const unsigned char *in,
4481# ; size_t len,
4482# ; unsigned char *out);
4483# ;
4484# ; Performs encryption of data |in| of length |len| and stores the output in |out|.
4485# ; Stores the encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
4486# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4487$code .= <<___;
4488.globl ossl_aes_gcm_encrypt_avx512
4489.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
4490.align 32
4491ossl_aes_gcm_encrypt_avx512:
4492.cfi_startproc
4493.Lencrypt_seh_begin:
4494 endbranch
4495___
4496
4497# ; NOTE: code before PROLOG() must not modify any registers
4498&PROLOG(
4499 1, # allocate stack space for hkeys
4500 1, # allocate stack space for AES blocks
4501 "encrypt");
4502if ($CHECK_FUNCTION_ARGUMENTS) {
4503 $code .= <<___;
4504 # ;; Check aes_keys != NULL
4505 test $arg1,$arg1
4506 jz .Lexit_gcm_encrypt
4507
4508 # ;; Check gcm128ctx != NULL
4509 test $arg2,$arg2
4510 jz .Lexit_gcm_encrypt
4511
4512 # ;; Check pblocklen != NULL
4513 test $arg3,$arg3
4514 jz .Lexit_gcm_encrypt
4515
4516 # ;; Check in != NULL
4517 test $arg4,$arg4
4518 jz .Lexit_gcm_encrypt
4519
4520 # ;; Check if len != 0
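  # ;; (cmp \$0 is used instead of test for the last two arguments, as
  # ;; they may be memory operands under some calling conventions)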
4521 cmp \$0,$arg5
4522 jz .Lexit_gcm_encrypt
4523
4524 # ;; Check out != NULL
4525 cmp \$0,$arg6
4526 jz .Lexit_gcm_encrypt
4527___
4528}
4529$code .= <<___;
4530 # ; load number of rounds from AES_KEY structure (offset in bytes is
4531 # ; size of the |rd_key| buffer)
4532 mov `4*15*4`($arg1),%eax
4533 cmp \$9,%eax
4534 je .Laes_gcm_encrypt_128_avx512
4535 cmp \$11,%eax
4536 je .Laes_gcm_encrypt_192_avx512
4537 cmp \$13,%eax
4538 je .Laes_gcm_encrypt_256_avx512
4539 xor %eax,%eax
4540 jmp .Lexit_gcm_encrypt
4541___
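# Note: in AES_KEY the |rounds| field sits immediately after the
# |rd_key| array, i.e. at byte offset 4*15*4 = 240 (up to 15 round keys
# of four 32-bit words each); the values tested above select the
# per-key-size paths generated below from the %aes_rounds mapping.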
4542for my $keylen (sort keys %aes_rounds) {
4543 $NROUNDS = $aes_rounds{$keylen};
4544 $code .= <<___;
4545.align 32
4546.Laes_gcm_encrypt_${keylen}_avx512:
4547___
4548 &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
4549 $code .= "jmp .Lexit_gcm_encrypt\n";
4550}
4551$code .= ".Lexit_gcm_encrypt:\n";
4552&EPILOG(1, $arg5);
4553$code .= <<___;
4554ret
4555.Lencrypt_seh_end:
4556.cfi_endproc
4557.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
4558___
4559
4560# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4561# ;void ossl_aes_gcm_decrypt_avx512
4562# ; (const void *aes_keys,
4563# ; void *gcm128ctx,
4564# ; unsigned int *pblocklen,
4565# ; const unsigned char *in,
4566# ; size_t len,
4567# ; unsigned char *out);
4568# ;
4569# ; Performs decryption of data |in| of length |len| and stores the output in |out|.
4570# ; Stores the decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
4571# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4572$code .= <<___;
4573.globl ossl_aes_gcm_decrypt_avx512
4574.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
4575.align 32
4576ossl_aes_gcm_decrypt_avx512:
4577.cfi_startproc
4578.Ldecrypt_seh_begin:
4579 endbranch
4580___
4581
4582# ; NOTE: code before PROLOG() must not modify any registers
4583&PROLOG(
4584 1, # allocate stack space for hkeys
4585 1, # allocate stack space for AES blocks
4586 "decrypt");
4587if ($CHECK_FUNCTION_ARGUMENTS) {
4588 $code .= <<___;
 # ;; Check aes_keys != NULL
4590 test $arg1,$arg1
4591 jz .Lexit_gcm_decrypt
4592
4593 # ;; Check gcm128ctx != NULL
4594 test $arg2,$arg2
4595 jz .Lexit_gcm_decrypt
4596
4597 # ;; Check pblocklen != NULL
4598 test $arg3,$arg3
4599 jz .Lexit_gcm_decrypt
4600
4601 # ;; Check in != NULL
4602 test $arg4,$arg4
4603 jz .Lexit_gcm_decrypt
4604
4605 # ;; Check if len != 0
4606 cmp \$0,$arg5
4607 jz .Lexit_gcm_decrypt
4608
4609 # ;; Check out != NULL
4610 cmp \$0,$arg6
4611 jz .Lexit_gcm_decrypt
4612___
4613}
4614$code .= <<___;
4615 # ; load number of rounds from AES_KEY structure (offset in bytes is
4616 # ; size of the |rd_key| buffer)
4617 mov `4*15*4`($arg1),%eax
4618 cmp \$9,%eax
4619 je .Laes_gcm_decrypt_128_avx512
4620 cmp \$11,%eax
4621 je .Laes_gcm_decrypt_192_avx512
4622 cmp \$13,%eax
4623 je .Laes_gcm_decrypt_256_avx512
4624 xor %eax,%eax
4625 jmp .Lexit_gcm_decrypt
4626___
4627for my $keylen (sort keys %aes_rounds) {
4628 $NROUNDS = $aes_rounds{$keylen};
4629 $code .= <<___;
4630.align 32
4631.Laes_gcm_decrypt_${keylen}_avx512:
4632___
4633 &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
4634 $code .= "jmp .Lexit_gcm_decrypt\n";
4635}
4636$code .= ".Lexit_gcm_decrypt:\n";
4637&EPILOG(1, $arg5);
4638$code .= <<___;
4639ret
4640.Ldecrypt_seh_end:
4641.cfi_endproc
4642.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
4643___
4644
4645# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4646# ;void ossl_aes_gcm_finalize_avx512
4647# ; (void *gcm128ctx,
4648# ; unsigned int pblocklen);
4649# ;
4650# ; Finalizes encryption / decryption and computes the authentication tag.
4651# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4652# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
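# ; Typical call sequence (illustrative): ossl_aes_gcm_init_avx512 once
# ; per key, ossl_aes_gcm_setiv_avx512 per message, then
# ; ossl_aes_gcm_update_aad_avx512 and ossl_aes_gcm_encrypt_avx512 (or
# ; ossl_aes_gcm_decrypt_avx512), and finally this function to produce
# ; the authentication tag.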
4653$code .= <<___;
4654.globl ossl_aes_gcm_finalize_avx512
4655.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
4656.align 32
4657ossl_aes_gcm_finalize_avx512:
4658.cfi_startproc
4659 endbranch
4660___
4661if ($CHECK_FUNCTION_ARGUMENTS) {
4662 $code .= <<___;
4663 # ;; Check gcm128ctx != NULL
4664 test $arg1,$arg1
4665 jz .Labort_finalize
4666___
4667}
4668
4669&GCM_COMPLETE("$arg1", "$arg2");
4670
4671$code .= <<___;
4672.Labort_finalize:
4673ret
4674.cfi_endproc
4675.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
4676___
4677
4678# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4679# ;void ossl_gcm_gmult_avx512(u64 Xi[2],
4680# ; const void* gcm128ctx)
4681# ;
4682# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4683# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
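# ; Computes Xi = Xi * H in place (a single GHASH multiplication), with
# ; HashKey<<1 mod poly taken from index 1 of the hkeys table in the
# ; context.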
4684$code .= <<___;
4685.globl ossl_gcm_gmult_avx512
4686.hidden ossl_gcm_gmult_avx512
4687.type ossl_gcm_gmult_avx512,\@abi-omnipotent
4688.align 32
4689ossl_gcm_gmult_avx512:
4690.cfi_startproc
4691 endbranch
4692___
4693if ($CHECK_FUNCTION_ARGUMENTS) {
4694 $code .= <<___;
4695 # ;; Check Xi != NULL
4696 test $arg1,$arg1
4697 jz .Labort_gmult
4698
4699 # ;; Check gcm128ctx != NULL
4700 test $arg2,$arg2
4701 jz .Labort_gmult
4702___
4703}
4704$code .= "vmovdqu64 ($arg1),%xmm1\n";
4705$code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";
4706
4707&GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4708
4709$code .= "vmovdqu64 %xmm1,($arg1)\n";
4710if ($CLEAR_SCRATCH_REGISTERS) {
4711 &clear_scratch_gps_asm();
4712 &clear_scratch_zmms_asm();
4713} else {
4714 $code .= "vzeroupper\n";
4715}
4716$code .= <<___;
4717.Labort_gmult:
4718ret
4719.cfi_endproc
4720.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
4721___
4722
4723if ($win64) {
4724
4725 # Add unwind metadata for SEH.
4726
4727 # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
4728 my $UWOP_PUSH_NONVOL = 0;
4729 my $UWOP_ALLOC_LARGE = 1;
4730 my $UWOP_SET_FPREG = 3;
4731 my $UWOP_SAVE_XMM128 = 8;
4732 my %UWOP_REG_NUMBER = (
4733 rax => 0,
4734 rcx => 1,
4735 rdx => 2,
4736 rbx => 3,
4737 rsp => 4,
4738 rbp => 5,
4739 rsi => 6,
4740 rdi => 7,
4741 map(("r$_" => $_), (8 .. 15)));
4742
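  # Each unwind code occupies one 16-bit slot: a byte with the prolog
  # offset of the instruction, then a byte carrying the operation code
  # in the low nibble and its info (register number or scaled offset)
  # in the high nibble; UWOP_ALLOC_LARGE and UWOP_SAVE_XMM128 take one
  # extra slot for their 16-bit operand (emitted with .value below).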
4743 $code .= <<___;
4744.section .pdata
4745.align 4
4746 .rva .Lsetiv_seh_begin
4747 .rva .Lsetiv_seh_end
4748 .rva .Lsetiv_seh_info
4749
4750 .rva .Lghash_seh_begin
4751 .rva .Lghash_seh_end
4752 .rva .Lghash_seh_info
4753
4754 .rva .Lencrypt_seh_begin
4755 .rva .Lencrypt_seh_end
4756 .rva .Lencrypt_seh_info
4757
4758 .rva .Ldecrypt_seh_begin
4759 .rva .Ldecrypt_seh_end
4760 .rva .Ldecrypt_seh_info
4761
4762.section .xdata
4763___
4764
4765 foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") {
4766 $code .= <<___;
4767.align 8
4768.L${func_name}_seh_info:
4769 .byte 1 # version 1, no flags
224ea84b 4770 .byte .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin
 4771 .byte 31 # num_slots = 1*8 + 2 + 1 + 2*10 (GPR pushes + alloc + set-FP + XMM saves)
 4772 # Frame register = rbp; offset from RSP = $XMM_STORAGE, scaled by 16
 4773 .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]}
4774___
4775
4776 # Metadata for %xmm15-%xmm6
4777 # Occupy 2 slots each
4778 for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
4779
4780 # Scaled-by-16 stack offset
4781 my $xmm_reg_offset = ($reg_idx - 6);
4782 $code .= <<___;
224ea84b 4783 .byte .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin
4784 .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]}
4785 .value $xmm_reg_offset
4786___
4787 }
4788
4789 $code .= <<___;
4790 # Frame pointer (occupy 1 slot)
224ea84b 4791 .byte .L${func_name}_seh_setfp-.L${func_name}_seh_begin
4792 .byte $UWOP_SET_FPREG
4793
4794 # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
224ea84b 4795 .byte .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin
4796 .byte $UWOP_ALLOC_LARGE
4797 .value `($XMM_STORAGE + 8) / 8`
4798___
4799
4800 # Metadata for GPR regs
4801 # Occupy 1 slot each
4802 foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") {
4803 $code .= <<___;
224ea84b 4804 .byte .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin
4805 .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]}
4806___
4807 }
4808 }
4809}
4810
4811$code .= <<___;
4812.data
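# ;;; POLY is the GCM reduction polynomial in bit-reflected representation.
# ;;; POLY2 is a pre-shifted variant of it replicated across four 128-bit
# ;;; lanes for the vectorized two-stage reduction. TWOONE is used to
# ;;; detect the carry when computing HashKey<<1 mod poly.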
4813.align 16
4814POLY: .quad 0x0000000000000001, 0xC200000000000000
4815
4816.align 64
4817POLY2:
4818 .quad 0x00000001C2000000, 0xC200000000000000
4819 .quad 0x00000001C2000000, 0xC200000000000000
4820 .quad 0x00000001C2000000, 0xC200000000000000
4821 .quad 0x00000001C2000000, 0xC200000000000000
4822
4823.align 16
4824TWOONE: .quad 0x0000000000000001, 0x0000000100000000
4825
4826# ;;; Order of these constants should not change.
4827# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
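# ;;; SHUF_MASK byte-reflects each 128-bit lane (little- <-> big-endian).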
4828.align 64
4829SHUF_MASK:
4830 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4831 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4832 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4833 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4834
4835.align 16
4836SHIFT_MASK:
4837 .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
4838
4839ALL_F:
4840 .quad 0xffffffffffffffff, 0xffffffffffffffff
4841
4842ZERO:
4843 .quad 0x0000000000000000, 0x0000000000000000
4844
4845.align 16
4846ONE:
4847 .quad 0x0000000000000001, 0x0000000000000000
4848
4849.align 16
4850ONEf:
4851 .quad 0x0000000000000000, 0x0100000000000000
4852
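# ;;; ONE/ONEf above and the ddq_* tables below are counter increments:
# ;;; ddq_add_* hold little-endian qword increments for the four counter
# ;;; blocks kept in one zmm register, while ddq_addbe_* hold the same
# ;;; increments byte-reflected for counters kept in big-endian form.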
4853.align 64
4854ddq_add_1234:
4855 .quad 0x0000000000000001, 0x0000000000000000
4856 .quad 0x0000000000000002, 0x0000000000000000
4857 .quad 0x0000000000000003, 0x0000000000000000
4858 .quad 0x0000000000000004, 0x0000000000000000
4859
4860.align 64
4861ddq_add_5678:
4862 .quad 0x0000000000000005, 0x0000000000000000
4863 .quad 0x0000000000000006, 0x0000000000000000
4864 .quad 0x0000000000000007, 0x0000000000000000
4865 .quad 0x0000000000000008, 0x0000000000000000
4866
4867.align 64
4868ddq_add_4444:
4869 .quad 0x0000000000000004, 0x0000000000000000
4870 .quad 0x0000000000000004, 0x0000000000000000
4871 .quad 0x0000000000000004, 0x0000000000000000
4872 .quad 0x0000000000000004, 0x0000000000000000
4873
4874.align 64
4875ddq_add_8888:
4876 .quad 0x0000000000000008, 0x0000000000000000
4877 .quad 0x0000000000000008, 0x0000000000000000
4878 .quad 0x0000000000000008, 0x0000000000000000
4879 .quad 0x0000000000000008, 0x0000000000000000
4880
4881.align 64
4882ddq_addbe_1234:
4883 .quad 0x0000000000000000, 0x0100000000000000
4884 .quad 0x0000000000000000, 0x0200000000000000
4885 .quad 0x0000000000000000, 0x0300000000000000
4886 .quad 0x0000000000000000, 0x0400000000000000
4887
4888.align 64
4889ddq_addbe_4444:
4890 .quad 0x0000000000000000, 0x0400000000000000
4891 .quad 0x0000000000000000, 0x0400000000000000
4892 .quad 0x0000000000000000, 0x0400000000000000
4893 .quad 0x0000000000000000, 0x0400000000000000
4894
4895.align 64
4896byte_len_to_mask_table:
4897 .value 0x0000, 0x0001, 0x0003, 0x0007
4898 .value 0x000f, 0x001f, 0x003f, 0x007f
4899 .value 0x00ff, 0x01ff, 0x03ff, 0x07ff
4900 .value 0x0fff, 0x1fff, 0x3fff, 0x7fff
4901 .value 0xffff
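# ;;; Each entry above is (1 << len) - 1, i.e. a partial-load mask covering
# ;;; the first |len| bytes of a 16-byte register; byte64_len_to_mask_table
# ;;; below extends the same pattern to 64-byte zmm registers.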
4902
4903.align 64
4904byte64_len_to_mask_table:
4905 .quad 0x0000000000000000, 0x0000000000000001
4906 .quad 0x0000000000000003, 0x0000000000000007
4907 .quad 0x000000000000000f, 0x000000000000001f
4908 .quad 0x000000000000003f, 0x000000000000007f
4909 .quad 0x00000000000000ff, 0x00000000000001ff
4910 .quad 0x00000000000003ff, 0x00000000000007ff
4911 .quad 0x0000000000000fff, 0x0000000000001fff
4912 .quad 0x0000000000003fff, 0x0000000000007fff
4913 .quad 0x000000000000ffff, 0x000000000001ffff
4914 .quad 0x000000000003ffff, 0x000000000007ffff
4915 .quad 0x00000000000fffff, 0x00000000001fffff
4916 .quad 0x00000000003fffff, 0x00000000007fffff
4917 .quad 0x0000000000ffffff, 0x0000000001ffffff
4918 .quad 0x0000000003ffffff, 0x0000000007ffffff
4919 .quad 0x000000000fffffff, 0x000000001fffffff
4920 .quad 0x000000003fffffff, 0x000000007fffffff
4921 .quad 0x00000000ffffffff, 0x00000001ffffffff
4922 .quad 0x00000003ffffffff, 0x00000007ffffffff
4923 .quad 0x0000000fffffffff, 0x0000001fffffffff
4924 .quad 0x0000003fffffffff, 0x0000007fffffffff
4925 .quad 0x000000ffffffffff, 0x000001ffffffffff
4926 .quad 0x000003ffffffffff, 0x000007ffffffffff
4927 .quad 0x00000fffffffffff, 0x00001fffffffffff
4928 .quad 0x00003fffffffffff, 0x00007fffffffffff
4929 .quad 0x0000ffffffffffff, 0x0001ffffffffffff
4930 .quad 0x0003ffffffffffff, 0x0007ffffffffffff
4931 .quad 0x000fffffffffffff, 0x001fffffffffffff
4932 .quad 0x003fffffffffffff, 0x007fffffffffffff
4933 .quad 0x00ffffffffffffff, 0x01ffffffffffffff
4934 .quad 0x03ffffffffffffff, 0x07ffffffffffffff
4935 .quad 0x0fffffffffffffff, 0x1fffffffffffffff
4936 .quad 0x3fffffffffffffff, 0x7fffffffffffffff
4937 .quad 0xffffffffffffffff
4938___
4939
4940} else {
4941# Fallback for old assembler
4942$code .= <<___;
4943.text
4944.globl ossl_vaes_vpclmulqdq_capable
4945.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
4946ossl_vaes_vpclmulqdq_capable:
4947 xor %eax,%eax
4948 ret
4949.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
4950
4951.globl ossl_aes_gcm_init_avx512
4952.globl ossl_aes_gcm_setiv_avx512
4953.globl ossl_aes_gcm_update_aad_avx512
4954.globl ossl_aes_gcm_encrypt_avx512
4955.globl ossl_aes_gcm_decrypt_avx512
4956.globl ossl_aes_gcm_finalize_avx512
4957.globl ossl_gcm_gmult_avx512
4958
4959.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4960ossl_aes_gcm_init_avx512:
4961ossl_aes_gcm_setiv_avx512:
4962ossl_aes_gcm_update_aad_avx512:
4963ossl_aes_gcm_encrypt_avx512:
4964ossl_aes_gcm_decrypt_avx512:
4965ossl_aes_gcm_finalize_avx512:
4966ossl_gcm_gmult_avx512:
4967 .byte 0x0f,0x0b # ud2
4968 ret
4969.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
4970___
4971}
4972
4973$code =~ s/\`([^\`]*)\`/eval $1/gem;
4974print $code;
4975close STDOUT or die "error closing STDOUT: $!";