# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
# (https://github.com/intel/intel-ipsec-mb).
# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
#
# References:
# [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
#     Intel Architecture Processors. August, 2010.
# [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
#     Intel Architecture Processors. October, 2012.
# [3] Shay Gueron et al. Intel Carry-Less Multiplication Instruction and its
#     Usage for Computing the GCM Mode. May, 2010.
#
#
# December 2021
#
# Initial release.
#
# GCM128_CONTEXT structure has storage for 16 hkeys only, but this
# implementation can use up to 48. To avoid extending the context size,
# precompute and store only the first 16 hkeys in the context, and compute
# the rest on demand, keeping them in the local frame.
#
#======================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$avx512vaes = 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  $avx512vaes = ($1 >= 2.30);
}

if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}
if (!$avx512vaes && `$ENV{CC} -v 2>&1`
  =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
  my $ver = $3 + $4/100.0 + $5/10000.0;    # 3.1.0->3.01, 3.10.1->3.1001
  if ($1) {
    # Apple conditions, they use a different version series, see
    # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
    # clang 7.0.0 is Apple clang 10.0.1
    $avx512vaes = ($ver>=10.0001)
  } else {
    $avx512vaes = ($ver>=7.0);
  }
}

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;
#======================================================================
if ($avx512vaes>0) {    #<<<

$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
        mov OPENSSL_ia32cap_P+8(%rip), %rcx
        # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
        mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
        xor %eax,%eax
        and %rdx,%rcx
        cmp %rdx,%rcx
        cmove %rcx,%rax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___

# ; Mapping key length -> AES rounds count
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);
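
# ; Note: these counts cover the "middle" vaesenc rounds only. AES-128/192/256
# ; take 10/12/14 rounds in total: round 0 is an initial vpxorq with the round
# ; key and the final round uses vaesenclast instead (see
# ; ZMM_AESENC_ROUND_BLOCKS_0_16 below). Hypothetical usage:
# ;   my $nrounds = $aes_rounds{128};    # ; 9 - rounds 1..9 are vaesenc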

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements
my $HKEYS_STORAGE_CAPACITY = 48;
my $LOCAL_STORAGE_CAPACITY = 48;
my $HKEYS_CONTEXT_CAPACITY = 16;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary

my $GP_STORAGE  = $win64 ? 8 * 8 : 8 * 6;    # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE = $win64 ? (10 * 16) : 0;    # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for up to 48 AES blocks

my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);

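# ; With the values above, the dynamically allocated area is laid out as:
# ;   HKEYS storage: 0(%rsp) .. 767(%rsp)     (48 keys x 16 bytes)
# ;   LOCAL storage: 768(%rsp) .. 1535(%rsp)  (48 blocks x 16 bytes, when present)
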
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);

# ; This implementation follows the convention: for non-leaf functions (they
# ; must call PROLOG) %rbp is used as a frame pointer, and has a fixed offset
# ; from the function entry: $GP_STORAGE + [8 bytes of alignment (Windows
# ; only)]. This helps facilitate writing SEH handlers.
#
# ; Leaf functions here do not use more than 4 input arguments.
if ($win64) {
  $arg1 = "%rcx";
  $arg2 = "%rdx";
  $arg3 = "%r8";
  $arg4 = "%r9";
  $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)";    # +8 - alignment bytes
  $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
} else {
  $arg1 = "%rdi";
  $arg2 = "%rsi";
  $arg3 = "%rdx";
  $arg4 = "%rcx";
  $arg5 = "%r8";
  $arg6 = "%r9";
  $arg7 = "`$GP_STORAGE + 8*1`(%rbp)";
  $arg8 = "`$GP_STORAGE + 8*2`(%rbp)";
  $arg9 = "`$GP_STORAGE + 8*3`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
}

# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
my $CTX_OFFSET_CurCount  = (16 * 0);          # ; (Yi) Current counter for generation of encryption key
my $CTX_OFFSET_PEncBlock = (16 * 1);          # ; (repurposed EKi field) Partial block buffer
my $CTX_OFFSET_EK0       = (16 * 2);          # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
my $CTX_OFFSET_AadLen    = (16 * 3);          # ; (len.u[0]) Length of Hash which has been input
my $CTX_OFFSET_InLen     = ((16 * 3) + 8);    # ; (len.u[1]) Length of input data which will be encrypted or decrypted
my $CTX_OFFSET_AadHash   = (16 * 4);          # ; (Xi) Current hash
my $CTX_OFFSET_HTable    = (16 * 6);          # ; (Htable) Precomputed table (allows 16 values)

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Helper functions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; Generates "random" local labels
sub random_string() {
  my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  my $length = 15;
  my $str;
  map { $str .= $chars[rand(@chars)] } 1 .. $length;
  return $str;
}
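
# ; The suffix is appended to local labels, e.g.
# ; ".Lskip_hkeys_cleanup_${rndsuffix}" in EPILOG(), so that macros can be
# ; expanded multiple times in one file without label collisions.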

sub BYTE {
  my ($reg) = @_;
  if ($reg =~ /%r[abcd]x/i) {
    $reg =~ s/%r([abcd])x/%${1}l/i;
  } elsif ($reg =~ /%r[sdb][ip]/i) {
    $reg =~ s/%r([sdb][ip])/%${1}l/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  } else {
    die "BYTE: unknown register: $reg\n";
  }
  return $reg;
}

sub WORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  } else {
    die "WORD: unknown register: $reg\n";
  }
  return $reg;
}

sub DWORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  } else {
    die "DWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub XWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%xmm/i;
  } else {
    die "XWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub YWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%ymm/i;
  } else {
    die "YWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub ZWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%zmm/i;
  } else {
    die "ZWORD: unknown register: $reg\n";
  }
  return $reg;
}
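
# ; For example (illustration only):
# ;   BYTE("%rax")   -> "%al"      WORD("%rax")   -> "%ax"
# ;   DWORD("%r10")  -> "%r10d"    BYTE("%r10")   -> "%r10b"
# ;   XWORD("%zmm3") -> "%xmm3"    ZWORD("%xmm3") -> "%zmm3"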

# ; Helper function to construct effective address based on two kinds of
# ; offsets: numerical or located in the register
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement = 0 if (!$displacement);

  if ($offset =~ /^\d+\z/) {    # numerical offset
    return "`$offset + $displacement`($base)";
  } else {                      # offset resides in register
    return "$displacement($base,$offset,1)";
  }
}
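
# ; For example (illustration only):
# ;   EffectiveAddress("%rsi", 16, 32)     -> "`16 + 32`(%rsi)"
# ;   EffectiveAddress("%rsi", "%rax", 32) -> "32(%rsi,%rax,1)"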

# ; Provides memory location of corresponding HashKey power
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $base_str = ($base eq "%rsp") ? "frame" : "context";

  my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  return "$offset($base)";
}

# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  my $offset_base;
  my $offset_idx;
  if ($base eq "frame") {    # frame storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    $offset_base = $STACK_HKEYS_OFFSET;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  } else {                   # context storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
    $offset_base = $CTX_OFFSET_HTable;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  }
  return $offset_base + $offset_idx;
}
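
# ; Keys are stored highest power first. For example (illustration only):
# ;   HashKeyByIdx(48, "%rsp") -> "0(%rsp)"     # ; HashKey^48
# ;   HashKeyByIdx(1, "%rsp")  -> "752(%rsp)"   # ; HashKey^1
# ; and, with $ctx standing in for the context pointer (Htable at offset 96):
# ;   HashKeyByIdx(16, $ctx)   -> "96($ctx)"    # ; HashKey^16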

# ; Creates local frame and does back up of non-volatile registers.
# ; Holds stack unwinding directives.
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;

  my $DYNAMIC_STACK_ALLOC_SIZE            = 0;
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  $code .= <<___;
        push %rbx
.cfi_push %rbx
.L${func_name}_seh_push_rbx:
        push %rbp
.cfi_push %rbp
.L${func_name}_seh_push_rbp:
        push %r12
.cfi_push %r12
.L${func_name}_seh_push_r12:
        push %r13
.cfi_push %r13
.L${func_name}_seh_push_r13:
        push %r14
.cfi_push %r14
.L${func_name}_seh_push_r14:
        push %r15
.cfi_push %r15
.L${func_name}_seh_push_r15:
___

  if ($win64) {
    $code .= <<___;
        push %rdi
.L${func_name}_seh_push_rdi:
        push %rsi
.L${func_name}_seh_push_rsi:

        sub \$`$XMM_STORAGE+8`,%rsp    # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }
  $code .= <<___;
        # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
        # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
        # ; handlers. The requirement for a frame pointer is that its offset from
        # ; RSP shall be a multiple of 16, and not exceed 240 bytes. The frame pointer
        # ; itself seems to be reasonable to use here, because later we do 64-byte stack
        # ; alignment which gives us non-deterministic offsets and complicates writing
        # ; SEH handlers.
        #
        # ; It also serves as an anchor for retrieving stack arguments on both Linux
        # ; and Windows.
        lea `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
        # Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    $code .= <<___;
        sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
        and \$(-64),%rsp
___
  }
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Restore register content for the caller.
# ;;; And cleanup stack.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub EPILOG {
  my ($hkeys_storage_on_stack, $payload_len) = @_;

  my $rndsuffix = &random_string();

  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {

    # ; There is no need for hkeys cleanup if the payload length was small,
    # ; i.e. no hkeys were stored in the local frame storage
    $code .= <<___;
        cmpq \$`16*16`,$payload_len
        jbe .Lskip_hkeys_cleanup_${rndsuffix}
        vpxor %xmm0,%xmm0,%xmm0
___
    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
      $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
    }
    $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
  }

  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }

  if ($win64) {

    # ; restore xmm15:xmm6
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
___
    }
  }

  if ($win64) {

    # Forming valid epilog for SEH with use of frame pointer.
    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
    $code .= "lea 8(%rbp),%rsp\n";
  } else {
    $code .= "lea (%rbp),%rsp\n";
    $code .= ".cfi_def_cfa_register %rsp\n";
  }

  if ($win64) {
    $code .= <<___;
        pop %rsi
.cfi_pop %rsi
        pop %rdi
.cfi_pop %rdi
___
  }
  $code .= <<___;
        pop %r15
.cfi_pop %r15
        pop %r14
.cfi_pop %r14
        pop %r13
.cfi_pop %r13
        pop %r12
.cfi_pop %r12
        pop %rbp
.cfi_pop %rbp
        pop %rbx
.cfi_pop %rbx
___
}

# ; Clears all scratch ZMM registers
# ;
# ; It should be called before restoring the XMM registers
# ; for Windows (XMM6-XMM15).
# ;
sub clear_scratch_zmms_asm {

  # ; On Linux, all ZMM registers are scratch registers
  if (!$win64) {
    $code .= "vzeroall\n";
  } else {
    foreach my $i (0 .. 5) {
      $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
    }
  }
  foreach my $i (16 .. 31) {
    $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  }
}

# Clears all scratch GP registers
sub clear_scratch_gps_asm {
  foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
    $code .= "xor $reg,$reg\n";
  }
  if (!$win64) {
    foreach my $reg ("%rsi", "%rdi") {
      $code .= "xor $reg,$reg\n";
    }
  }
}

sub precompute_hkeys_on_stack {
  my $GCM128_CTX  = $_[0];
  my $HKEYS_READY = $_[1];
  my $ZTMP0       = $_[2];
  my $ZTMP1       = $_[3];
  my $ZTMP2       = $_[4];
  my $ZTMP3       = $_[5];
  my $ZTMP4       = $_[6];
  my $ZTMP5       = $_[7];
  my $ZTMP6       = $_[8];
  my $HKEYS_RANGE = $_[9];    # ; "first16", "mid16", "all", "first32", "last32"

  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
    if ($HKEYS_RANGE ne "first16"
    && $HKEYS_RANGE ne "mid16"
    && $HKEYS_RANGE ne "all"
    && $HKEYS_RANGE ne "first32"
    && $HKEYS_RANGE ne "last32");

  my $rndsuffix = &random_string();

  $code .= <<___;
        test $HKEYS_READY,$HKEYS_READY
        jnz .L_skip_hkeys_precomputation_${rndsuffix}
___

  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {

    # ; Fill the stack with the first 16 hkeys from the context
    $code .= <<___;
        # ; Move 16 hkeys from the context to stack
        vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
        vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
        vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
        vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
        vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
___
  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
        vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
___

  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=17..32
    my $i = 20;
    foreach (1 .. int((32 - 16) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
    my $i = 36;
    foreach (1 .. int((48 - 32) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
}

# ;; =============================================================================
# ;; Generic macro to produce code that executes $OPCODE instruction
# ;; on a selected number of AES blocks (16 bytes long) between 0 and 16.
# ;; All three operands of the instruction come from registers.
# ;; Note: if 3 blocks are left at the end, the instruction is produced to
# ;; operate on all 4 blocks (full width of ZMM)
sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  my $NUM_BLOCKS = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OPCODE     = $_[1];    # [in] instruction name
  my @DST;
  $DST[0] = $_[2];           # [out] destination ZMM register
  $DST[1] = $_[3];           # [out] destination ZMM register
  $DST[2] = $_[4];           # [out] destination ZMM register
  $DST[3] = $_[5];           # [out] destination ZMM register
  my @SRC1;
  $SRC1[0] = $_[6];          # [in] source 1 ZMM register
  $SRC1[1] = $_[7];          # [in] source 1 ZMM register
  $SRC1[2] = $_[8];          # [in] source 1 ZMM register
  $SRC1[3] = $_[9];          # [in] source 1 ZMM register
  my @SRC2;
  $SRC2[0] = $_[10];         # [in] source 2 ZMM register
  $SRC2[1] = $_[11];         # [in] source 2 ZMM register
  $SRC2[2] = $_[12];         # [in] source 2 ZMM register
  $SRC2[3] = $_[13];         # [in] source 2 ZMM register

  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $reg_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  foreach (1 .. ($NUM_BLOCKS / 4)) {
    $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
    $reg_idx++;
    $blocks_left -= 4;
  }

  my $DSTREG  = $DST[$reg_idx];
  my $SRC1REG = $SRC1[$reg_idx];
  my $SRC2REG = $SRC2[$reg_idx];

  if ($blocks_left == 1) {
    $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 2) {
    $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 3) {
    $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  }
}
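
# ;; For example, $NUM_BLOCKS = 7 emits one full-width ZMM $OPCODE for blocks
# ;; 0-3 and, because 3 blocks remain, a second full-width ZMM $OPCODE for
# ;; blocks 4-7; $NUM_BLOCKS = 6 would instead finish with a YMM-width
# ;; instruction covering blocks 4-5 only.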

# ;; =============================================================================
# ;; Loads specified number of AES blocks into ZMM registers using mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Loads take place at 1 byte granularity.
sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $INP         = $_[1];    # [in] input data pointer to read from
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @DST;
  $DST[0] = $_[3];            # [out] ZMM register with loaded data
  $DST[1] = $_[4];            # [out] ZMM register with loaded data
  $DST[2] = $_[5];            # [out] ZMM register with loaded data
  $DST[3] = $_[6];            # [out] ZMM register with loaded data
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $src_offset  = 0;
  my $dst_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
      $src_offset += 64;
      $dst_idx++;
      $blocks_left -= 4;
    }
  }

  my $DSTREG = $DST[$dst_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif (($blocks_left == 3 || $blocks_left == 4)) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  }
}
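
# ;; For example, $NUM_BLOCKS = 6 emits one unmasked 64-byte vmovdqu8 into
# ;; $DST[0] (blocks 0-3) followed by a masked, zeroing YMM load into
# ;; $DST[1] (blocks 4-5); the caller prepares $MASK to cover the valid bytes.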

# ;; =============================================================================
# ;; Stores specified number of AES blocks from ZMM registers with mask register
# ;; for the last stored register (xmm, ymm or zmm).
# ;; Stores take place at 1 byte granularity.
sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OUTP        = $_[1];    # [in] output data pointer to write to
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @SRC;
  $SRC[0] = $_[3];            # [in] ZMM register with data to store
  $SRC[1] = $_[4];            # [in] ZMM register with data to store
  $SRC[2] = $_[5];            # [in] ZMM register with data to store
  $SRC[3] = $_[6];            # [in] ZMM register with data to store
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $dst_offset  = 0;
  my $src_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
      $dst_offset += 64;
      $src_idx++;
      $blocks_left -= 4;
    }
  }

  my $SRCREG = $SRC[$src_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  }
}
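
# ;; Note: unlike the loads above, the masked stores do not use {z} - zeroing
# ;; masking is not applicable to memory destinations, so bytes not selected
# ;; by $MASK are simply left unmodified in memory.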

# ;;; ===========================================================================
# ;;; Handles AES encryption rounds
# ;;; It handles special cases: the last and first rounds
# ;;; Optionally, it performs XOR with data after the last AES round.
# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
# ;;; If 3 blocks are trailing, the operation is performed on the whole ZMM (4 blocks).
sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  my $L0B0_3   = $_[0];     # [in/out] zmm; blocks 0 to 3
  my $L0B4_7   = $_[1];     # [in/out] zmm; blocks 4 to 7
  my $L0B8_11  = $_[2];     # [in/out] zmm; blocks 8 to 11
  my $L0B12_15 = $_[3];     # [in/out] zmm; blocks 12 to 15
  my $KEY      = $_[4];     # [in] zmm containing round key
  my $ROUND    = $_[5];     # [in] round number
  my $D0_3     = $_[6];     # [in] zmm or no_data; plain/cipher text blocks 0-3
  my $D4_7     = $_[7];     # [in] zmm or no_data; plain/cipher text blocks 4-7
  my $D8_11    = $_[8];     # [in] zmm or no_data; plain/cipher text blocks 8-11
  my $D12_15   = $_[9];     # [in] zmm or no_data; plain/cipher text blocks 12-15
  my $NUMBL    = $_[10];    # [in] number of blocks; numerical value
  my $NROUNDS  = $_[11];    # [in] number of rounds; numerical value

  # ;;; === first AES round
  if ($ROUND < 1) {

    # ;; round 0
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === middle AES rounds
  if ($ROUND >= 1 && $ROUND <= $NROUNDS) {

    # ;; rounds 1 to 9/11/13
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  }

  # ;;; === last AES round
  if ($ROUND > $NROUNDS) {

    # ;; the last round - mix enclast with text xor's
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);

    # ;;; === XOR with data
    if ( ($D0_3 ne "no_data")
      && ($D4_7 ne "no_data")
      && ($D8_11 ne "no_data")
      && ($D12_15 ne "no_data"))
    {
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
        $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
    }
  }
}

# ;;; Horizontal XOR - 4 x 128bits xored together
sub VHPXORI4x128 {
  my $REG = $_[0];    # [in/out] ZMM with 4x128bits to xor; 128bit output
  my $TMP = $_[1];    # [clobbered] ZMM temporary register
  $code .= <<___;
        vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
        vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
        vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
        vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
___
}
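
# ;;; i.e. after the two folding steps above, the low 128 bits of $REG hold
# ;;; lane0 xor lane1 xor lane2 xor lane3 of the original 512-bit value.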

# ;;; AVX512 reduction macro
sub VCLMUL_REDUCE {
  my $OUT   = $_[0];    # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  my $POLY  = $_[1];    # [in] zmm/ymm/xmm: polynomial
  my $HI128 = $_[2];    # [in] zmm/ymm/xmm: high 128b of hash to reduce
  my $LO128 = $_[3];    # [in] zmm/ymm/xmm: low 128b of hash to reduce
  my $TMP0  = $_[4];    # [in] zmm/ymm/xmm: temporary register
  my $TMP1  = $_[5];    # [in] zmm/ymm/xmm: temporary register

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; first phase of the reduction
        vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
        vpslldq \$8,$TMP0,$TMP0           # ; shift-L 2 DWs
        vpxorq $TMP0,$LO128,$TMP0         # ; first phase of the reduction complete
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; second phase of the reduction
        vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
        vpsrldq \$4,$TMP1,$TMP1           # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
        vpslldq \$4,$OUT,$OUT             # ; shift-L 1-DW to obtain result with no shifts
        vpternlogq \$0x96,$HI128,$TMP1,$OUT    # ; OUT/GHASH = OUT xor TMP1 xor HI128
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}

# ;; ===========================================================================
# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
# ;; - it is assumed that data read from $INPTR is already shuffled and
# ;;   $INPTR address is 64 byte aligned
# ;; - there is an option to pass ready blocks through ZMM registers too;
# ;;   in that case 4 extra parameters need to be passed and the 21st
# ;;   ($ZTMP9) argument can be empty
sub GHASH_16 {
  my $TYPE = $_[0];     # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
                        # end_reduce (end with reduction), start_reduce
  my $GH    = $_[1];    # [in/out] ZMM ghash sum: high 128-bits
  my $GM    = $_[2];    # [in/out] ZMM ghash sum: middle 128-bits
  my $GL    = $_[3];    # [in/out] ZMM ghash sum: low 128-bits
  my $INPTR = $_[4];    # [in] data input pointer
  my $INOFF = $_[5];    # [in] data input offset
  my $INDIS = $_[6];    # [in] data input displacement
  my $HKPTR = $_[7];    # [in] hash key pointer
  my $HKOFF = $_[8];    # [in] hash key offset (can be either numerical offset, or register containing offset)
  my $HKDIS = $_[9];    # [in] hash key displacement
  my $HASH  = $_[10];   # [in/out] ZMM hash value in/out
  my $ZTMP0 = $_[11];   # [clobbered] temporary ZMM
  my $ZTMP1 = $_[12];   # [clobbered] temporary ZMM
  my $ZTMP2 = $_[13];   # [clobbered] temporary ZMM
  my $ZTMP3 = $_[14];   # [clobbered] temporary ZMM
  my $ZTMP4 = $_[15];   # [clobbered] temporary ZMM
  my $ZTMP5 = $_[16];   # [clobbered] temporary ZMM
  my $ZTMP6 = $_[17];   # [clobbered] temporary ZMM
  my $ZTMP7 = $_[18];   # [clobbered] temporary ZMM
  my $ZTMP8 = $_[19];   # [clobbered] temporary ZMM
  my $ZTMP9 = $_[20];   # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
  my $DAT0  = $_[21];   # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT1  = $_[22];   # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT2  = $_[23];   # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT3  = $_[24];   # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)

  my $start_ghash  = 0;
  my $do_reduction = 0;
  if ($TYPE eq "start") {
    $start_ghash = 1;
  }

  if ($TYPE eq "start_reduce") {
    $start_ghash  = 1;
    $do_reduction = 1;
  }

  if ($TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

  # ;; ghash blocks 0-3
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT0;
  }

  if ($start_ghash != 0) {
    $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0    # ; T0H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1    # ; T0L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2    # ; T0M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3    # ; T0M2 = a0*b1
___

  # ;; ghash blocks 4-7
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT1;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4    # ; T1H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5    # ; T1L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6    # ; T1M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7    # ; T1M2 = a0*b1
___

  # ;; update sums
  if ($start_ghash != 0) {
    $code .= <<___;
        vpxorq $ZTMP6,$ZTMP2,$GM                  # ; GM = T0M1 + T1M1
        vpxorq $ZTMP4,$ZTMP0,$GH                  # ; GH = T0H + T1H
        vpxorq $ZTMP5,$ZTMP1,$GL                  # ; GL = T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM       # ; GM += T0M2 + T1M2
___
  } else {    # ;; mid, end, end_reduce
    $code .= <<___;
        vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM       # ; GM += T0M1 + T1M1
        vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH       # ; GH += T0H + T1H
        vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL       # ; GL += T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM       # ; GM += T0M2 + T1M2
___
  }

  # ;; ghash blocks 8-11
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT2;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0    # ; T0H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1    # ; T0L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2    # ; T0M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3    # ; T0M2 = a0*b1
___

  # ;; ghash blocks 12-15
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT3;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4    # ; T1H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5    # ; T1L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6    # ; T1M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7    # ; T1M2 = a0*b1
        # ;; update sums
        vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM       # ; GM += T0M1 + T1M1
        vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH       # ; GH += T0H + T1H
        vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL       # ; GL += T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM       # ; GM += T0M2 + T1M2
___
  if ($do_reduction != 0) {
    $code .= <<___;
        # ;; integrate GM into GH and GL
        vpsrldq \$8,$GM,$ZTMP0
        vpslldq \$8,$GM,$ZTMP1
        vpxorq $ZTMP0,$GH,$GH
        vpxorq $ZTMP1,$GL,$GL
___

    # ;; add GH and GL 128-bit words horizontally
    &VHPXORI4x128($GH, $ZTMP0);
    &VHPXORI4x128($GL, $ZTMP1);

    # ;; reduction
    $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
    &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
  }
}
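
# ;; Each 4-block step above is a 128x128 carry-less multiply split into four
# ;; vpclmulqdq partial products (H = a1*b1, L = a0*b0, M1 = a1*b0,
# ;; M2 = a0*b1); vpternlogq with imm8 0x96 then folds them into the running
# ;; GH/GM/GL sums in a single three-way XOR (0x96 encodes a xor b xor c).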

# ;; ===========================================================================
# ;; GHASH 1 to 16 blocks of cipher text
# ;; - performs reduction at the end
# ;; - it doesn't load the data; it is assumed the data is already loaded and
# ;;   shuffled
sub GHASH_1_TO_16 {
  my $GCM128_CTX  = $_[0];    # [in] pointer to expanded keys
  my $GHASH       = $_[1];    # [out] ghash output
  my $T0H         = $_[2];    # [clobbered] temporary ZMM
  my $T0L         = $_[3];    # [clobbered] temporary ZMM
  my $T0M1        = $_[4];    # [clobbered] temporary ZMM
  my $T0M2        = $_[5];    # [clobbered] temporary ZMM
  my $T1H         = $_[6];    # [clobbered] temporary ZMM
  my $T1L         = $_[7];    # [clobbered] temporary ZMM
  my $T1M1        = $_[8];    # [clobbered] temporary ZMM
  my $T1M2        = $_[9];    # [clobbered] temporary ZMM
  my $HK          = $_[10];   # [clobbered] temporary ZMM
  my $AAD_HASH_IN = $_[11];   # [in] input hash value
  my @CIPHER_IN;
  $CIPHER_IN[0] = $_[12];     # [in] ZMM with cipher text blocks 0-3
  $CIPHER_IN[1] = $_[13];     # [in] ZMM with cipher text blocks 4-7
  $CIPHER_IN[2] = $_[14];     # [in] ZMM with cipher text blocks 8-11
  $CIPHER_IN[3] = $_[15];     # [in] ZMM with cipher text blocks 12-15
  my $NUM_BLOCKS = $_[16];    # [in] numerical value, number of blocks
  my $GH         = $_[17];    # [in] ZMM with hi product part
  my $GM         = $_[18];    # [in] ZMM with mid product part
  my $GL         = $_[19];    # [in] ZMM with lo product part

  die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  if (scalar(@_) == 17) {
    $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
  }

  if ($NUM_BLOCKS == 16) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; L = a0*b0
        vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H
        vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; M2 = a0*b1
        vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
        vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2    # ; M2 = a0*b1
        vpxorq $T1H,$T0H,$T1H
        vpxorq $T1L,$T0L,$T1L
        vpxorq $T1M1,$T0M1,$T1M1
        vpxorq $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 12) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; L = a0*b0
        vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H
        vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; M2 = a0*b1
        vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
        vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
___
  } elsif ($NUM_BLOCKS >= 8) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vpxorq $T1H,$T0H,$T1H
        vpxorq $T1L,$T0L,$T1L
        vpxorq $T1M1,$T0M1,$T1M1
        vpxorq $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 4) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2    # ; M2 = a0*b1
___
  }

  # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
  my $blocks_left = ($NUM_BLOCKS % 4);
  if ($blocks_left > 0) {

    # ;; =====================================================
    # ;; There are 1, 2 or 3 blocks left to process.
    # ;; It may also be that they are the only blocks to process.

    # ;; Set hash key and register index position for the remaining 1 to 3 blocks
    my $reg_idx = ($NUM_BLOCKS / 4);
    my $REG_IN  = $CIPHER_IN[$reg_idx];

    if ($blocks_left == 1) {
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
        vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]}    # ; M1 = a1*b0
        vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]}    # ; M2 = a0*b1
        vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]}     # ; H = a1*b1
        vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]}     # ; L = a0*b0
___
    } elsif ($blocks_left == 2) {
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]}    # ; M1 = a1*b0
        vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]}    # ; M2 = a0*b1
        vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]}     # ; H = a1*b1
        vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]}     # ; L = a0*b0
___
    } else {    # ; blocks_left == 3
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
        vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2    # ; M2 = a0*b1
        vpclmulqdq \$0x11,$HK,$REG_IN,$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$REG_IN,$T0L     # ; L = a0*b0
___
    }

    if (scalar(@_) == 20) {

      # ;; *** GH/GM/GL passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq $T1M1,$T0M1,$T0M1
        vpternlogq \$0x96,$T1M2,$GM,$T0M2
        vpternlogq \$0x96,$T1H,$GH,$T0H
        vpternlogq \$0x96,$T1L,$GL,$T0L
___
      } else {
        $code .= <<___;
        vpxorq $GM,$T0M1,$T0M1
        vpxorq $GH,$T0H,$T0H
        vpxorq $GL,$T0L,$T0L
___
      }
    } else {

      # ;; *** GH/GM/GL NOT passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq $T1M1,$T0M1,$T0M1
        vpxorq $T1M2,$T0M2,$T0M2
        vpxorq $T1H,$T0H,$T0H
        vpxorq $T1L,$T0L,$T0L
___
      }
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq $T0M2,$T0M1,$T0M1
        vpsrldq \$8,$T0M1,$T1M1
        vpslldq \$8,$T0M1,$T1M2
        vpxorq $T1M1,$T0H,$T0H
        vpxorq $T1M2,$T0L,$T0L
___
  } else {

    # ;; =====================================================
    # ;; number of blocks is 4, 8, 12 or 16
    # ;; T1H/L/M1/M2 hold the current product sums (not T0H/L/M1/M2)
    if (scalar(@_) == 20) {
      $code .= <<___;
        # ;; *** GH/GM/GL passed as arguments
        vpxorq $GM,$T1M1,$T1M1
        vpxorq $GH,$T1H,$T1H
        vpxorq $GL,$T1L,$T1L
___
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq $T1M2,$T1M1,$T1M1
        vpsrldq \$8,$T1M1,$T0M1
        vpslldq \$8,$T1M1,$T0M2
        vpxorq $T0M1,$T1H,$T0H
        vpxorq $T0M2,$T1L,$T0L
___
  }

  # ;; add TH and TL 128-bit words horizontally
  &VHPXORI4x128($T0H, $T1M1);
  &VHPXORI4x128($T0L, $T1M2);

  # ;; reduction
  $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n";
  &VCLMUL_REDUCE(
    @{[XWORD($GHASH)]},
    @{[XWORD($HK)]},
    @{[XWORD($T0H)]},
    @{[XWORD($T0L)]},
    @{[XWORD($T0M1)]},
    @{[XWORD($T0M2)]});
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 + x^121 + 1)
# ;; Input: A and B (128-bits each, bit-reflected)
# ;; Output: C = A*B*x mod poly, (i.e. >>1 )
# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
# ;;
# ;; Refer to [3] for more details.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GHASH_MUL {
  my $GH = $_[0];    #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
  my $HK = $_[1];    #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
  my $T1 = $_[2];    #; [clobbered] xmm/ymm/zmm
  my $T2 = $_[3];    #; [clobbered] xmm/ymm/zmm
  my $T3 = $_[4];    #; [clobbered] xmm/ymm/zmm

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpclmulqdq \$0x11,$HK,$GH,$T1    # ; $T1 = a1*b1
        vpclmulqdq \$0x00,$HK,$GH,$T2    # ; $T2 = a0*b0
        vpclmulqdq \$0x01,$HK,$GH,$T3    # ; $T3 = a1*b0
        vpclmulqdq \$0x10,$HK,$GH,$GH    # ; $GH = a0*b1
        vpxorq $T3,$GH,$GH

        vpsrldq \$8,$GH,$T3    # ; shift-R $GH 2 DWs
        vpslldq \$8,$GH,$GH    # ; shift-L $GH 2 DWs
        vpxorq $T3,$T1,$T1
        vpxorq $T2,$GH,$GH

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ; first phase of the reduction
        vmovdqu64 POLY2(%rip),$T3

        vpclmulqdq \$0x01,$GH,$T3,$T2
        vpslldq \$8,$T2,$T2    # ; shift-L $T2 2 DWs
        vpxorq $T2,$GH,$GH     # ; first phase of the reduction complete

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ; second phase of the reduction
        vpclmulqdq \$0x00,$GH,$T3,$T2
        vpsrldq \$4,$T2,$T2    # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$GH,$T3,$GH
        vpslldq \$4,$GH,$GH    # ; shift-L 1-DW to obtain result with no shifts
        # ; second phase of the reduction complete, the result is in $GH
        vpternlogq \$0x96,$T2,$T1,$GH    # ; GH = GH xor T1 xor T2
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}
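
# ;; In GF(2^128) terms, for each 128-bit lane the macro computes
# ;;   GH <- (GH * HK) * x mod (x^128 + x^127 + x^126 + x^121 + 1)
# ;; where the four vpclmulqdq partial products assemble the 256-bit product
# ;;   a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
# ;; before the two-phase reduction against POLY2 folds it back to 128 bits.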

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; PRECOMPUTE computes HashKey_i
sub PRECOMPUTE {
  my $GCM128_CTX = $_[0];    #; [in/out] context pointer, hkeys content updated
  my $HK         = $_[1];    #; [in] xmm, hash key
  my $T1         = $_[2];    #; [clobbered] xmm
  my $T2         = $_[3];    #; [clobbered] xmm
  my $T3         = $_[4];    #; [clobbered] xmm
  my $T4         = $_[5];    #; [clobbered] xmm
  my $T5         = $_[6];    #; [clobbered] xmm
  my $T6         = $_[7];    #; [clobbered] xmm

  my $ZT1 = &ZWORD($T1);
  my $ZT2 = &ZWORD($T2);
  my $ZT3 = &ZWORD($T3);
  my $ZT4 = &ZWORD($T4);
  my $ZT5 = &ZWORD($T5);
  my $ZT6 = &ZWORD($T6);

  my $YT1 = &YWORD($T1);
  my $YT2 = &YWORD($T2);
  my $YT3 = &YWORD($T3);
  my $YT4 = &YWORD($T4);
  my $YT5 = &YWORD($T5);
  my $YT6 = &YWORD($T6);

  $code .= <<___;
        vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
        vmovdqa $YT5,$YT4
___

  # ;; calculate HashKey^2<<1 mod poly
  &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);

  $code .= <<___;
        vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
        vinserti64x2 \$1,$HK,$YT4,$YT5
        vmovdqa64 $YT5,$YT6    # ;; YT6 = HashKey | HashKey^2
___

  # ;; use 2x128-bit computation
  # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
  &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3);    # ;; YT5 = HashKey^3 | HashKey^4

  $code .= <<___;
        vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}

        vinserti64x4 \$1,$YT6,$ZT5,$ZT5    # ;; ZT5 = YT6 | YT5

        # ;; switch to 4x128-bit computations now
        vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4    # ;; broadcast HashKey^4 across all ZT4
        vmovdqa64 $ZT5,$ZT6                 # ;; save HashKey^4 to HashKey^1 in ZT6
___

  # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= <<___;
        vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]}    # ;; HashKey^8 to HashKey^5 in ZT5 now
        vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4                   # ;; broadcast HashKey^8 across all ZT4
___

  # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
  # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution

  # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
  &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";

  # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";

  # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; READ_SMALL_DATA_INPUT
# ;; Packs xmm register with data when data input is less than or equal to 16 bytes
# ;; Returns 0 if data has length 0
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub READ_SMALL_DATA_INPUT {
  my $OUTPUT = $_[0];    # [out] xmm register
  my $INPUT  = $_[1];    # [in] buffer pointer to read from
  my $LENGTH = $_[2];    # [in] number of bytes to read
  my $TMP1   = $_[3];    # [clobbered]
  my $TMP2   = $_[4];    # [clobbered]
  my $MASK   = $_[5];    # [out] k1 to k7 register to store the partial block mask

  $code .= <<___;
        mov \$16,@{[DWORD($TMP2)]}
        lea byte_len_to_mask_table(%rip),$TMP1
        cmp $TMP2,$LENGTH
        cmovc $LENGTH,$TMP2
___
  if ($win64) {
    $code .= <<___;
        add $TMP2,$TMP1
        add $TMP2,$TMP1
        kmovw ($TMP1),$MASK
___
  } else {
    $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n";
  }
  $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n";
}
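
# ;; byte_len_to_mask_table (defined elsewhere in this file) is assumed to
# ;; hold 16-bit masks indexed by byte length, so e.g. $LENGTH = 5 picks the
# ;; word 0x001f and the masked, zeroing vmovdqu8 reads exactly 5 bytes into
# ;; $OUTPUT while clearing the remaining lanes.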
1384
1385# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1386# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
1387# Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
1388# Output: The hash of the data (AAD_HASH).
1389# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1390sub CALC_AAD_HASH {
1391 my $A_IN = $_[0]; # [in] AAD text pointer
1392 my $A_LEN = $_[1]; # [in] AAD length
1393 my $AAD_HASH = $_[2]; # [in/out] xmm ghash value
1394 my $GCM128_CTX = $_[3]; # [in] pointer to context
1395 my $ZT0 = $_[4]; # [clobbered] ZMM register
1396 my $ZT1 = $_[5]; # [clobbered] ZMM register
1397 my $ZT2 = $_[6]; # [clobbered] ZMM register
1398 my $ZT3 = $_[7]; # [clobbered] ZMM register
1399 my $ZT4 = $_[8]; # [clobbered] ZMM register
1400 my $ZT5 = $_[9]; # [clobbered] ZMM register
1401 my $ZT6 = $_[10]; # [clobbered] ZMM register
1402 my $ZT7 = $_[11]; # [clobbered] ZMM register
1403 my $ZT8 = $_[12]; # [clobbered] ZMM register
1404 my $ZT9 = $_[13]; # [clobbered] ZMM register
1405 my $ZT10 = $_[14]; # [clobbered] ZMM register
1406 my $ZT11 = $_[15]; # [clobbered] ZMM register
1407 my $ZT12 = $_[16]; # [clobbered] ZMM register
1408 my $ZT13 = $_[17]; # [clobbered] ZMM register
1409 my $ZT14 = $_[18]; # [clobbered] ZMM register
1410 my $ZT15 = $_[19]; # [clobbered] ZMM register
1411 my $ZT16 = $_[20]; # [clobbered] ZMM register
1412 my $T1 = $_[21]; # [clobbered] GP register
1413 my $T2 = $_[22]; # [clobbered] GP register
1414 my $T3 = $_[23]; # [clobbered] GP register
1415 my $MASKREG = $_[24]; # [clobbered] mask register
1416
1417 my $HKEYS_READY = "%rbx";
1418
1419 my $SHFMSK = $ZT13;
1420
1421 my $rndsuffix = &random_string();
1422
1423 $code .= <<___;
1424 mov $A_IN,$T1 # ; T1 = AAD
1425 mov $A_LEN,$T2 # ; T2 = aadLen
1426 or $T2,$T2
1427 jz .L_CALC_AAD_done_${rndsuffix}
1428
1429 xor $HKEYS_READY,$HKEYS_READY
1430 vmovdqa64 SHUF_MASK(%rip),$SHFMSK
1431
1432.L_get_AAD_loop48x16_${rndsuffix}:
1433 cmp \$`(48*16)`,$T2
1434 jl .L_exit_AAD_loop48x16_${rndsuffix}
1435___

  $code .= <<___;
        vmovdqu64       `64*0`($T1),$ZT1                # ; Blocks 0-3
        vmovdqu64       `64*1`($T1),$ZT2                # ; Blocks 4-7
        vmovdqu64       `64*2`($T1),$ZT3                # ; Blocks 8-11
        vmovdqu64       `64*3`($T1),$ZT4                # ; Blocks 12-15
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
  $code .= "mov \$1,$HKEYS_READY\n";

  &GHASH_16(
    "start", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        vmovdqu64       `16*16 + 64*0`($T1),$ZT1        # ; Blocks 16-19
        vmovdqu64       `16*16 + 64*1`($T1),$ZT2        # ; Blocks 20-23
        vmovdqu64       `16*16 + 64*2`($T1),$ZT3        # ; Blocks 24-27
        vmovdqu64       `16*16 + 64*3`($T1),$ZT4        # ; Blocks 28-31
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &GHASH_16(
    "mid", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        vmovdqu64       `32*16 + 64*0`($T1),$ZT1        # ; Blocks 32-35
        vmovdqu64       `32*16 + 64*1`($T1),$ZT2        # ; Blocks 36-39
        vmovdqu64       `32*16 + 64*2`($T1),$ZT3        # ; Blocks 40-43
        vmovdqu64       `32*16 + 64*3`($T1),$ZT4        # ; Blocks 44-47
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &GHASH_16(
    "end_reduce", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        sub             \$`(48*16)`,$T2
        je              .L_CALC_AAD_done_${rndsuffix}

        add             \$`(48*16)`,$T1
        jmp             .L_get_AAD_loop48x16_${rndsuffix}

.L_exit_AAD_loop48x16_${rndsuffix}:
        # ; Less than 48x16 bytes remaining
        cmp             \$`(32*16)`,$T2
        jl              .L_less_than_32x16_${rndsuffix}
___

  $code .= <<___;
        # ; Get next 16 blocks
        vmovdqu64       `64*0`($T1),$ZT1
        vmovdqu64       `64*1`($T1),$ZT2
        vmovdqu64       `64*2`($T1),$ZT3
        vmovdqu64       `64*3`($T1),$ZT4
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
  $code .= "mov \$1,$HKEYS_READY\n";

  &GHASH_16(
    "start", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        vmovdqu64       `16*16 + 64*0`($T1),$ZT1
        vmovdqu64       `16*16 + 64*1`($T1),$ZT2
        vmovdqu64       `16*16 + 64*2`($T1),$ZT3
        vmovdqu64       `16*16 + 64*3`($T1),$ZT4
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &GHASH_16(
    "end_reduce", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        sub             \$`(32*16)`,$T2
        je              .L_CALC_AAD_done_${rndsuffix}

        add             \$`(32*16)`,$T1
        jmp             .L_less_than_16x16_${rndsuffix}

.L_less_than_32x16_${rndsuffix}:
        cmp             \$`(16*16)`,$T2
        jl              .L_less_than_16x16_${rndsuffix}
        # ; Get next 16 blocks
        vmovdqu64       `64*0`($T1),$ZT1
        vmovdqu64       `64*1`($T1),$ZT2
        vmovdqu64       `64*2`($T1),$ZT3
        vmovdqu64       `64*3`($T1),$ZT4
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  # ;; This code path does not use more than 16 hkeys, so they can be taken from the context
  # ;; (not from the stack storage)
  &GHASH_16(
    "start_reduce", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
    &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        sub             \$`(16*16)`,$T2
        je              .L_CALC_AAD_done_${rndsuffix}

        add             \$`(16*16)`,$T1
        # ; Less than 16x16 bytes remaining
.L_less_than_16x16_${rndsuffix}:
        # ;; prep mask source address
        lea             byte64_len_to_mask_table(%rip),$T3
        lea             ($T3,$T2,8),$T3

        # ;; calculate number of blocks to ghash (including partial bytes)
        add             \$15,@{[DWORD($T2)]}
        shr             \$4,@{[DWORD($T2)]}
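        # ;; note: the add 15 / shr 4 pair rounds up, i.e. it computes
        # ;; ceil(remaining_aad_len / 16), so a trailing partial block
        # ;; counts as one more block to ghash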
        cmp             \$2,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_1_${rndsuffix}
        je              .L_AAD_blocks_2_${rndsuffix}
        cmp             \$4,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_3_${rndsuffix}
        je              .L_AAD_blocks_4_${rndsuffix}
        cmp             \$6,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_5_${rndsuffix}
        je              .L_AAD_blocks_6_${rndsuffix}
        cmp             \$8,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_7_${rndsuffix}
        je              .L_AAD_blocks_8_${rndsuffix}
        cmp             \$10,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_9_${rndsuffix}
        je              .L_AAD_blocks_10_${rndsuffix}
        cmp             \$12,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_11_${rndsuffix}
        je              .L_AAD_blocks_12_${rndsuffix}
        cmp             \$14,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_13_${rndsuffix}
        je              .L_AAD_blocks_14_${rndsuffix}
        cmp             \$15,@{[DWORD($T2)]}
        je              .L_AAD_blocks_15_${rndsuffix}
___

  # ;; fall through for 16 blocks

  # ;; The flow of each of these cases is identical:
  # ;; - load blocks plain text
  # ;; - shuffle loaded blocks
  # ;; - xor in current hash value into block 0
  # ;; - perform multiplications with ghash keys
  # ;; - jump to reduction code

  for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
    $code .= ".L_AAD_blocks_${aad_blocks}_${rndsuffix}:\n";
    if ($aad_blocks > 12) {
      $code .= "sub \$`12*16*8`, $T3\n";
    } elsif ($aad_blocks > 8) {
      $code .= "sub \$`8*16*8`, $T3\n";
    } elsif ($aad_blocks > 4) {
      $code .= "sub \$`4*16*8`, $T3\n";
    }
    $code .= "kmovq ($T3),$MASKREG\n";
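    # ;; Note: byte64_len_to_mask_table holds 8-byte load/store masks,
    # ;; one entry per byte length (entry n has the low n bits set).
    # ;; $T3 already points at &table[8 * remaining_len]; the sub above
    # ;; rebases it by 4/8/12 blocks' worth of entries so the kmovq
    # ;; fetches the mask for the bytes landing in the last used ZMM.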

    &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
      $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);

    &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
      $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);

    if ($aad_blocks > 1) {

      # ;; fall through to CALC_AAD_done in 1 block case
      $code .= "jmp .L_CALC_AAD_done_${rndsuffix}\n";
    }

  }
  $code .= ".L_CALC_AAD_done_${rndsuffix}:\n";

  # ;; result in AAD_HASH
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; PARTIAL_BLOCK
# ;; Handles encryption/decryption and the tag partial blocks between
# ;; update calls.
# ;; Requires the input data be at least 1 byte long.
# ;; Output:
# ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
# ;; AAD_HASH and updated GCM128_CTX
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub PARTIAL_BLOCK {
  my $GCM128_CTX     = $_[0];     # [in] key pointer
  my $PBLOCK_LEN     = $_[1];     # [in] partial block length
  my $CIPH_PLAIN_OUT = $_[2];     # [in] output buffer
  my $PLAIN_CIPH_IN  = $_[3];     # [in] input buffer
  my $PLAIN_CIPH_LEN = $_[4];     # [in] buffer length
  my $DATA_OFFSET    = $_[5];     # [out] data offset (gets set)
  my $AAD_HASH       = $_[6];     # [out] updated GHASH value
  my $ENC_DEC        = $_[7];     # [in] cipher direction
  my $GPTMP0         = $_[8];     # [clobbered] GP temporary register
  my $GPTMP1         = $_[9];     # [clobbered] GP temporary register
  my $GPTMP2         = $_[10];    # [clobbered] GP temporary register
  my $ZTMP0          = $_[11];    # [clobbered] ZMM temporary register
  my $ZTMP1          = $_[12];    # [clobbered] ZMM temporary register
  my $ZTMP2          = $_[13];    # [clobbered] ZMM temporary register
  my $ZTMP3          = $_[14];    # [clobbered] ZMM temporary register
  my $ZTMP4          = $_[15];    # [clobbered] ZMM temporary register
  my $ZTMP5          = $_[16];    # [clobbered] ZMM temporary register
  my $ZTMP6          = $_[17];    # [clobbered] ZMM temporary register
  my $ZTMP7          = $_[18];    # [clobbered] ZMM temporary register
  my $MASKREG        = $_[19];    # [clobbered] mask temporary register

  my $XTMP0 = &XWORD($ZTMP0);
  my $XTMP1 = &XWORD($ZTMP1);
  my $XTMP2 = &XWORD($ZTMP2);
  my $XTMP3 = &XWORD($ZTMP3);
  my $XTMP4 = &XWORD($ZTMP4);
  my $XTMP5 = &XWORD($ZTMP5);
  my $XTMP6 = &XWORD($ZTMP6);
  my $XTMP7 = &XWORD($ZTMP7);

  my $LENGTH = $DATA_OFFSET;
  my $IA0    = $GPTMP1;
  my $IA1    = $GPTMP2;
  my $IA2    = $GPTMP0;

  my $rndsuffix = &random_string();

  $code .= <<___;
        # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero
        mov             ($PBLOCK_LEN),$LENGTH
        or              $LENGTH,$LENGTH
        je              .L_partial_block_done_${rndsuffix}      # ; leave macro if no partial blocks
___

  &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);

  $code .= <<___;
        # ;; XTMP1 = my_ctx_data.partial_block_enc_key
        vmovdqu64       $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1
        vmovdqu64       @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2

        # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes
        # ;; (16 - $LENGTH is the number of bytes in plaintext mod 16)
        lea             SHIFT_MASK(%rip),$IA0
        add             $LENGTH,$IA0
        vmovdqu64       ($IA0),$XTMP3                           # ; shift right shuffle mask
        vpshufb         $XTMP3,$XTMP1,$XTMP1
___
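  # ;; Note: SHIFT_MASK is read at a byte offset of $LENGTH, so the
  # ;; vpshufb above selects a sliding byte-permute pattern; the intent
  # ;; (per the comment above) is to shift the buffered keystream block
  # ;; right by $LENGTH bytes so it lines up with the newly read input.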

  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        # ;; keep copy of cipher text in $XTMP4
        vmovdqa64       $XTMP0,$XTMP4
___
  }
  $code .= <<___;
        vpxorq          $XTMP0,$XTMP1,$XTMP1            # ; Ciphertext XOR E(K, Yn)
        # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block
        # ;; Determine if partial block is not being filled and shift mask accordingly
___
  if ($win64) {
    $code .= <<___;
        mov             $PLAIN_CIPH_LEN,$IA1
        add             $LENGTH,$IA1
___
  } else {
    $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n";
  }
  $code .= <<___;
        sub             \$16,$IA1
        jge             .L_no_extra_mask_${rndsuffix}
        sub             $IA1,$IA0
.L_no_extra_mask_${rndsuffix}:
        # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1
        # ;; sizeof(SHIFT_MASK) == 16 bytes
        vmovdqu64       16($IA0),$XTMP0
        vpand           $XTMP0,$XTMP1,$XTMP1
___

  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        vpand           $XTMP0,$XTMP4,$XTMP4
        vpshufb         SHUF_MASK(%rip),$XTMP4,$XTMP4
        vpshufb         $XTMP3,$XTMP4,$XTMP4
        vpxorq          $XTMP4,$AAD_HASH,$AAD_HASH
___
  } else {
    $code .= <<___;
        vpshufb         SHUF_MASK(%rip),$XTMP1,$XTMP1
        vpshufb         $XTMP3,$XTMP1,$XTMP1
        vpxorq          $XTMP1,$AAD_HASH,$AAD_HASH
___
  }
  $code .= <<___;
        cmp             \$0,$IA1
        jl              .L_partial_incomplete_${rndsuffix}
___

  # ;; GHASH computation for the last <16-byte block
  &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);

  $code .= <<___;
        movq            \$0, ($PBLOCK_LEN)
        # ;; Set $LENGTH to be the number of bytes to write out
        mov             $LENGTH,$IA0
        mov             \$16,$LENGTH
        sub             $IA0,$LENGTH
        jmp             .L_enc_dec_done_${rndsuffix}

.L_partial_incomplete_${rndsuffix}:
___
  if ($win64) {
    $code .= <<___;
        mov             $PLAIN_CIPH_LEN,$IA0
        add             $IA0,($PBLOCK_LEN)
___
  } else {
    $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n";
  }
  $code .= <<___;
        mov             $PLAIN_CIPH_LEN,$LENGTH

.L_enc_dec_done_${rndsuffix}:
        # ;; output encrypted bytes

        lea             byte_len_to_mask_table(%rip),$IA0
        kmovw           ($IA0,$LENGTH,2),$MASKREG
        vmovdqu64       $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)
___

  if ($ENC_DEC eq "ENC") {
    $code .= <<___;
        # ;; shuffle XTMP1 back to output as ciphertext
        vpshufb         SHUF_MASK(%rip),$XTMP1,$XTMP1
        vpshufb         $XTMP3,$XTMP1,$XTMP1
___
  }
  $code .= <<___;
        mov             $CIPH_PLAIN_OUT,$IA0
        vmovdqu8        $XTMP1,($IA0){$MASKREG}
.L_partial_block_done_${rndsuffix}:
___
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation
sub INITIAL_BLOCKS_PARTIAL_CIPHER {
  my $AES_KEYS        = $_[0];     # [in] key pointer
  my $GCM128_CTX      = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT  = $_[2];     # [in] text output pointer
  my $PLAIN_CIPH_IN   = $_[3];     # [in] text input pointer
  my $LENGTH          = $_[4];     # [in/clobbered] length in bytes
  my $DATA_OFFSET     = $_[5];     # [in/out] current data offset (updated)
  my $NUM_BLOCKS      = $_[6];     # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $CTR             = $_[7];     # [in/out] current counter value
  my $ENC_DEC         = $_[8];     # [in] cipher direction (ENC/DEC)
  my $DAT0            = $_[9];     # [out] ZMM with cipher text shuffled for GHASH
  my $DAT1            = $_[10];    # [out] ZMM with cipher text shuffled for GHASH
  my $DAT2            = $_[11];    # [out] ZMM with cipher text shuffled for GHASH
  my $DAT3            = $_[12];    # [out] ZMM with cipher text shuffled for GHASH
  my $LAST_CIPHER_BLK = $_[13];    # [out] XMM to put ciphered counter block partially xor'ed with text
  my $LAST_GHASH_BLK  = $_[14];    # [out] XMM to put last cipher text block shuffled for GHASH
  my $CTR0            = $_[15];    # [clobbered] ZMM temporary
  my $CTR1            = $_[16];    # [clobbered] ZMM temporary
  my $CTR2            = $_[17];    # [clobbered] ZMM temporary
  my $CTR3            = $_[18];    # [clobbered] ZMM temporary
  my $ZT1             = $_[19];    # [clobbered] ZMM temporary
  my $IA0             = $_[20];    # [clobbered] GP temporary
  my $IA1             = $_[21];    # [clobbered] GP temporary
  my $MASKREG         = $_[22];    # [clobbered] mask register
  my $SHUFMASK        = $_[23];    # [out] ZMM loaded with BE/LE shuffle mask

  if ($NUM_BLOCKS == 1) {
    $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n";
  } elsif ($NUM_BLOCKS == 2) {
    $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n";
  } else {
    $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n";
  }

  # ;; prepare AES counter blocks
  if ($NUM_BLOCKS == 1) {
    $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n";
  } elsif ($NUM_BLOCKS == 2) {
    $code .= <<___;
        vshufi64x2      \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]}
        vpaddd          ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]}
___
  } else {
    $code .= <<___;
        vshufi64x2      \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]}
        vpaddd          ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0
___
    if ($NUM_BLOCKS > 4) {
      $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n";
    }
    if ($NUM_BLOCKS > 8) {
      $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n";
    }
    if ($NUM_BLOCKS > 12) {
      $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n";
    }
  }
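  # ;; Note: the vshufi64x2 broadcasts the current counter block to all
  # ;; 128-bit lanes, then the ddq_add_* constants bump the lanes by
  # ;; +1..+4 and +5..+8 (plus +8 per further register), yielding up to
  # ;; 16 consecutive counter blocks; they are byte-swapped for AES
  # ;; input just below.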

  # ;; get load/store mask
  $code .= <<___;
        lea             byte64_len_to_mask_table(%rip),$IA0
        mov             $LENGTH,$IA1
___
  if ($NUM_BLOCKS > 12) {
    $code .= "sub \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub \$`1*64`,$IA1\n";
  }
  $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";

  # ;; extract new counter value
  # ;; shuffle the counters for AES rounds
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n";
  }
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0,
    $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);

  # ;; load plain/cipher text
  &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG);

  # ;; AES rounds and XOR with plain/cipher text
  foreach my $j (0 .. ($NROUNDS + 1)) {
    $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n";
    &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j,
      $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS);
  }
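  # ;; Note: the loop issues $NROUNDS + 2 key broadcasts; round 0 is the
  # ;; initial key whitening and the last iteration the final round.
  # ;; ZMM_AESENC_ROUND_BLOCKS_0_16 (defined earlier in this file) picks
  # ;; the matching instruction from the round index $j and $NROUNDS.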

  # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  # ;; - this is needed for partial block cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n";
  }

  # ;; write cipher/plain text back to output
  $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
  &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG);

  # ;; zero bytes outside the mask before hashing
  if ($NUM_BLOCKS <= 4) {
    $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n";
  } else {
    $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n";
  }

  # ;; Shuffle the cipher text blocks for hashing part
  # ;; DAT0-DAT3 are expected outputs with blocks for hashing
  if ($ENC_DEC eq "DEC") {

    # ;; Decrypt case
    # ;; - cipher blocks are in DAT0-DAT3
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0,
      $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  } else {

    # ;; Encrypt case
    # ;; - cipher blocks are in CTR0-CTR3
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0,
      $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  }

  # ;; Extract the last block for partials and multi_call cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n";
  }

}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; Computes GHASH on 1 to 16 blocks
sub INITIAL_BLOCKS_PARTIAL_GHASH {
  my $AES_KEYS        = $_[0];     # [in] key pointer
  my $GCM128_CTX      = $_[1];     # [in] context pointer
  my $LENGTH          = $_[2];     # [in/clobbered] length in bytes
  my $NUM_BLOCKS      = $_[3];     # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $HASH_IN_OUT     = $_[4];     # [in/out] XMM ghash in/out value
  my $ENC_DEC         = $_[5];     # [in] cipher direction (ENC/DEC)
  my $DAT0            = $_[6];     # [in] ZMM with cipher text shuffled for GHASH
  my $DAT1            = $_[7];     # [in] ZMM with cipher text shuffled for GHASH
  my $DAT2            = $_[8];     # [in] ZMM with cipher text shuffled for GHASH
  my $DAT3            = $_[9];     # [in] ZMM with cipher text shuffled for GHASH
  my $LAST_CIPHER_BLK = $_[10];    # [in] XMM with ciphered counter block partially xor'ed with text
  my $LAST_GHASH_BLK  = $_[11];    # [in] XMM with last cipher text block shuffled for GHASH
  my $ZT0             = $_[12];    # [clobbered] ZMM temporary
  my $ZT1             = $_[13];    # [clobbered] ZMM temporary
  my $ZT2             = $_[14];    # [clobbered] ZMM temporary
  my $ZT3             = $_[15];    # [clobbered] ZMM temporary
  my $ZT4             = $_[16];    # [clobbered] ZMM temporary
  my $ZT5             = $_[17];    # [clobbered] ZMM temporary
  my $ZT6             = $_[18];    # [clobbered] ZMM temporary
  my $ZT7             = $_[19];    # [clobbered] ZMM temporary
  my $ZT8             = $_[20];    # [clobbered] ZMM temporary
  my $PBLOCK_LEN      = $_[21];    # [in] partial block length
  my $GH              = $_[22];    # [in] ZMM with hi product part
  my $GM              = $_[23];    # [in] ZMM with mid product part
  my $GL              = $_[24];    # [in] ZMM with lo product part

  my $rndsuffix = &random_string();

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; - Hash all but the last partial block of data
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  # ;; update data offset
  if ($NUM_BLOCKS > 1) {

    # ;; The final block of data may be <16B
    $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n";
  }

  if ($NUM_BLOCKS < 16) {
    $code .= <<___;
        # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
        # ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
        cmp             \$16,$LENGTH
        jl              .L_small_initial_partial_block_${rndsuffix}

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;;; Handle a full length final block - encrypt and hash all blocks
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        sub             \$16,$LENGTH
        movq            \$0,($PBLOCK_LEN)
___

    # ;; Hash all of the data
    if (scalar(@_) == 22) {

      # ;; start GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS);
    } elsif (scalar(@_) == 25) {

      # ;; continue GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL);
    }
    $code .= "jmp .L_small_initial_compute_done_${rndsuffix}\n";
  }

  $code .= <<___;
.L_small_initial_partial_block_${rndsuffix}:

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;;; Handle ghash for a <16B final block
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        # ;; As it's an init / update / finalize series we need to leave the
        # ;; last block if it's less than a full block of data.

        mov             $LENGTH,($PBLOCK_LEN)
        vmovdqu64       $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)
___

  my $k                  = ($NUM_BLOCKS - 1);
  my $last_block_to_hash = 1;
  if ($NUM_BLOCKS > $last_block_to_hash) {

    # ;; $ZT0-$ZT8 - temporary registers
    if (scalar(@_) == 22) {

      # ;; start GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k);
    } elsif (scalar(@_) == 25) {

      # ;; continue GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL);
    }

    # ;; just fall through, no jmp needed
  } else {

    if (scalar(@_) == 25) {
      $code .= <<___;
        # ;; Reduction is required in this case.
        # ;; Integrate GM into GH and GL.
        vpsrldq         \$8,$GM,$ZT0
        vpslldq         \$8,$GM,$ZT1
        vpxorq          $ZT0,$GH,$GH
        vpxorq          $ZT1,$GL,$GL
___

      # ;; Add GH and GL 128-bit words horizontally
      &VHPXORI4x128($GH, $ZT0);
      &VHPXORI4x128($GL, $ZT1);

      # ;; 256-bit to 128-bit reduction
      $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n";
      &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
    }
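    # ;; Note: VHPXORI4x128 folds the four 128-bit lanes of each sum
    # ;; into lane 0, and VCLMUL_REDUCE then reduces the resulting
    # ;; 256-bit value (GH:GL) modulo the GHASH polynomial using the
    # ;; POLY2 constant in two carry-less multiplication phases.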
    $code .= <<___;
        # ;; Record that a reduction is not needed -
        # ;; In this case no hashes are computed because there
        # ;; is only one initial block and it is < 16B in length.
        # ;; We only need to check if a reduction is needed if
        # ;; initial_blocks == 1 and init/update/final is being used.
        # ;; In this case we may just have a partial block, and that
        # ;; gets hashed in finalize.

        # ;; The hash should end up in HASH_IN_OUT.
        # ;; The only way we should get here is if there is
        # ;; a partial block of data, so xor that into the hash.
        vpxorq          $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT
        # ;; The result is in $HASH_IN_OUT
        jmp             .L_after_reduction_${rndsuffix}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; After GHASH reduction
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  $code .= ".L_small_initial_compute_done_${rndsuffix}:\n";

  # ;; If using init/update/finalize, we need to xor any partial block data
  # ;; into the hash.
  if ($NUM_BLOCKS > 1) {

    # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place
    if ($NUM_BLOCKS != 16) {
      $code .= <<___;
        # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH (stored in [PBlockLen]) is never zero
        or              $LENGTH,$LENGTH
        je              .L_after_reduction_${rndsuffix}
___
    }
    $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n";
  }

  $code .= ".L_after_reduction_${rndsuffix}:\n";

  # ;; Final hash is now in HASH_IN_OUT
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
# ;; It may look similar to INITIAL_BLOCKS but its usage is different:
# ;; - first encrypts/decrypts required number of blocks and then
# ;;   ghashes these blocks
# ;; - Small packets or left over data chunks (<256 bytes)
# ;; - Remaining data chunks below 256 bytes (multi buffer code)
# ;;
# ;; num_initial_blocks is expected to include the partial final block
# ;; in the count.
sub INITIAL_BLOCKS_PARTIAL {
  my $AES_KEYS        = $_[0];     # [in] key pointer
  my $GCM128_CTX      = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT  = $_[2];     # [in] text output pointer
  my $PLAIN_CIPH_IN   = $_[3];     # [in] text input pointer
  my $LENGTH          = $_[4];     # [in/clobbered] length in bytes
  my $DATA_OFFSET     = $_[5];     # [in/out] current data offset (updated)
  my $NUM_BLOCKS      = $_[6];     # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $CTR             = $_[7];     # [in/out] current counter value
  my $HASH_IN_OUT     = $_[8];     # [in/out] XMM ghash in/out value
  my $ENC_DEC         = $_[9];     # [in] cipher direction (ENC/DEC)
  my $CTR0            = $_[10];    # [clobbered] ZMM temporary
  my $CTR1            = $_[11];    # [clobbered] ZMM temporary
  my $CTR2            = $_[12];    # [clobbered] ZMM temporary
  my $CTR3            = $_[13];    # [clobbered] ZMM temporary
  my $DAT0            = $_[14];    # [clobbered] ZMM temporary
  my $DAT1            = $_[15];    # [clobbered] ZMM temporary
  my $DAT2            = $_[16];    # [clobbered] ZMM temporary
  my $DAT3            = $_[17];    # [clobbered] ZMM temporary
  my $LAST_CIPHER_BLK = $_[18];    # [clobbered] ZMM temporary
  my $LAST_GHASH_BLK  = $_[19];    # [clobbered] ZMM temporary
  my $ZT0             = $_[20];    # [clobbered] ZMM temporary
  my $ZT1             = $_[21];    # [clobbered] ZMM temporary
  my $ZT2             = $_[22];    # [clobbered] ZMM temporary
  my $ZT3             = $_[23];    # [clobbered] ZMM temporary
  my $ZT4             = $_[24];    # [clobbered] ZMM temporary
  my $IA0             = $_[25];    # [clobbered] GP temporary
  my $IA1             = $_[26];    # [clobbered] GP temporary
  my $MASKREG         = $_[27];    # [clobbered] mask register
  my $SHUFMASK        = $_[28];    # [clobbered] ZMM for BE/LE shuffle mask
  my $PBLOCK_LEN      = $_[29];    # [in] partial block length

  &INITIAL_BLOCKS_PARTIAL_CIPHER(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
    $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR,
    $ENC_DEC, $DAT0, $DAT1, $DAT2,
    $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0,
    $CTR1, $CTR2, $CTR3, $ZT0,
    $IA0, $IA1, $MASKREG, $SHUFMASK);

  &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0,
    $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK),
    &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN);
}

# ;; ===========================================================================
# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
# ;; followed with GHASH of the N blocks.
sub GHASH_16_ENCRYPT_N_GHASH_N {
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $GCM128_CTX         = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT     = $_[2];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[3];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[4];     # [in] data offset
  my $LENGTH             = $_[5];     # [in] data length
  my $CTR_BE             = $_[6];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[7];     # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET     = $_[8];     # [in] numerical offset for the highest hash key
                                      # (can be in form of register or numerical value)
  my $GHASHIN_BLK_OFFSET = $_[9];     # [in] numerical offset for GHASH blocks in
  my $SHFMSK             = $_[10];    # [in] ZMM with byte swap mask for pshufb
  my $B00_03             = $_[11];    # [clobbered] temporary ZMM
  my $B04_07             = $_[12];    # [clobbered] temporary ZMM
  my $B08_11             = $_[13];    # [clobbered] temporary ZMM
  my $B12_15             = $_[14];    # [clobbered] temporary ZMM
  my $GH1H_UNUSED        = $_[15];    # [clobbered] temporary ZMM
  my $GH1L               = $_[16];    # [clobbered] temporary ZMM
  my $GH1M               = $_[17];    # [clobbered] temporary ZMM
  my $GH1T               = $_[18];    # [clobbered] temporary ZMM
  my $GH2H               = $_[19];    # [clobbered] temporary ZMM
  my $GH2L               = $_[20];    # [clobbered] temporary ZMM
  my $GH2M               = $_[21];    # [clobbered] temporary ZMM
  my $GH2T               = $_[22];    # [clobbered] temporary ZMM
  my $GH3H               = $_[23];    # [clobbered] temporary ZMM
  my $GH3L               = $_[24];    # [clobbered] temporary ZMM
  my $GH3M               = $_[25];    # [clobbered] temporary ZMM
  my $GH3T               = $_[26];    # [clobbered] temporary ZMM
  my $AESKEY1            = $_[27];    # [clobbered] temporary ZMM
  my $AESKEY2            = $_[28];    # [clobbered] temporary ZMM
  my $GHKEY1             = $_[29];    # [clobbered] temporary ZMM
  my $GHKEY2             = $_[30];    # [clobbered] temporary ZMM
  my $GHDAT1             = $_[31];    # [clobbered] temporary ZMM
  my $GHDAT2             = $_[32];    # [clobbered] temporary ZMM
  my $ZT01               = $_[33];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[34];    # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234         = $_[35];    # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $GHASH_TYPE         = $_[36];    # [in] "start", "start_reduce", "mid", "end_reduce"
  my $TO_REDUCE_L        = $_[37];    # [in] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[38];    # [in] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[39];    # [in] ZMM for medium 4x128-bit GHASH sum
  my $ENC_DEC            = $_[40];    # [in] cipher direction
  my $HASH_IN_OUT        = $_[41];    # [in/out] XMM ghash in/out value
  my $IA0                = $_[42];    # [clobbered] GP temporary
  my $IA1                = $_[43];    # [clobbered] GP temporary
  my $MASKREG            = $_[44];    # [clobbered] mask register
  my $NUM_BLOCKS         = $_[45];    # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
  my $PBLOCK_LEN         = $_[46];    # [in] partial block length

  die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $rndsuffix = &random_string();

  # ;; $GH1H is mapped onto $HASH_IN_OUT to avoid an additional move in
  # ;; the do_reduction case
  my $GH1H = $HASH_IN_OUT;
  my $LAST_GHASH_BLK  = $GH1L;
  my $LAST_CIPHER_BLK = $GH1T;

  my $RED_POLY = $GH2T;
  my $RED_P1   = $GH2L;
  my $RED_T1   = $GH2H;
  my $RED_T2   = $GH2M;

  my $DATA1 = $GH3H;
  my $DATA2 = $GH3L;
  my $DATA3 = $GH3M;
  my $DATA4 = $GH3T;

  # ;; do reduction after the 16 blocks ?
  my $do_reduction = 0;

  # ;; is 16 block chunk a start?
  my $is_start = 0;

  if ($GHASH_TYPE eq "start_reduce") {
    $is_start     = 1;
    $do_reduction = 1;
  }

  if ($GHASH_TYPE eq "start") {
    $is_start = 1;
  }

  if ($GHASH_TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; - get load/store mask
  # ;; - load plain/cipher text
  $code .= <<___;
        lea             byte64_len_to_mask_table(%rip),$IA0
        mov             $LENGTH,$IA1
___
  if ($NUM_BLOCKS > 12) {
    $code .= "sub \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub \$`1*64`,$IA1\n";
  }
  $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; prepare counter blocks

  $code .= <<___;
        cmp             \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
        jae             .L_16_blocks_overflow_${rndsuffix}
___
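  # ;; Note: the counter blocks are kept byte-reflected so vpaddd can
  # ;; increment them directly; that shortcut is only valid while the
  # ;; least significant counter byte does not wrap within the next
  # ;; $NUM_BLOCKS increments, hence the compare against
  # ;; 256 - $NUM_BLOCKS. The overflow path below swaps to
  # ;; little-endian, increments, and swaps back.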

  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
    $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
  $code .= <<___;
        jmp             .L_16_blocks_ok_${rndsuffix}

.L_16_blocks_overflow_${rndsuffix}:
        vpshufb         $SHFMSK,$CTR_BE,$CTR_BE
        vpaddd          ddq_add_1234(%rip),$CTR_BE,$B00_03
___
  if ($NUM_BLOCKS > 4) {
    $code .= <<___;
        vmovdqa64       ddq_add_4444(%rip),$B12_15
        vpaddd          $B12_15,$B00_03,$B04_07
___
  }
  if ($NUM_BLOCKS > 8) {
    $code .= "vpaddd $B12_15,$B04_07,$B08_11\n";
  }
  if ($NUM_BLOCKS > 12) {
    $code .= "vpaddd $B12_15,$B08_11,$B12_15\n";
  }
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  $code .= <<___;
.L_16_blocks_ok_${rndsuffix}:

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; - pre-load constants
        # ;; - add current hash into the 1st block
        vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1
___
  if ($is_start != 0) {
    $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
  } else {
    $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  }

  $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; save counter for the next round
  # ;; increment counter overflow check register
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
  }
  $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; pre-load constants
        vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
        vmovdqu64       @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
        vmovdqa64       `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; stitch AES rounds with GHASH

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 0 - ARK

  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (15 to 12)
        vpclmulqdq      \$0x11,$GHKEY1,$GHDAT1,$GH1H    # ; a1*b1
        vpclmulqdq      \$0x00,$GHKEY1,$GHDAT1,$GH1L    # ; a0*b0
        vpclmulqdq      \$0x01,$GHKEY1,$GHDAT1,$GH1M    # ; a1*b0
        vpclmulqdq      \$0x10,$GHKEY1,$GHDAT1,$GH1T    # ; a0*b1
        vmovdqu64       @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
        vmovdqa64       `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 1
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (11 to 8)
        vpclmulqdq      \$0x10,$GHKEY2,$GHDAT2,$GH2M    # ; a0*b1
        vpclmulqdq      \$0x01,$GHKEY2,$GHDAT2,$GH2T    # ; a1*b0
        vpclmulqdq      \$0x11,$GHKEY2,$GHDAT2,$GH2H    # ; a1*b1
        vpclmulqdq      \$0x00,$GHKEY2,$GHDAT2,$GH2L    # ; a0*b0
        vmovdqu64       @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
        vmovdqa64       `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 2
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (7 to 4)
        vpclmulqdq      \$0x10,$GHKEY1,$GHDAT1,$GH3M    # ; a0*b1
        vpclmulqdq      \$0x01,$GHKEY1,$GHDAT1,$GH3T    # ; a1*b0
        vpclmulqdq      \$0x11,$GHKEY1,$GHDAT1,$GH3H    # ; a1*b1
        vpclmulqdq      \$0x00,$GHKEY1,$GHDAT1,$GH3L    # ; a0*b0
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 3
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; Gather (XOR) GHASH for 12 blocks
        vpternlogq      \$0x96,$GH3H,$GH2H,$GH1H
        vpternlogq      \$0x96,$GH3L,$GH2L,$GH1L
        vpternlogq      \$0x96,$GH3T,$GH2T,$GH1T
        vpternlogq      \$0x96,$GH3M,$GH2M,$GH1M
___
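  # ;; Note: vpternlogq with imm8 0x96 computes a three-way XOR
  # ;; (dst = dst ^ src1 ^ src2), so each instruction above folds the
  # ;; partial products of three 4-block groups into one accumulator.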

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 4
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; load plain/cipher text
  &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 5
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (3 to 0)
        vpclmulqdq      \$0x10,$GHKEY2,$GHDAT2,$GH2M    # ; a0*b1
        vpclmulqdq      \$0x01,$GHKEY2,$GHDAT2,$GH2T    # ; a1*b0
        vpclmulqdq      \$0x11,$GHKEY2,$GHDAT2,$GH2H    # ; a1*b1
        vpclmulqdq      \$0x00,$GHKEY2,$GHDAT2,$GH2L    # ; a0*b0
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 6
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n";

  # ;; =================================================
  # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
  # ;; - add GH2[MTLH] to GH1[MTLH]
  $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n";
  if ($do_reduction != 0) {

    if ($is_start != 0) {
      $code .= "vpxorq $GH2M,$GH1M,$GH1M\n";
    } else {
      $code .= <<___;
        vpternlogq      \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
        vpternlogq      \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
        vpternlogq      \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
___
    }

  } else {

    # ;; Update H/M/L hash sums if not carrying reduction
    if ($is_start != 0) {
      $code .= <<___;
        vpxorq          $GH2H,$GH1H,$TO_REDUCE_H
        vpxorq          $GH2L,$GH1L,$TO_REDUCE_L
        vpxorq          $GH2M,$GH1M,$TO_REDUCE_M
___
    } else {
      $code .= <<___;
        vpternlogq      \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
        vpternlogq      \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
        vpternlogq      \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
___
    }

  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 7
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n";

  # ;; =================================================
  # ;; prepare mid sum for adding to high & low
  # ;; load polynomial constant for reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpsrldq         \$8,$GH1M,$GH2M
        vpslldq         \$8,$GH1M,$GH1M

        vmovdqa64       POLY2(%rip),@{[XWORD($RED_POLY)]}
___
  }
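  # ;; Note: the 128-bit middle products straddle the two 64-bit halves
  # ;; of the 256-bit result, so the mid sum is split: its top half
  # ;; (vpsrldq) joins the high sum and its bottom half (vpslldq) joins
  # ;; the low sum before reduction.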

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 8
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n";

  # ;; =================================================
  # ;; Add mid product to high and low
  if ($do_reduction != 0) {
    if ($is_start != 0) {
      $code .= <<___;
        vpternlogq      \$0x96,$GH2M,$GH2H,$GH1H        # ; TH = TH1 + TH2 + TM>>64
        vpternlogq      \$0x96,$GH1M,$GH2L,$GH1L        # ; TL = TL1 + TL2 + TM<<64
___
    } else {
      $code .= <<___;
        vpxorq          $GH2M,$GH1H,$GH1H               # ; TH = TH1 + TM>>64
        vpxorq          $GH1M,$GH1L,$GH1L               # ; TL = TL1 + TM<<64
___
    }
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 9
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);

  # ;; =================================================
  # ;; horizontal xor of low and high 4x128
  if ($do_reduction != 0) {
    &VHPXORI4x128($GH1H, $GH2H);
    &VHPXORI4x128($GH1L, $GH2L);
  }

  if ($NROUNDS >= 11) {
    $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  }

  # ;; =================================================
  # ;; first phase of reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpclmulqdq      \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
        vpslldq         \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]}     # ; shift-L 2 DWs
        vpxorq          @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]}       # ; first phase of the reduction
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  # ;; AES128 is done
  if ($NROUNDS >= 11) {
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
      $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
    $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n";

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
      $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
    if ($NROUNDS == 13) {
      $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n";

      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
        $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
      $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n";

      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
        $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
    }
  }

  # ;; =================================================
  # ;; second phase of the reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpclmulqdq      \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
        vpsrldq         \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]}     # ; shift-R 1-DW to obtain 2-DWs shift-R
        vpclmulqdq      \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
        vpslldq         \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]}     # ; shift-L 1-DW for result without shifts
        # ;; GH1H = GH1H + RED_T1 + RED_T2
        vpternlogq      \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
  }
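  # ;; Note (recap): the two clmul phases above implement the standard
  # ;; 256-bit to 128-bit GHASH reduction with the POLY2 constant: phase
  # ;; one folds the low 128 bits upwards, phase two produces the two
  # ;; remaining partial terms, and the final vpternlogq (3-way XOR)
  # ;; leaves the reduced hash in GH1H.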

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; the last AES round
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; XOR against plain/cipher text
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  # ;; - this is needed for partial block cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; store cipher/plain text
  $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
  &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);

  # ;; =================================================
  # ;; shuffle cipher text blocks for GHASH computation
  if ($ENC_DEC eq "ENC") {

    # ;; zero bytes outside the mask before hashing
    if ($NUM_BLOCKS <= 4) {
      $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 8) {
      $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 12) {
      $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n";
    } else {
      $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n";
    }

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03,
      $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  } else {

    # ;; zero bytes outside the mask before hashing
    if ($NUM_BLOCKS <= 4) {
      $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 8) {
      $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 12) {
      $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n";
    } else {
      $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n";
    }

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1,
      $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  }

  # ;; =================================================
  # ;; Extract the last block for partial / multi_call cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
  }

  if ($do_reduction != 0) {

    # ;; GH1H holds reduced hash value
    # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
    # ;; - register rename trick obsoletes the above move
  }

  # ;; =================================================
  # ;; GHASH last N blocks
  # ;; - current hash value in HASH_IN_OUT or
  # ;;   product parts in TO_REDUCE_H/M/L
  # ;; - DATA1-DATA4 include blocks for GHASH

  if ($do_reduction == 0) {
    &INITIAL_BLOCKS_PARTIAL_GHASH(
      $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
      &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
      $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
      $B00_03, $B04_07, $B08_11, $B12_15,
      $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
      $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M,
      $TO_REDUCE_L);
  } else {
    &INITIAL_BLOCKS_PARTIAL_GHASH(
      $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
      &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
      $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
      $B00_03, $B04_07, $B08_11, $B12_15,
      $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
      $GHKEY1, $PBLOCK_LEN);
  }
}

# ;; ===========================================================================
# ;; ===========================================================================
# ;; Handles the last (1 to 16) blocks of a message: dispatches on the number
# ;; of blocks left and runs the matching GHASH_16_ENCRYPT_N_GHASH_N
# ;; specialization, or GHASH-only processing when no blocks remain.
sub GCM_ENC_DEC_LAST {
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $GCM128_CTX         = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT     = $_[2];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[3];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[4];     # [in] data offset
  my $LENGTH             = $_[5];     # [in/clobbered] data length
  my $CTR_BE             = $_[6];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[7];     # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET     = $_[8];     # [in] numerical offset for the highest hash key
                                      # (can be register or numerical offset)
  my $GHASHIN_BLK_OFFSET = $_[9];     # [in] numerical offset for GHASH blocks in
  my $SHFMSK             = $_[10];    # [in] ZMM with byte swap mask for pshufb
  my $ZT00               = $_[11];    # [clobbered] temporary ZMM
  my $ZT01               = $_[12];    # [clobbered] temporary ZMM
  my $ZT02               = $_[13];    # [clobbered] temporary ZMM
  my $ZT03               = $_[14];    # [clobbered] temporary ZMM
  my $ZT04               = $_[15];    # [clobbered] temporary ZMM
  my $ZT05               = $_[16];    # [clobbered] temporary ZMM
  my $ZT06               = $_[17];    # [clobbered] temporary ZMM
  my $ZT07               = $_[18];    # [clobbered] temporary ZMM
  my $ZT08               = $_[19];    # [clobbered] temporary ZMM
  my $ZT09               = $_[20];    # [clobbered] temporary ZMM
  my $ZT10               = $_[21];    # [clobbered] temporary ZMM
  my $ZT11               = $_[22];    # [clobbered] temporary ZMM
  my $ZT12               = $_[23];    # [clobbered] temporary ZMM
  my $ZT13               = $_[24];    # [clobbered] temporary ZMM
  my $ZT14               = $_[25];    # [clobbered] temporary ZMM
  my $ZT15               = $_[26];    # [clobbered] temporary ZMM
  my $ZT16               = $_[27];    # [clobbered] temporary ZMM
  my $ZT17               = $_[28];    # [clobbered] temporary ZMM
  my $ZT18               = $_[29];    # [clobbered] temporary ZMM
  my $ZT19               = $_[30];    # [clobbered] temporary ZMM
  my $ZT20               = $_[31];    # [clobbered] temporary ZMM
  my $ZT21               = $_[32];    # [clobbered] temporary ZMM
  my $ZT22               = $_[33];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[34];    # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234         = $_[35];    # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $GHASH_TYPE         = $_[36];    # [in] "start", "start_reduce", "mid", "end_reduce"
  my $TO_REDUCE_L        = $_[37];    # [in] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[38];    # [in] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[39];    # [in] ZMM for medium 4x128-bit GHASH sum
  my $ENC_DEC            = $_[40];    # [in] cipher direction
  my $HASH_IN_OUT        = $_[41];    # [in/out] XMM ghash in/out value
  my $IA0                = $_[42];    # [clobbered] GP temporary
  my $IA1                = $_[43];    # [clobbered] GP temporary
  my $MASKREG            = $_[44];    # [clobbered] mask register
  my $PBLOCK_LEN         = $_[45];    # [in] partial block length

  my $rndsuffix = &random_string();

  $code .= <<___;
        mov             @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
        add             \$15,@{[DWORD($IA0)]}
        shr             \$4,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_0_${rndsuffix}

        cmp             \$8,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_8_${rndsuffix}
        jb              .L_last_num_blocks_is_7_1_${rndsuffix}

        cmp             \$12,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_12_${rndsuffix}
        jb              .L_last_num_blocks_is_11_9_${rndsuffix}

        # ;; 16, 15, 14 or 13
        cmp             \$15,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_15_${rndsuffix}
        ja              .L_last_num_blocks_is_16_${rndsuffix}
        cmp             \$14,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_14_${rndsuffix}
        jmp             .L_last_num_blocks_is_13_${rndsuffix}

.L_last_num_blocks_is_11_9_${rndsuffix}:
        # ;; 11, 10 or 9
        cmp             \$10,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_10_${rndsuffix}
        ja              .L_last_num_blocks_is_11_${rndsuffix}
        jmp             .L_last_num_blocks_is_9_${rndsuffix}

.L_last_num_blocks_is_7_1_${rndsuffix}:
        cmp             \$4,@{[DWORD($IA0)]}
        je              .L_last_num_blocks_is_4_${rndsuffix}
        jb              .L_last_num_blocks_is_3_1_${rndsuffix}
        # ;; 7, 6 or 5
        cmp             \$6,@{[DWORD($IA0)]}
        ja              .L_last_num_blocks_is_7_${rndsuffix}
        je              .L_last_num_blocks_is_6_${rndsuffix}
        jmp             .L_last_num_blocks_is_5_${rndsuffix}

.L_last_num_blocks_is_3_1_${rndsuffix}:
        # ;; 3, 2 or 1
        cmp             \$2,@{[DWORD($IA0)]}
        ja              .L_last_num_blocks_is_3_${rndsuffix}
        je              .L_last_num_blocks_is_2_${rndsuffix}
___

  # ;; fall through for `jmp .L_last_num_blocks_is_1`

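  # ;; Note: the branch ladder above reaches any of the 16 targets in a
  # ;; few compares; the loop below then stamps out one specialized
  # ;; GHASH_16_ENCRYPT_N_GHASH_N copy per residual block count, trading
  # ;; code size for branch-free handling of the last chunk.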
  # ;; Use a loop to generate the different block size variants
  # ;; - one block size has to be the first one
  for my $num_blocks (1 .. 16) {
    $code .= ".L_last_num_blocks_is_${num_blocks}_${rndsuffix}:\n";
    &GHASH_16_ENCRYPT_N_GHASH_N(
      $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET,
      $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET,
      $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03,
      $ZT04, $ZT05, $ZT06, $ZT07, $ZT08,
      $ZT09, $ZT10, $ZT11, $ZT12, $ZT13,
      $ZT14, $ZT15, $ZT16, $ZT17, $ZT18,
      $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4,
      $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M,
      $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG,
      $num_blocks, $PBLOCK_LEN);

    $code .= "jmp .L_last_blocks_done_${rndsuffix}\n";
  }

  $code .= ".L_last_num_blocks_is_0_${rndsuffix}:\n";

  # ;; if there are no blocks to cipher then there are only 16 blocks for ghash and reduction
  # ;; - convert mid into end_reduce
  # ;; - convert start into start_reduce
  if ($GHASH_TYPE eq "mid") {
    $GHASH_TYPE = "end_reduce";
  }
  if ($GHASH_TYPE eq "start") {
    $GHASH_TYPE = "start_reduce";
  }

  &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp",
    $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01,
    $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09);

  $code .= ".L_last_blocks_done_${rndsuffix}:\n";
}
2896
2897# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2898# ;; Main GCM macro stitching cipher with GHASH
2899# ;; - operates on single stream
2900# ;; - encrypts 16 blocks at a time
2901# ;; - ghash the 16 previously encrypted ciphertext blocks
2902# ;; - no partial block or multi_call handling here
2903sub GHASH_16_ENCRYPT_16_PARALLEL {
2904 my $AES_KEYS = $_[0]; # [in] key pointer
2905 my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer
2906 my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer
2907 my $DATA_OFFSET = $_[3]; # [in] data offset
2908 my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian
2909 my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check
2910 my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value)
2911 my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out
2912 my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in
2913 my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb
2914 my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher)
2915 my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher)
2916 my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher)
2917 my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher)
2918 my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
2919 my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher)
2920 my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher)
2921 my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher)
2922 my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher)
2923 my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash)
2924 my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash)
2925 my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash)
2926 my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash)
2927 my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash)
2928 my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash)
2929 my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash)
2930 my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash)
2931 my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash)
2932 my $ZT19 = $_[28]; # [clobbered] temporary ZMM
2933 my $ZT20 = $_[29]; # [clobbered] temporary ZMM
2934 my $ZT21 = $_[30]; # [clobbered] temporary ZMM
2935 my $ZT22 = $_[31]; # [clobbered] temporary ZMM
2936 my $ZT23 = $_[32]; # [clobbered] temporary ZMM
2937 my $ADDBE_4x4 = $_[33]; # [in] ZMM with the value 4 in each of its 4x128-bit lanes (big-endian)
2938 my $ADDBE_1234 = $_[34]; # [in] ZMM with values 1, 2, 3 and 4 in its 4x128-bit lanes (big-endian)
2939 my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum
2940 my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum
2941 my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum
2942 my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time"
2943 my $ENC_DEC = $_[39]; # [in] cipher direction
2944 my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
2945 my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
2946 my $IA0 = $_[42]; # [clobbered] temporary GPR
2947
2948 my $B00_03 = $ZT1;
2949 my $B04_07 = $ZT2;
2950 my $B08_11 = $ZT3;
2951 my $B12_15 = $ZT4;
2952
2953 my $GH1H = $ZT5;
2954
2955 # ; @note: do not change this mapping
2956 my $GH1L = $ZT6;
2957 my $GH1M = $ZT7;
2958 my $GH1T = $ZT8;
2959
2960 my $GH2H = $ZT9;
2961 my $GH2L = $ZT10;
2962 my $GH2M = $ZT11;
2963 my $GH2T = $ZT12;
2964
2965 my $RED_POLY = $GH2T;
2966 my $RED_P1 = $GH2L;
2967 my $RED_T1 = $GH2H;
2968 my $RED_T2 = $GH2M;
2969
2970 my $GH3H = $ZT13;
2971 my $GH3L = $ZT14;
2972 my $GH3M = $ZT15;
2973 my $GH3T = $ZT16;
2974
2975 my $DATA1 = $ZT13;
2976 my $DATA2 = $ZT14;
2977 my $DATA3 = $ZT15;
2978 my $DATA4 = $ZT16;
2979
2980 my $AESKEY1 = $ZT17;
2981 my $AESKEY2 = $ZT18;
2982
2983 my $GHKEY1 = $ZT19;
2984 my $GHKEY2 = $ZT20;
2985 my $GHDAT1 = $ZT21;
2986 my $GHDAT2 = $ZT22;
2987
2988 my $rndsuffix = &random_string();
2989
2990 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2991 # ;; prepare counter blocks
2992
2993 $code .= <<___;
2994 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
2995 jae .L_16_blocks_overflow_${rndsuffix}
2996 vpaddd $ADDBE_1234,$CTR_BE,$B00_03
2997 vpaddd $ADDBE_4x4,$B00_03,$B04_07
2998 vpaddd $ADDBE_4x4,$B04_07,$B08_11
2999 vpaddd $ADDBE_4x4,$B08_11,$B12_15
3000 jmp .L_16_blocks_ok_${rndsuffix}
3001.L_16_blocks_overflow_${rndsuffix}:
3002 vpshufb $SHFMSK,$CTR_BE,$CTR_BE
3003 vmovdqa64 ddq_add_4444(%rip),$B12_15
3004 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
3005 vpaddd $B12_15,$B00_03,$B04_07
3006 vpaddd $B12_15,$B04_07,$B08_11
3007 vpaddd $B12_15,$B08_11,$B12_15
3008 vpshufb $SHFMSK,$B00_03,$B00_03
3009 vpshufb $SHFMSK,$B04_07,$B04_07
3010 vpshufb $SHFMSK,$B08_11,$B08_11
3011 vpshufb $SHFMSK,$B12_15,$B12_15
3012.L_16_blocks_ok_${rndsuffix}:
3013___
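  # ;; Note on the counter handling above: the 32-bit counter lives in the
  # ;; last dword of each block in big-endian byte order, and the fast path
  # ;; adds pre-byteswapped constants ($ADDBE_*) with vpaddd. That is only
  # ;; correct while no carry crosses a byte boundary, hence the
  # ;; `cmpb $(256 - 16)` guard; the overflow path swaps to little-endian,
  # ;; increments normally and swaps back. A rough C sketch of the
  # ;; invariant (illustrative only, names not from this file):
  # ;;
  # ;;   /* adding n to a big-endian counter byte-wise is safe iff */
  # ;;   int be_add_ok(uint8_t ctr_low_byte, int n) {
  # ;;       return ctr_low_byte + n <= 255; /* no carry out of byte 0 */
  # ;;   }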
3014
3015 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3016 # ;; pre-load constants
3017 $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
3018 if ($GHASH_IN ne "no_ghash_in") {
3019 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
3020 } else {
3021 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
3022 }
3023
3024 $code .= <<___;
3025 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1
3026
3027 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3028 # ;; save counter for the next round
3029 # ;; increment counter overflow check register
3030 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
3031 addb \$16,@{[BYTE($CTR_CHECK)]}
3032 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3033 # ;; pre-load constants
3034 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
3035 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
3036 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
3037
3038 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3039 # ;; stitch AES rounds with GHASH
3040
3041 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3042 # ;; AES round 0 - ARK
3043
3044 vpxorq $AESKEY1,$B00_03,$B00_03
3045 vpxorq $AESKEY1,$B04_07,$B04_07
3046 vpxorq $AESKEY1,$B08_11,$B08_11
3047 vpxorq $AESKEY1,$B12_15,$B12_15
3048 vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1
3049
3050 # ;; ==================================================
3051 # ;; GHASH 4 blocks (15 to 12)
3052 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
3053 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
3054 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
3055 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
3056 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
3057 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
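	# ;; (each vpclmulqdq immediate selects which 64-bit halves are
	# ;; multiplied - 0x11 hi*hi, 0x00 lo*lo, 0x01/0x10 the cross terms -
	# ;; so the four instructions above form a full 128x128-bit carry-less
	# ;; schoolbook multiply in every 128-bit lane)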
3058
3059 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3060 # ;; AES round 1
3061 vaesenc $AESKEY2,$B00_03,$B00_03
3062 vaesenc $AESKEY2,$B04_07,$B04_07
3063 vaesenc $AESKEY2,$B08_11,$B08_11
3064 vaesenc $AESKEY2,$B12_15,$B12_15
3065 vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2
3066
3067 # ;; =================================================
3068 # ;; GHASH 4 blocks (11 to 8)
3069 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
3070 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
3071 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
3072 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
3073 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
3074 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
3075
3076 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3077 # ;; AES round 2
3078 vaesenc $AESKEY1,$B00_03,$B00_03
3079 vaesenc $AESKEY1,$B04_07,$B04_07
3080 vaesenc $AESKEY1,$B08_11,$B08_11
3081 vaesenc $AESKEY1,$B12_15,$B12_15
3082 vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1
3083
3084 # ;; =================================================
3085 # ;; GHASH 4 blocks (7 to 4)
3086 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
3087 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
3088 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
3089 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
3090 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3091 # ;; AES round 3
3092 vaesenc $AESKEY2,$B00_03,$B00_03
3093 vaesenc $AESKEY2,$B04_07,$B04_07
3094 vaesenc $AESKEY2,$B08_11,$B08_11
3095 vaesenc $AESKEY2,$B12_15,$B12_15
3096 vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2
3097
3098 # ;; =================================================
3099 # ;; Gather (XOR) GHASH for 12 blocks
3100 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
3101 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
3102 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
3103 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
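	# ;; (imm8 0x96 encodes the three-way XOR truth table, so each
	# ;; vpternlogq above folds two partial-product accumulators into a
	# ;; third with a single instruction)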
3104
3105 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3106 # ;; AES round 4
3107 vaesenc $AESKEY1,$B00_03,$B00_03
3108 vaesenc $AESKEY1,$B04_07,$B04_07
3109 vaesenc $AESKEY1,$B08_11,$B08_11
3110 vaesenc $AESKEY1,$B12_15,$B12_15
3111 vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1
3112
3113 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3114 # ;; load plain/cipher text (recycle GH3xx registers)
3115 vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
3116 vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
3117 vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
3118 vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4
3119
3120 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3121 # ;; AES round 5
3122 vaesenc $AESKEY2,$B00_03,$B00_03
3123 vaesenc $AESKEY2,$B04_07,$B04_07
3124 vaesenc $AESKEY2,$B08_11,$B08_11
3125 vaesenc $AESKEY2,$B12_15,$B12_15
3126 vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2
3127
3128 # ;; =================================================
3129 # ;; GHASH 4 blocks (3 to 0)
3130 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
3131 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
3132 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
3133 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
3134 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3135 # ;; AES round 6
3136 vaesenc $AESKEY1,$B00_03,$B00_03
3137 vaesenc $AESKEY1,$B04_07,$B04_07
3138 vaesenc $AESKEY1,$B08_11,$B08_11
3139 vaesenc $AESKEY1,$B12_15,$B12_15
3140 vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1
3141___
3142
3143 # ;; =================================================
3144 # ;; gather GHASH in GH1L (low) and GH1H (high)
3145 if ($DO_REDUCTION eq "first_time") {
3146 $code .= <<___;
3147 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3148 vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM
3149 vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH
3150 vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL
3151___
3152 }
3153 if ($DO_REDUCTION eq "no_reduction") {
3154 $code .= <<___;
3155 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3156 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM
3157 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH
3158 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL
3159___
3160 }
3161 if ($DO_REDUCTION eq "final_reduction") {
3162 $code .= <<___;
3163 # ;; phase 1: add mid products together
3164 # ;; also load polynomial constant for reduction
3165 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3166 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
3167
3168 vpsrldq \$8,$GH1M,$GH2M
3169 vpslldq \$8,$GH1M,$GH1M
3170
3171 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
3172___
3173 }
3174
3175 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3176 # ;; AES round 7
3177 $code .= <<___;
3178 vaesenc $AESKEY2,$B00_03,$B00_03
3179 vaesenc $AESKEY2,$B04_07,$B04_07
3180 vaesenc $AESKEY2,$B08_11,$B08_11
3181 vaesenc $AESKEY2,$B12_15,$B12_15
3182 vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2
3183___
3184
3185 # ;; =================================================
3186 # ;; Add mid product to high and low
3187 if ($DO_REDUCTION eq "final_reduction") {
3188 $code .= <<___;
3189 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
3190 vpxorq $TO_REDUCE_H,$GH1H,$GH1H
3191 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
3192 vpxorq $TO_REDUCE_L,$GH1L,$GH1L
3193___
3194 }
3195
3196 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3197 # ;; AES round 8
3198 $code .= <<___;
3199 vaesenc $AESKEY1,$B00_03,$B00_03
3200 vaesenc $AESKEY1,$B04_07,$B04_07
3201 vaesenc $AESKEY1,$B08_11,$B08_11
3202 vaesenc $AESKEY1,$B12_15,$B12_15
3203 vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1
3204___
3205
3206 # ;; =================================================
3207 # ;; horizontal xor of low and high 4x128
3208 if ($DO_REDUCTION eq "final_reduction") {
3209 &VHPXORI4x128($GH1H, $GH2H);
3210 &VHPXORI4x128($GH1L, $GH2L);
3211 }
3212
3213 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3214 # ;; AES round 9
3215 $code .= <<___;
3216 vaesenc $AESKEY2,$B00_03,$B00_03
3217 vaesenc $AESKEY2,$B04_07,$B04_07
3218 vaesenc $AESKEY2,$B08_11,$B08_11
3219 vaesenc $AESKEY2,$B12_15,$B12_15
3220___
3221 if (($NROUNDS >= 11)) {
3222 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
3223 }
3224
3225 # ;; =================================================
3226 # ;; first phase of reduction
3227 if ($DO_REDUCTION eq "final_reduction") {
3228 $code .= <<___;
3229 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
3230 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
3231 vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
3232___
3233 }
3234
3235 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3236 # ;; AES rounds up to 11 (AES192) or 13 (AES256)
3237 # ;; AES128 is done
3238 if (($NROUNDS >= 11)) {
3239 $code .= <<___;
3240 vaesenc $AESKEY1,$B00_03,$B00_03
3241 vaesenc $AESKEY1,$B04_07,$B04_07
3242 vaesenc $AESKEY1,$B08_11,$B08_11
3243 vaesenc $AESKEY1,$B12_15,$B12_15
3244 vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1
3245
3246 vaesenc $AESKEY2,$B00_03,$B00_03
3247 vaesenc $AESKEY2,$B04_07,$B04_07
3248 vaesenc $AESKEY2,$B08_11,$B08_11
3249 vaesenc $AESKEY2,$B12_15,$B12_15
3250___
3251 if (($NROUNDS == 13)) {
3252 $code .= <<___;
3253 vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2
3254
3255 vaesenc $AESKEY1,$B00_03,$B00_03
3256 vaesenc $AESKEY1,$B04_07,$B04_07
3257 vaesenc $AESKEY1,$B08_11,$B08_11
3258 vaesenc $AESKEY1,$B12_15,$B12_15
3259 vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1
3260
3261 vaesenc $AESKEY2,$B00_03,$B00_03
3262 vaesenc $AESKEY2,$B04_07,$B04_07
3263 vaesenc $AESKEY2,$B08_11,$B08_11
3264 vaesenc $AESKEY2,$B12_15,$B12_15
3265___
3266 }
3267 }
3268
3269 # ;; =================================================
3270 # ;; second phase of the reduction
3271 if ($DO_REDUCTION eq "final_reduction") {
3272 $code .= <<___;
3273 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
3274 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R by 1 DW (2-DW shift-R overall)
3275 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
3276 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L by 1 DW to align the result
3277 # ;; GH1H = GH1H xor RED_T1 xor RED_T2
3278 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
3279___
3280 }
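  # ;; (the two pclmul/shift pairs in the phases above implement the
  # ;; standard two-phase reduction of the 256-bit carry-less product
  # ;; modulo the GCM polynomial g(x) = x^128 + x^7 + x^2 + x + 1,
  # ;; driven by the pre-shifted constant POLY2)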
3281
3282 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3283 # ;; the last AES round
3284 $code .= <<___;
3285 vaesenclast $AESKEY1,$B00_03,$B00_03
3286 vaesenclast $AESKEY1,$B04_07,$B04_07
3287 vaesenclast $AESKEY1,$B08_11,$B08_11
3288 vaesenclast $AESKEY1,$B12_15,$B12_15
3289
3290 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3291 # ;; XOR against plain/cipher text
3292 vpxorq $DATA1,$B00_03,$B00_03
3293 vpxorq $DATA2,$B04_07,$B04_07
3294 vpxorq $DATA3,$B08_11,$B08_11
3295 vpxorq $DATA4,$B12_15,$B12_15
3296
3297 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3298 # ;; store cipher/plain text
3299 mov $CIPH_PLAIN_OUT,$IA0
3300 vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
3301 vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
3302 vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
3303 vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
3304___
3305
3306 # ;; =================================================
3307 # ;; shuffle cipher text blocks for GHASH computation
3308 if ($ENC_DEC eq "ENC") {
3309 $code .= <<___;
3310 vpshufb $SHFMSK,$B00_03,$B00_03
3311 vpshufb $SHFMSK,$B04_07,$B04_07
3312 vpshufb $SHFMSK,$B08_11,$B08_11
3313 vpshufb $SHFMSK,$B12_15,$B12_15
3314___
3315 } else {
3316 $code .= <<___;
3317 vpshufb $SHFMSK,$DATA1,$B00_03
3318 vpshufb $SHFMSK,$DATA2,$B04_07
3319 vpshufb $SHFMSK,$DATA3,$B08_11
3320 vpshufb $SHFMSK,$DATA4,$B12_15
3321___
3322 }
3323
3324 # ;; =================================================
3325 # ;; store shuffled cipher text for ghashing
3326 $code .= <<___;
3327 vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
3328 vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
3329 vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
3330 vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
3331___
3332}
3333
3334# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3335# ;;; Encryption of a single block
3336sub ENCRYPT_SINGLE_BLOCK {
3337 my $AES_KEY = $_[0]; # ; [in]
3338 my $XMM0 = $_[1]; # ; [in/out]
3339 my $GPR1 = $_[2]; # ; [clobbered]
3340
3341 my $rndsuffix = &random_string();
3342
3343 $code .= <<___;
3344 # ; load number of rounds from AES_KEY structure (offset in bytes is
3345 # ; size of the |rd_key| buffer)
3346 mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]}
3347 cmp \$9,@{[DWORD($GPR1)]}
3348 je .Laes_128_${rndsuffix}
3349 cmp \$11,@{[DWORD($GPR1)]}
3350 je .Laes_192_${rndsuffix}
3351 cmp \$13,@{[DWORD($GPR1)]}
3352 je .Laes_256_${rndsuffix}
3353 jmp .Lexit_aes_${rndsuffix}
3354___
3355 for my $keylen (sort keys %aes_rounds) {
3356 my $nr = $aes_rounds{$keylen};
3357 $code .= <<___;
3358.align 32
3359.Laes_${keylen}_${rndsuffix}:
3360___
3361 $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n";
3362 for (my $i = 1; $i <= $nr; $i++) {
3363 $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n";
3364 }
3365 $code .= <<___;
3366 vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0
3367 jmp .Lexit_aes_${rndsuffix}
3368___
3369 }
3370 $code .= ".Lexit_aes_${rndsuffix}:\n\n";
3371}
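# ;; Rough C sketch of the dispatch generated above (illustrative only;
# ;; the names are not part of this file). The rounds value stored after
# ;; the |rd_key| array selects one of three fully unrolled paths:
# ;;
# ;;   /* void aes_enc_block(const uint8_t rk[][16], int nr, uint8_t b[16]) {
# ;;          xor_block(b, rk[0]);            // round 0 (ARK)
# ;;          for (int i = 1; i <= nr; i++)   // nr = 9, 11 or 13
# ;;              aesenc(b, rk[i]);
# ;;          aesenclast(b, rk[nr + 1]);
# ;;      } */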
3372
3373sub CALC_J0 {
3374 my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context
3375 my $IV = $_[1]; #; [in] Pointer to IV
3376 my $IV_LEN = $_[2]; #; [in] IV length
3377 my $J0 = $_[3]; #; [out] XMM reg to contain J0
3378 my $ZT0 = $_[4]; #; [clobbered] ZMM register
3379 my $ZT1 = $_[5]; #; [clobbered] ZMM register
3380 my $ZT2 = $_[6]; #; [clobbered] ZMM register
3381 my $ZT3 = $_[7]; #; [clobbered] ZMM register
3382 my $ZT4 = $_[8]; #; [clobbered] ZMM register
3383 my $ZT5 = $_[9]; #; [clobbered] ZMM register
3384 my $ZT6 = $_[10]; #; [clobbered] ZMM register
3385 my $ZT7 = $_[11]; #; [clobbered] ZMM register
3386 my $ZT8 = $_[12]; #; [clobbered] ZMM register
3387 my $ZT9 = $_[13]; #; [clobbered] ZMM register
3388 my $ZT10 = $_[14]; #; [clobbered] ZMM register
3389 my $ZT11 = $_[15]; #; [clobbered] ZMM register
3390 my $ZT12 = $_[16]; #; [clobbered] ZMM register
3391 my $ZT13 = $_[17]; #; [clobbered] ZMM register
3392 my $ZT14 = $_[18]; #; [clobbered] ZMM register
3393 my $ZT15 = $_[19]; #; [clobbered] ZMM register
3394 my $ZT16 = $_[20]; #; [clobbered] ZMM register
3395 my $T1 = $_[21]; #; [clobbered] GP register
3396 my $T2 = $_[22]; #; [clobbered] GP register
3397 my $T3 = $_[23]; #; [clobbered] GP register
3398 my $MASKREG = $_[24]; #; [clobbered] mask register
3399
3400 # ;; J0 = GHASH(IV || 0^(s+64) || [len(IV)]_64)
3401 # ;; where s = 16 * RoundUp(len(IV)/16) - len(IV)
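  # ;; (this matches NIST SP 800-38D: a 96-bit IV takes the shortcut
  # ;; J0 = IV || 0^31 || 1, handled in GCM_INIT_IV; any other length is
  # ;; zero-padded to a block boundary, extended with a block carrying the
  # ;; 64-bit IV bit-length, and run through GHASH as done here)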
3402
3403 # ;; Calculate GHASH of (IV || 0s)
3404 $code .= "vpxor $J0,$J0,$J0\n";
3405 &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
3406 $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG);
3407
3408 # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
3409 $code .= <<___;
3410 mov $IV_LEN,$T1
3411 shl \$3,$T1 # ; IV length in bits
3412 vmovq $T1,@{[XWORD($ZT2)]}
3413
3414 # ;; Might need shuffle of ZT2
3415 vpxorq $J0,@{[XWORD($ZT2)]},$J0
3416
3417 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]}
3418___
3419 &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]});
3420
3421 $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n";
3422}
3423
3424# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3425# ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for
3426# ;;; encoding/decoding.
3427# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3428sub GCM_INIT_IV {
3429 my $AES_KEYS = $_[0]; # [in] AES key schedule
3430 my $GCM128_CTX = $_[1]; # [in/out] GCM context
3431 my $IV = $_[2]; # [in] IV pointer
3432 my $IV_LEN = $_[3]; # [in] IV length
3433 my $GPR1 = $_[4]; # [clobbered] GP register
3434 my $GPR2 = $_[5]; # [clobbered] GP register
3435 my $GPR3 = $_[6]; # [clobbered] GP register
3436 my $MASKREG = $_[7]; # [clobbered] mask register
3437 my $CUR_COUNT = $_[8]; # [out] XMM with current counter
3438 my $ZT0 = $_[9]; # [clobbered] ZMM register
3439 my $ZT1 = $_[10]; # [clobbered] ZMM register
3440 my $ZT2 = $_[11]; # [clobbered] ZMM register
3441 my $ZT3 = $_[12]; # [clobbered] ZMM register
3442 my $ZT4 = $_[13]; # [clobbered] ZMM register
3443 my $ZT5 = $_[14]; # [clobbered] ZMM register
3444 my $ZT6 = $_[15]; # [clobbered] ZMM register
3445 my $ZT7 = $_[16]; # [clobbered] ZMM register
3446 my $ZT8 = $_[17]; # [clobbered] ZMM register
3447 my $ZT9 = $_[18]; # [clobbered] ZMM register
3448 my $ZT10 = $_[19]; # [clobbered] ZMM register
3449 my $ZT11 = $_[20]; # [clobbered] ZMM register
3450 my $ZT12 = $_[21]; # [clobbered] ZMM register
3451 my $ZT13 = $_[22]; # [clobbered] ZMM register
3452 my $ZT14 = $_[23]; # [clobbered] ZMM register
3453 my $ZT15 = $_[24]; # [clobbered] ZMM register
3454 my $ZT16 = $_[25]; # [clobbered] ZMM register
3455
3456 my $ZT0x = $ZT0;
3457 $ZT0x =~ s/zmm/xmm/;
3458
3459 $code .= <<___;
3460 cmp \$12,$IV_LEN
3461 je .Liv_len_12_init_IV
3462___
3463
3464 # ;; IV is different than 12 bytes
3465 &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7,
3466 $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
3467 $code .= <<___;
3468 jmp .Lskip_iv_len_12_init_IV
3469.Liv_len_12_init_IV: # ;; IV is 12 bytes
3470 # ;; read 12 IV bytes and pad with 0x00000001
3471 vmovdqu8 ONEf(%rip),$CUR_COUNT
3472 mov $IV,$GPR2
3473 mov \$0x0000000000000fff,@{[DWORD($GPR1)]}
3474 kmovq $GPR1,$MASKREG
3475 vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1
3476.Lskip_iv_len_12_init_IV:
3477 vmovdqu $CUR_COUNT,$ZT0x
3478___
3479 &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0)
3480 $code .= <<___;
3481 vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage
3482
3483 # ;; store IV as counter in LE format
3484 vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT
3485 vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi
3486___
3487}
3488
3489sub GCM_UPDATE_AAD {
3490 my $GCM128_CTX = $_[0]; # [in] GCM context pointer
3491 my $A_IN = $_[1]; # [in] AAD pointer
3492 my $A_LEN = $_[2]; # [in] AAD length in bytes
3493 my $GPR1 = $_[3]; # [clobbered] GP register
3494 my $GPR2 = $_[4]; # [clobbered] GP register
3495 my $GPR3 = $_[5]; # [clobbered] GP register
3496 my $MASKREG = $_[6]; # [clobbered] mask register
3497 my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value
3498 my $ZT0 = $_[8]; # [clobbered] ZMM register
3499 my $ZT1 = $_[9]; # [clobbered] ZMM register
3500 my $ZT2 = $_[10]; # [clobbered] ZMM register
3501 my $ZT3 = $_[11]; # [clobbered] ZMM register
3502 my $ZT4 = $_[12]; # [clobbered] ZMM register
3503 my $ZT5 = $_[13]; # [clobbered] ZMM register
3504 my $ZT6 = $_[14]; # [clobbered] ZMM register
3505 my $ZT7 = $_[15]; # [clobbered] ZMM register
3506 my $ZT8 = $_[16]; # [clobbered] ZMM register
3507 my $ZT9 = $_[17]; # [clobbered] ZMM register
3508 my $ZT10 = $_[18]; # [clobbered] ZMM register
3509 my $ZT11 = $_[19]; # [clobbered] ZMM register
3510 my $ZT12 = $_[20]; # [clobbered] ZMM register
3511 my $ZT13 = $_[21]; # [clobbered] ZMM register
3512 my $ZT14 = $_[22]; # [clobbered] ZMM register
3513 my $ZT15 = $_[23]; # [clobbered] ZMM register
3514 my $ZT16 = $_[24]; # [clobbered] ZMM register
3515
3516 # ; load current hash
3517 $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n";
3518
3519 &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2,
3520 $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13,
3521 $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
3522
3523 # ; store the updated hash back into the context
3524 $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n";
3525}
3526
3527# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3528# ;;; Cipher and ghash of payloads shorter than 256 bytes
3529# ;;; - number of blocks in the message comes as argument
3530# ;;; - depending on the number of blocks an optimized variant of
3531# ;;; INITIAL_BLOCKS_PARTIAL is invoked
3532sub GCM_ENC_DEC_SMALL {
3533 my $AES_KEYS = $_[0]; # [in] key pointer
3534 my $GCM128_CTX = $_[1]; # [in] context pointer
3535 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
3536 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
3537 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
3538 my $ENC_DEC = $_[5]; # [in] cipher direction
3539 my $DATA_OFFSET = $_[6]; # [in] data offset
3540 my $LENGTH = $_[7]; # [in] data length
3541 my $NUM_BLOCKS = $_[8]; # [in] number of blocks to process 1 to 16
3542 my $CTR = $_[9]; # [in/out] XMM counter block
3543 my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value
3544 my $ZTMP0 = $_[11]; # [clobbered] ZMM register
3545 my $ZTMP1 = $_[12]; # [clobbered] ZMM register
3546 my $ZTMP2 = $_[13]; # [clobbered] ZMM register
3547 my $ZTMP3 = $_[14]; # [clobbered] ZMM register
3548 my $ZTMP4 = $_[15]; # [clobbered] ZMM register
3549 my $ZTMP5 = $_[16]; # [clobbered] ZMM register
3550 my $ZTMP6 = $_[17]; # [clobbered] ZMM register
3551 my $ZTMP7 = $_[18]; # [clobbered] ZMM register
3552 my $ZTMP8 = $_[19]; # [clobbered] ZMM register
3553 my $ZTMP9 = $_[20]; # [clobbered] ZMM register
3554 my $ZTMP10 = $_[21]; # [clobbered] ZMM register
3555 my $ZTMP11 = $_[22]; # [clobbered] ZMM register
3556 my $ZTMP12 = $_[23]; # [clobbered] ZMM register
3557 my $ZTMP13 = $_[24]; # [clobbered] ZMM register
3558 my $ZTMP14 = $_[25]; # [clobbered] ZMM register
3559 my $IA0 = $_[26]; # [clobbered] GP register
3560 my $IA1 = $_[27]; # [clobbered] GP register
3561 my $MASKREG = $_[28]; # [clobbered] mask register
3562 my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask
3563 my $PBLOCK_LEN = $_[30]; # [in] partial block length
3564
3565 my $rndsuffix = &random_string();
3566
3567 $code .= <<___;
3568 cmp \$8,$NUM_BLOCKS
3569 je .L_small_initial_num_blocks_is_8_${rndsuffix}
3570 jl .L_small_initial_num_blocks_is_7_1_${rndsuffix}
3571
3572
3573 cmp \$12,$NUM_BLOCKS
3574 je .L_small_initial_num_blocks_is_12_${rndsuffix}
3575 jl .L_small_initial_num_blocks_is_11_9_${rndsuffix}
3576
3577 # ;; 16, 15, 14 or 13
3578 cmp \$16,$NUM_BLOCKS
3579 je .L_small_initial_num_blocks_is_16_${rndsuffix}
3580 cmp \$15,$NUM_BLOCKS
3581 je .L_small_initial_num_blocks_is_15_${rndsuffix}
3582 cmp \$14,$NUM_BLOCKS
3583 je .L_small_initial_num_blocks_is_14_${rndsuffix}
3584 jmp .L_small_initial_num_blocks_is_13_${rndsuffix}
3585
3586.L_small_initial_num_blocks_is_11_9_${rndsuffix}:
3587 # ;; 11, 10 or 9
3588 cmp \$11,$NUM_BLOCKS
3589 je .L_small_initial_num_blocks_is_11_${rndsuffix}
3590 cmp \$10,$NUM_BLOCKS
3591 je .L_small_initial_num_blocks_is_10_${rndsuffix}
3592 jmp .L_small_initial_num_blocks_is_9_${rndsuffix}
3593
3594.L_small_initial_num_blocks_is_7_1_${rndsuffix}:
3595 cmp \$4,$NUM_BLOCKS
3596 je .L_small_initial_num_blocks_is_4_${rndsuffix}
3597 jl .L_small_initial_num_blocks_is_3_1_${rndsuffix}
3598 # ;; 7, 6 or 5
3599 cmp \$7,$NUM_BLOCKS
3600 je .L_small_initial_num_blocks_is_7_${rndsuffix}
3601 cmp \$6,$NUM_BLOCKS
3602 je .L_small_initial_num_blocks_is_6_${rndsuffix}
3603 jmp .L_small_initial_num_blocks_is_5_${rndsuffix}
3604
3605.L_small_initial_num_blocks_is_3_1_${rndsuffix}:
3606 # ;; 3, 2 or 1
3607 cmp \$3,$NUM_BLOCKS
3608 je .L_small_initial_num_blocks_is_3_${rndsuffix}
3609 cmp \$2,$NUM_BLOCKS
3610 je .L_small_initial_num_blocks_is_2_${rndsuffix}
3611
3612 # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed
3613
3614 # ;; Generation of different block size variants
3615 # ;; - one block size has to be the first one
3616___
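  # ;; (the compare/jump ladder above is a shallow binary search over
  # ;; $NUM_BLOCKS in [1..16] - splitting at 8, then at 12 or 4 - so each
  # ;; size reaches its specialized variant below in a handful of compares
  # ;; rather than a 16-way chain)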
3617
3618 for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) {
3619 $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${rndsuffix}:\n";
3620 &INITIAL_BLOCKS_PARTIAL(
3621 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET,
3622 $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1,
3623 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3624 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3625 $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN);
3626
3627 if ($num_blocks != 16) {
3628 $code .= "jmp .L_small_initial_blocks_encrypted_${rndsuffix}\n";
3629 }
3630 }
3631
3632 $code .= ".L_small_initial_blocks_encrypted_${rndsuffix}:\n";
3633}
3634
3635# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3636# ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context
3637# ; struct has been initialized by GCM_INIT_IV
3638# ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
3639# ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
3640# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3641sub GCM_ENC_DEC {
3642 my $AES_KEYS = $_[0]; # [in] AES Key schedule
3643 my $GCM128_CTX = $_[1]; # [in] context pointer
3644 my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update
3645 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer
3646 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
3647 my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer
3648 my $ENC_DEC = $_[6]; # [in] cipher direction
3649
3650 my $IA0 = "%r10";
3651 my $IA1 = "%r12";
3652 my $IA2 = "%r13";
3653 my $IA3 = "%r15";
3654 my $IA4 = "%r11";
3655 my $IA5 = "%rax";
3656 my $IA6 = "%rbx";
3657 my $IA7 = "%r14";
3658
3659 my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;
3660
3661 my $CTR_CHECK = $IA3;
3662 my $DATA_OFFSET = $IA4;
3663 my $HASHK_PTR = $IA6;
3664
3665 my $HKEYS_READY = $IA7;
3666
3667 my $CTR_BLOCKz = "%zmm2";
3668 my $CTR_BLOCKx = "%xmm2";
3669
3670 # ; hardcoded in GCM_INIT
3671
3672 my $AAD_HASHz = "%zmm14";
3673 my $AAD_HASHx = "%xmm14";
3674
3675 # ; hardcoded in GCM_COMPLETE
3676
3677 my $ZTMP0 = "%zmm0";
3678 my $ZTMP1 = "%zmm3";
3679 my $ZTMP2 = "%zmm4";
3680 my $ZTMP3 = "%zmm5";
3681 my $ZTMP4 = "%zmm6";
3682 my $ZTMP5 = "%zmm7";
3683 my $ZTMP6 = "%zmm10";
3684 my $ZTMP7 = "%zmm11";
3685 my $ZTMP8 = "%zmm12";
3686 my $ZTMP9 = "%zmm13";
3687 my $ZTMP10 = "%zmm15";
3688 my $ZTMP11 = "%zmm16";
3689 my $ZTMP12 = "%zmm17";
3690
3691 my $ZTMP13 = "%zmm19";
3692 my $ZTMP14 = "%zmm20";
3693 my $ZTMP15 = "%zmm21";
3694 my $ZTMP16 = "%zmm30";
3695 my $ZTMP17 = "%zmm31";
3696 my $ZTMP18 = "%zmm1";
3697 my $ZTMP19 = "%zmm18";
3698 my $ZTMP20 = "%zmm8";
3699 my $ZTMP21 = "%zmm22";
3700 my $ZTMP22 = "%zmm23";
3701
3702 my $GH = "%zmm24";
3703 my $GL = "%zmm25";
3704 my $GM = "%zmm26";
3705 my $SHUF_MASK = "%zmm29";
3706
3707 # ; Unused in the small packet path
3708 my $ADDBE_4x4 = "%zmm27";
3709 my $ADDBE_1234 = "%zmm28";
3710
3711 my $MASKREG = "%k1";
3712
3713 my $rndsuffix = &random_string();
3714
3715 # ;; reduction every 48 blocks, depth 32 blocks
3716 # ;; @note 48 blocks is the maximum capacity of the stack frame
3717 my $big_loop_nblocks = 48;
3718 my $big_loop_depth = 32;
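  # ;; (each main-loop iteration ciphers 48 fresh blocks while GHASH-ing
  # ;; the 48 blocks ciphered previously, so the costly GF(2^128)
  # ;; reduction is amortized to once per 48 blocks and 32 blocks of
  # ;; ciphertext stay in flight on the stack between iterations)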
3719
3720 # ;;; Macro flow depending on packet size
3721 # ;;; - LENGTH <= 16 blocks
3722 # ;;; - cipher followed by hashing (reduction)
3723 # ;;; - 16 blocks < LENGTH < 32 blocks
3724 # ;;; - cipher 16 blocks
3725 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3726 # ;;; - 32 blocks < LENGTH < 48 blocks
3727 # ;;; - cipher 2 x 16 blocks
3728 # ;;; - hash 16 blocks
3729 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3730 # ;;; - LENGTH >= 48 blocks
3731 # ;;; - cipher 2 x 16 blocks
3732 # ;;; - while (data_to_cipher >= 48 blocks):
3733 # ;;; - cipher 16 blocks & hash 16 blocks
3734 # ;;; - cipher 16 blocks & hash 16 blocks
3735 # ;;; - cipher 16 blocks & hash 16 blocks (reduction)
3736 # ;;; - if (data_to_cipher >= 32 blocks):
3737 # ;;; - cipher 16 blocks & hash 16 blocks
3738 # ;;; - cipher 16 blocks & hash 16 blocks
3739 # ;;; - hash 16 blocks (reduction)
3740 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3741 # ;;; - elif (data_to_cipher >= 16 blocks):
3742 # ;;; - cipher 16 blocks & hash 16 blocks
3743 # ;;; - hash 16 blocks
3744 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3745 # ;;; - else:
3746 # ;;; - hash 16 blocks
3747 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
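  # ;;
  # ;; Condensed pseudocode of the flow above (illustrative only):
  # ;;
  # ;;   if (len <= 16 blocks) { cipher, hash, reduce }        /* small  */
  # ;;   else {
  # ;;       cipher 16 or 32 blocks ahead;                     /* fill   */
  # ;;       while (left >= 48 blocks)
  # ;;           3 x { cipher 16 & hash 16 };                  /* reduce */
  # ;;       drain the pipelined 32/16 blocks plus the <=16-block tail;
  # ;;   }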
3748
3749 if ($win64) {
3750 $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n";
3751 } else {
3752 $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
3753 }
3754 $code .= "je .L_enc_dec_done_${rndsuffix}\n";
3755
3756 # The length value in the context at $CTX_OFFSET_InLen($GCM128_CTX) is updated in
3757 # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc'
3758
3759 $code .= "xor $HKEYS_READY, $HKEYS_READY\n";
3760 $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";
3761
3762 # ;; Used for the update flow - if there was a previous partial
3763 # ;; block fill the remaining bytes here.
3764 &PARTIAL_BLOCK(
3765 $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
3766 $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1,
3767 $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3,
3768 $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG);
3769
3770 $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";
3771
3772 # ;; Save the amount of data left to process in $LENGTH
3773 # ;; NOTE: on Linux, $LENGTH aliases PLAIN_CIPH_LEN (already a register), so no copy is needed
3774 if ($win64) {
3775 $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n";
3776 }
3777
3778 # ;; There may be no more data if it was consumed in the partial block.
3779 $code .= <<___;
3780 sub $DATA_OFFSET,$LENGTH
3781 je .L_enc_dec_done_${rndsuffix}
3782___
3783
3784 $code .= <<___;
3785 cmp \$`(16 * 16)`,$LENGTH
3786 jbe .L_message_below_equal_16_blocks_${rndsuffix}
3787
3788 vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK
3789 vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
3790 vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
3791
3792 # ;; start the pipeline
3793 # ;; - 32 blocks aes-ctr
3794 # ;; - 16 blocks ghash + aes-ctr
3795
3796 # ;; set up CTR_CHECK
3797 vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
3798 and \$255,@{[DWORD($CTR_CHECK)]}
3799 # ;; in LE format after init, convert to BE
3800 vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
3801 vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
3802___
3803
3804 # ;; ==== AES-CTR - first 16 blocks
3805 my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3806 my $data_in_out_offset = 0;
3807 &INITIAL_BLOCKS_16(
3808 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
3809 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
3810 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
3811 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
3812
3813 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3814 "first16");
3815
3816 $code .= <<___;
3817 cmp \$`(32 * 16)`,$LENGTH
3818 jb .L_message_below_32_blocks_${rndsuffix}
3819___
3820
3821 # ;; ==== AES-CTR - next 16 blocks
3822 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3823 $data_in_out_offset = (16 * 16);
3824 &INITIAL_BLOCKS_16(
3825 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
3826 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
3827 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
3828 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
3829
3830 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3831 "last32");
3832 $code .= "mov \$1,$HKEYS_READY\n";
3833
3834 $code .= <<___;
3835 add \$`(32 * 16)`,$DATA_OFFSET
3836 sub \$`(32 * 16)`,$LENGTH
3837
3838 cmp \$`($big_loop_nblocks * 16)`,$LENGTH
3839 jb .L_no_more_big_nblocks_${rndsuffix}
3840___
3841
3842 # ;; ====
3843 # ;; ==== AES-CTR + GHASH - 48 blocks loop
3844 # ;; ====
3845 $code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n";
3846
3847 # ;; ==== AES-CTR + GHASH - 16 blocks, start
3848 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3849 $data_in_out_offset = (0 * 16);
3850 my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3851 &GHASH_16_ENCRYPT_16_PARALLEL(
3852 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3853 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3854 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3855 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3856 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3857 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3858 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
3859 $IA0);
3860
3861 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
3862 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3863 $data_in_out_offset = (16 * 16);
3864 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3865 &GHASH_16_ENCRYPT_16_PARALLEL(
3866 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3867 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3868 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3869 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3870 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3871 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3872 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3873 $IA0);
3874
3875 # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
3876 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3877 $data_in_out_offset = (32 * 16);
3878 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3879 &GHASH_16_ENCRYPT_16_PARALLEL(
3880 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3881 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3882 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3883 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3884 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3885 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3886 $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3887 $IA0);
3888
3889 # ;; === store the reduced GHASH (returned in $ZTMP4) as the new hash value
3890 $code .= <<___;
3891 vmovdqa64 $ZTMP4,$AAD_HASHz
3892
3893 add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
3894 sub \$`($big_loop_nblocks * 16)`,$LENGTH
3895 cmp \$`($big_loop_nblocks * 16)`,$LENGTH
3896 jae .L_encrypt_big_nblocks_${rndsuffix}
3897
3898.L_no_more_big_nblocks_${rndsuffix}:
3899
3900 cmp \$`(32 * 16)`,$LENGTH
3901 jae .L_encrypt_32_blocks_${rndsuffix}
3902
3903 cmp \$`(16 * 16)`,$LENGTH
3904 jae .L_encrypt_16_blocks_${rndsuffix}
3905___
3906
3907 # ;; =====================================================
3908 # ;; =====================================================
3909 # ;; ==== GHASH 1 x 16 blocks
3910 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
3911 # ;; ==== then GHASH N blocks
3912 $code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n";
3913
3914 # ;; calculate offset to the right hash key
3915 $code .= <<___;
3916 mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
3917 and \$~15,@{[DWORD($IA0)]}
3918 mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
3919 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
3920___
3921
3922 # ;; ==== GHASH the first 16 of the 32 outstanding blocks (the rest are hashed, with the reduction, below)
3923 &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
3924 "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3925
3926 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
3927 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3928 $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
3929 &GCM_ENC_DEC_LAST(
3930 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
3931 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
3932 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3933 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
3934 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
3935 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
3936 "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
3937 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
3938
3939 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
3940 $code .= "jmp .L_ghash_done_${rndsuffix}\n";
3941
3942 # ;; =====================================================
3943 # ;; =====================================================
3944 # ;; ==== GHASH & encrypt 1 x 16 blocks
3945 # ;; ==== GHASH & encrypt 1 x 16 blocks
3946 # ;; ==== GHASH 1 x 16 blocks (reduction)
3947 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
3948 # ;; ==== then GHASH N blocks
3949 $code .= ".L_encrypt_32_blocks_${rndsuffix}:\n";
3950
3951 # ;; ==== AES-CTR + GHASH - 16 blocks, start
3952 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3953 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3954 $data_in_out_offset = (0 * 16);
3955 &GHASH_16_ENCRYPT_16_PARALLEL(
3956 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3957 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3958 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3959 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3960 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3961 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3962 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
3963 $IA0);
3964
3965 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
3966 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3967 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3968 $data_in_out_offset = (16 * 16);
3969 &GHASH_16_ENCRYPT_16_PARALLEL(
3970 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3971 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3972 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3973 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3974 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3975 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3976 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3977 $IA0);
3978
3979 # ;; ==== GHASH 16 blocks with reduction
3980 &GHASH_16(
3981 "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
3982 "%rsp", &HashKeyOffsetByIdx(16, "frame"),
3983 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3984
3985 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
3986 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3987 $code .= <<___;
3988 sub \$`(32 * 16)`,$LENGTH
3989 add \$`(32 * 16)`,$DATA_OFFSET
3990___
3991
3992 # ;; calculate offset to the right hash key
3993 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
3994 $code .= <<___;
3995 and \$~15,@{[DWORD($IA0)]}
3996 mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
3997 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
3998___
3999 &GCM_ENC_DEC_LAST(
4000 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
4001 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
4002 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4003 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4004 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
4005 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4006 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
4007 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
4008
4009 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4010 $code .= "jmp .L_ghash_done_${rndsuffix}\n";
4011
4012 # ;; =====================================================
4013 # ;; =====================================================
4014 # ;; ==== GHASH & encrypt 16 blocks (done before)
4015 # ;; ==== GHASH 1 x 16 blocks
4016 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
4017 # ;; ==== then GHASH N blocks
4018 $code .= ".L_encrypt_16_blocks_${rndsuffix}:\n";
4019
4020 # ;; ==== AES-CTR + GHASH - 16 blocks, start
4021 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
4022 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
4023 $data_in_out_offset = (0 * 16);
4024 &GHASH_16_ENCRYPT_16_PARALLEL(
4025 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
4026 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
4027 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
4028 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
4029 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
4030 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
4031 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4032 $IA0);
4033
4034 # ;; ==== GHASH 1 x 16 blocks
4035 &GHASH_16(
4036 "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
4037 "%rsp", &HashKeyOffsetByIdx(32, "frame"),
4038 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
4039
4040 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
4041 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
4042 $code .= <<___;
4043 sub \$`(16 * 16)`,$LENGTH
4044 add \$`(16 * 16)`,$DATA_OFFSET
4045___
4046 &GCM_ENC_DEC_LAST(
4047 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
4048 $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
4049 &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
4050 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
4051 $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
4052 $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4053 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
4054 $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
4055 $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4056 "end_reduce", $GL, $GH, $GM,
4057 $ENC_DEC, $AAD_HASHz, $IA0, $IA5,
4058 $MASKREG, $PBLOCK_LEN);
4059
4060 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4061 $code .= <<___;
4062 jmp .L_ghash_done_${rndsuffix}
4063
4064.L_message_below_32_blocks_${rndsuffix}:
4065 # ;; 32 > number of blocks > 16
4066
4067 sub \$`(16 * 16)`,$LENGTH
4068 add \$`(16 * 16)`,$DATA_OFFSET
4069___
4070 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
4071
4072 # ;; calculate offset to the right hash key
4073 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
4074
4075 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4076 "mid16");
4077 $code .= "mov \$1,$HKEYS_READY\n";
4078
4079 $code .= <<___;
4080 and \$~15,@{[DWORD($IA0)]}
4081 mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
4082 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
4083___
4084
4085 &GCM_ENC_DEC_LAST(
4086 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
4087 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
4088 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4089 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4090 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
4091 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4092 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
4093 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
4094
4095 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4096 $code .= <<___;
4097 jmp .L_ghash_done_${rndsuffix}
4098
4099.L_message_below_equal_16_blocks_${rndsuffix}:
4100 # ;; Determine how many blocks to process
4101 # ;; - process one additional block if there is a partial block
4102 mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
4103 add \$15,@{[DWORD($IA1)]}
4104 shr \$4, @{[DWORD($IA1)]} # ; $IA1 is in the range 1 to 16 ($LENGTH is non-zero here)
4105___
4106 &GCM_ENC_DEC_SMALL(
4107 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
4108 $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
4109 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4110 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4111 $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
4112 $PBLOCK_LEN);
4113
4114 # ;; fall through to exit
4115
4116 $code .= ".L_ghash_done_${rndsuffix}:\n";
4117
4118 # ;; save the last counter block
4119 $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
4120 $code .= <<___;
4121 vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
4122.L_enc_dec_done_${rndsuffix}:
4123___
4124}
4125
4126# ;;; ===========================================================================
4127# ;;; Encrypt/decrypt the initial 16 blocks
4128sub INITIAL_BLOCKS_16 {
4129 my $IN = $_[0]; # [in] input buffer
4130 my $OUT = $_[1]; # [in] output buffer
4131 my $AES_KEYS = $_[2]; # [in] pointer to expanded keys
4132 my $DATA_OFFSET = $_[3]; # [in] data offset
4133 my $GHASH = $_[4]; # [in] ZMM with AAD (low 128 bits)
4134 my $CTR = $_[5]; # [in] ZMM with CTR BE blocks 4x128 bits
4135 my $CTR_CHECK = $_[6]; # [in/out] GPR with counter overflow check
4136 my $ADDBE_4x4 = $_[7]; # [in] ZMM 4x128bits with value 4 (big endian)
4137 my $ADDBE_1234 = $_[8]; # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
4138 my $T0 = $_[9]; # [clobbered] temporary ZMM register
4139 my $T1 = $_[10]; # [clobbered] temporary ZMM register
4140 my $T2 = $_[11]; # [clobbered] temporary ZMM register
4141 my $T3 = $_[12]; # [clobbered] temporary ZMM register
4142 my $T4 = $_[13]; # [clobbered] temporary ZMM register
4143 my $T5 = $_[14]; # [clobbered] temporary ZMM register
4144 my $T6 = $_[15]; # [clobbered] temporary ZMM register
4145 my $T7 = $_[16]; # [clobbered] temporary ZMM register
4146 my $T8 = $_[17]; # [clobbered] temporary ZMM register
4147 my $SHUF_MASK = $_[18]; # [in] ZMM with BE/LE shuffle mask
4148 my $ENC_DEC = $_[19]; # [in] ENC (encrypt) or DEC (decrypt) selector
4149 my $BLK_OFFSET = $_[20]; # [in] stack frame offset to ciphered blocks
4150 my $DATA_DISPL = $_[21]; # [in] fixed numerical data displacement/offset
4151 my $IA0 = $_[22]; # [clobbered] temporary GP register
4152
4153 my $B00_03 = $T5;
4154 my $B04_07 = $T6;
4155 my $B08_11 = $T7;
4156 my $B12_15 = $T8;
4157
4158 my $rndsuffix = &random_string();
4159
4160 my $stack_offset = $BLK_OFFSET;
4161 $code .= <<___;
4162 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4163 # ;; prepare counter blocks
4164
4165 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
4166 jae .L_next_16_overflow_${rndsuffix}
4167 vpaddd $ADDBE_1234,$CTR,$B00_03
4168 vpaddd $ADDBE_4x4,$B00_03,$B04_07
4169 vpaddd $ADDBE_4x4,$B04_07,$B08_11
4170 vpaddd $ADDBE_4x4,$B08_11,$B12_15
4171 jmp .L_next_16_ok_${rndsuffix}
4172.L_next_16_overflow_${rndsuffix}:
4173 vpshufb $SHUF_MASK,$CTR,$CTR
4174 vmovdqa64 ddq_add_4444(%rip),$B12_15
4175 vpaddd ddq_add_1234(%rip),$CTR,$B00_03
4176 vpaddd $B12_15,$B00_03,$B04_07
4177 vpaddd $B12_15,$B04_07,$B08_11
4178 vpaddd $B12_15,$B08_11,$B12_15
4179 vpshufb $SHUF_MASK,$B00_03,$B00_03
4180 vpshufb $SHUF_MASK,$B04_07,$B04_07
4181 vpshufb $SHUF_MASK,$B08_11,$B08_11
4182 vpshufb $SHUF_MASK,$B12_15,$B12_15
4183.L_next_16_ok_${rndsuffix}:
4184 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR
4185 addb \$16,@{[BYTE($CTR_CHECK)]}
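	# ;; (imm 0b11111111 makes vshufi64x2 replicate the top 128-bit lane,
	# ;; i.e. the highest counter block, into all four lanes as the base
	# ;; for the next batch)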
4186 # ;; === load 16 blocks of data
4187 vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0
4188 vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1
4189 vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2
4190 vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3
4191
4192 # ;; move to AES encryption rounds
4193 vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4
4194 vpxorq $T4,$B00_03,$B00_03
4195 vpxorq $T4,$B04_07,$B04_07
4196 vpxorq $T4,$B08_11,$B08_11
4197 vpxorq $T4,$B12_15,$B12_15
4198___
4199 foreach (1 .. ($NROUNDS)) {
4200 $code .= <<___;
4201 vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4
4202 vaesenc $T4,$B00_03,$B00_03
4203 vaesenc $T4,$B04_07,$B04_07
4204 vaesenc $T4,$B08_11,$B08_11
4205 vaesenc $T4,$B12_15,$B12_15
4206___
4207 }
4208 $code .= <<___;
4209 vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4
4210 vaesenclast $T4,$B00_03,$B00_03
4211 vaesenclast $T4,$B04_07,$B04_07
4212 vaesenclast $T4,$B08_11,$B08_11
4213 vaesenclast $T4,$B12_15,$B12_15
4214
4215 # ;; xor against text
4216 vpxorq $T0,$B00_03,$B00_03
4217 vpxorq $T1,$B04_07,$B04_07
4218 vpxorq $T2,$B08_11,$B08_11
4219 vpxorq $T3,$B12_15,$B12_15
4220
4221 # ;; store
4222 mov $OUT, $IA0
4223 vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1)
4224 vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1)
4225 vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1)
4226 vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1)
4227___
4228 if ($ENC_DEC eq "DEC") {
4229 $code .= <<___;
4230 # ;; decryption - cipher text needs to go to GHASH phase
4231 vpshufb $SHUF_MASK,$T0,$B00_03
4232 vpshufb $SHUF_MASK,$T1,$B04_07
4233 vpshufb $SHUF_MASK,$T2,$B08_11
4234 vpshufb $SHUF_MASK,$T3,$B12_15
4235___
4236 } else {
4237 $code .= <<___;
4238 # ;; encryption
4239 vpshufb $SHUF_MASK,$B00_03,$B00_03
4240 vpshufb $SHUF_MASK,$B04_07,$B04_07
4241 vpshufb $SHUF_MASK,$B08_11,$B08_11
4242 vpshufb $SHUF_MASK,$B12_15,$B12_15
4243___
4244 }
4245
4246 if ($GHASH ne "no_ghash") {
4247 $code .= <<___;
4248 # ;; === xor cipher block 0 with GHASH for the next GHASH round
4249 vpxorq $GHASH,$B00_03,$B00_03
4250___
4251 }
4252 $code .= <<___;
4253 vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp)
4254 vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp)
4255 vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp)
4256 vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp)
4257___
4258}
4259
4260# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4261# ; GCM_COMPLETE Finishes ghash calculation
4262# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4263sub GCM_COMPLETE {
4264 my $GCM128_CTX = $_[0];
4265 my $PBLOCK_LEN = $_[1];
4266
4267 my $rndsuffix = &random_string();
4268
4269 $code .= <<___;
4270 vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2
4271 vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0)
4272___
4273
4274 $code .= <<___;
4275 vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4
4276
4277 # ;; Process the final partial block.
4278 cmp \$0,$PBLOCK_LEN
4279 je .L_partial_done_${rndsuffix}
4280___
4281
4282 # ;; GHASH computation for the final partial (<16-byte) block
4283 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4284
4285 $code .= <<___;
4286.L_partial_done_${rndsuffix}:
4287 vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5
4288 vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C)
4289 vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits
4290
4291 vpxor %xmm5,%xmm4,%xmm4
4292___
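  # ;; (per the GCM spec the tag is T = GHASH(H, A, C) XOR E(K, J0);
  # ;; the length block folded in below is the last GHASH input, and the
  # ;; XOR with the saved EK0 value in %xmm3 yields the final tag)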
4293
4294 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4295
4296 $code .= <<___;
4297 vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16Byte swap
4298 vpxor %xmm4,%xmm3,%xmm3
4299
4300.L_return_T_${rndsuffix}:
4301 vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX)
4302___
4303}
4304
4305# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4306# ;;; Functions definitions
4307# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4308
4309$code .= ".text\n";
4310{
4311 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4312 # ;void ossl_aes_gcm_init_avx512 /
4313 # ; (const void *aes_keys,
4314 # ; void *gcm128ctx)
4315 # ;
4316 # ; Precomputes hashkey table for GHASH optimization.
4317 # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4318 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4319 $code .= <<___;
4320.globl ossl_aes_gcm_init_avx512
4321.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4322.align 32
4323ossl_aes_gcm_init_avx512:
4324.cfi_startproc
4325 endbranch
4326___
4327 if ($CHECK_FUNCTION_ARGUMENTS) {
4328 $code .= <<___;
4329 # ;; Check aes_keys != NULL
4330 test $arg1,$arg1
4331 jz .Labort_init
4332
4333 # ;; Check gcm128ctx != NULL
4334 test $arg2,$arg2
4335 jz .Labort_init
4336___
4337 }
4338 $code .= "vpxorq %xmm16,%xmm16,%xmm16\n";
4339 &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey
4340 $code .= <<___;
4341 vpshufb SHUF_MASK(%rip),%xmm16,%xmm16
4342 # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
4343 vmovdqa64 %xmm16,%xmm2
4344 vpsllq \$1,%xmm16,%xmm16
4345 vpsrlq \$63,%xmm2,%xmm2
4346 vmovdqa %xmm2,%xmm1
4347 vpslldq \$8,%xmm2,%xmm2
4348 vpsrldq \$8,%xmm1,%xmm1
4349 vporq %xmm2,%xmm16,%xmm16
4350 # ;reduction
4351 vpshufd \$0b00100100,%xmm1,%xmm2
4352 vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
4353 vpand POLY(%rip),%xmm2,%xmm2
4354 vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly
4355 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4356 vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
4357___
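  # ;; (the sequence above computes H = E(K, 0^128), byte-reflects it and
  # ;; doubles it in GF(2^128) - a 128-bit left shift by one with a
  # ;; conditional XOR of the field polynomial when a carry falls out -
  # ;; producing the HashKey<<1 mod poly form the reduction code expects)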
4358 &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4359 if ($CLEAR_SCRATCH_REGISTERS) {
4360 &clear_scratch_gps_asm();
4361 &clear_scratch_zmms_asm();
4362 } else {
4363 $code .= "vzeroupper\n";
4364 }
4365 $code .= <<___;
4366.Labort_init:
4367ret
4368.cfi_endproc
4369.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
4370___
4371}
4372
4373# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4374# ;void ossl_aes_gcm_setiv_avx512
4375# ; (const void *aes_keys,
4376# ; void *gcm128ctx,
4377# ; const unsigned char *iv,
4378# ; size_t ivlen)
4379# ;
4380# ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
4381# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_setiv_avx512
.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_setiv_avx512:
.cfi_startproc
.Lsetiv_seh_begin:
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check aes_keys != NULL
        test $arg1,$arg1
        jz .Labort_setiv

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Labort_setiv

        # ;; Check iv != NULL
        test $arg3,$arg3
        jz .Labort_setiv

        # ;; Check ivlen != 0
        test $arg4,$arg4
        jz .Labort_setiv
___
}

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
    1,    # allocate stack space for hkeys
    0,    # do not allocate stack space for AES blocks
    "setiv");
&GCM_INIT_IV(
    "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1",
    "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12",
    "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
&EPILOG(
    1,    # hkeys were allocated
    $arg4);
$code .= <<___;
.Labort_setiv:
ret
.Lsetiv_seh_end:
.cfi_endproc
.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_update_aad_avx512
# ;     (unsigned char *gcm128ctx,
# ;      const unsigned char *aad,
# ;      size_t aadlen)
# ;
# ; Updates the AAD hash in the gcm128_context structure.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_update_aad_avx512
.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_update_aad_avx512:
.cfi_startproc
.Lghash_seh_begin:
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check gcm128ctx != NULL
        test $arg1,$arg1
        jz .Lexit_update_aad

        # ;; Check aad != NULL
        test $arg2,$arg2
        jz .Lexit_update_aad

        # ;; Check aadlen != 0
        test $arg3,$arg3
        jz .Lexit_update_aad
___
}

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
    1,    # allocate stack space for hkeys
    0,    # do not allocate stack space for AES blocks
    "ghash");
&GCM_UPDATE_AAD(
    "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11",
    "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13",
    "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
&EPILOG(
    1,    # hkeys were allocated
    $arg3);
$code .= <<___;
.Lexit_update_aad:
ret
.Lghash_seh_end:
.cfi_endproc
.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_encrypt_avx512
# ;     (const void *aes_keys,
# ;      void *gcm128ctx,
# ;      unsigned int *pblocklen,
# ;      const unsigned char *in,
# ;      size_t len,
# ;      unsigned char *out);
# ;
# ; Performs encryption of data |in| of length |len| and stores the output in |out|.
# ; Stores the encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_encrypt_avx512
.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_encrypt_avx512:
.cfi_startproc
.Lencrypt_seh_begin:
        endbranch
___

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
    1,    # allocate stack space for hkeys
    1,    # allocate stack space for AES blocks
    "encrypt");
if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check aes_keys != NULL
        test $arg1,$arg1
        jz .Lexit_gcm_encrypt

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Lexit_gcm_encrypt

        # ;; Check pblocklen != NULL
        test $arg3,$arg3
        jz .Lexit_gcm_encrypt

        # ;; Check in != NULL
        test $arg4,$arg4
        jz .Lexit_gcm_encrypt

        # ;; Check len != 0
        cmp \$0,$arg5
        jz .Lexit_gcm_encrypt

        # ;; Check out != NULL
        cmp \$0,$arg6
        jz .Lexit_gcm_encrypt
___
}
$code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov `4*15*4`($arg1),%eax
        cmp \$9,%eax
        je .Laes_gcm_encrypt_128_avx512
        cmp \$11,%eax
        je .Laes_gcm_encrypt_192_avx512
        cmp \$13,%eax
        je .Laes_gcm_encrypt_256_avx512
        xor %eax,%eax
        jmp .Lexit_gcm_encrypt
___
for my $keylen (sort keys %aes_rounds) {
    $NROUNDS = $aes_rounds{$keylen};
    $code .= <<___;
.align 32
.Laes_gcm_encrypt_${keylen}_avx512:
___
    &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
    $code .= "jmp .Lexit_gcm_encrypt\n";
}
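# The loop above stamps out GCM_ENC_DEC once per key size, with $NROUNDS bound at
# generation time, so each .Laes_gcm_encrypt_*_avx512 variant carries a hard-coded
# round count rather than looping over a round counter at run time.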
$code .= ".Lexit_gcm_encrypt:\n";
&EPILOG(1, $arg5);
$code .= <<___;
ret
.Lencrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_decrypt_avx512
# ;     (const void *aes_keys,
# ;      void *gcm128ctx,
# ;      unsigned int *pblocklen,
# ;      const unsigned char *in,
# ;      size_t len,
# ;      unsigned char *out);
# ;
# ; Performs decryption of data |in| of length |len| and stores the output in |out|.
# ; Stores the decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_decrypt_avx512
.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_decrypt_avx512:
.cfi_startproc
.Ldecrypt_seh_begin:
        endbranch
___

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
    1,    # allocate stack space for hkeys
    1,    # allocate stack space for AES blocks
    "decrypt");
if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check aes_keys != NULL
        test $arg1,$arg1
        jz .Lexit_gcm_decrypt

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Lexit_gcm_decrypt

        # ;; Check pblocklen != NULL
        test $arg3,$arg3
        jz .Lexit_gcm_decrypt

        # ;; Check in != NULL
        test $arg4,$arg4
        jz .Lexit_gcm_decrypt

        # ;; Check len != 0
        cmp \$0,$arg5
        jz .Lexit_gcm_decrypt

        # ;; Check out != NULL
        cmp \$0,$arg6
        jz .Lexit_gcm_decrypt
___
}
$code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov `4*15*4`($arg1),%eax
        cmp \$9,%eax
        je .Laes_gcm_decrypt_128_avx512
        cmp \$11,%eax
        je .Laes_gcm_decrypt_192_avx512
        cmp \$13,%eax
        je .Laes_gcm_decrypt_256_avx512
        xor %eax,%eax
        jmp .Lexit_gcm_decrypt
___
for my $keylen (sort keys %aes_rounds) {
    $NROUNDS = $aes_rounds{$keylen};
    $code .= <<___;
.align 32
.Laes_gcm_decrypt_${keylen}_avx512:
___
    &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
    $code .= "jmp .Lexit_gcm_decrypt\n";
}
$code .= ".Lexit_gcm_decrypt:\n";
&EPILOG(1, $arg5);
$code .= <<___;
ret
.Ldecrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_finalize_avx512
# ;     (void *gcm128ctx,
# ;      unsigned int pblocklen);
# ;
# ; Finalizes encryption / decryption.
# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
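# ; (The finished tag is stored back over the AadHash field of the context; see the
# ; vmovdqu store at .L_return_T above. The caller is expected to copy out however
# ; many tag bytes it needs.)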
$code .= <<___;
.globl ossl_aes_gcm_finalize_avx512
.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_finalize_avx512:
.cfi_startproc
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check gcm128ctx != NULL
        test $arg1,$arg1
        jz .Labort_finalize
___
}

&GCM_COMPLETE("$arg1", "$arg2");

$code .= <<___;
.Labort_finalize:
ret
.cfi_endproc
.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_gcm_gmult_avx512(u64 Xi[2],
# ;                           const void *gcm128ctx)
# ;
# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
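# ; Computes Xi = Xi * H in GF(2^128), i.e. a single GHASH multiplication by the
# ; first hash key stored in the context.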
$code .= <<___;
.globl ossl_gcm_gmult_avx512
.hidden ossl_gcm_gmult_avx512
.type ossl_gcm_gmult_avx512,\@abi-omnipotent
.align 32
ossl_gcm_gmult_avx512:
.cfi_startproc
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check Xi != NULL
        test $arg1,$arg1
        jz .Labort_gmult

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Labort_gmult
___
}
$code .= "vmovdqu64 ($arg1),%xmm1\n";
$code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";

&GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");

$code .= "vmovdqu64 %xmm1,($arg1)\n";
if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
} else {
    $code .= "vzeroupper\n";
}
$code .= <<___;
.Labort_gmult:
ret
.cfi_endproc
.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
___

if ($win64) {

    # Add unwind metadata for SEH.

    # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
    my $UWOP_PUSH_NONVOL = 0;
    my $UWOP_ALLOC_LARGE = 1;
    my $UWOP_SET_FPREG = 3;
    my $UWOP_SAVE_XMM128 = 8;
    my %UWOP_REG_NUMBER = (
        rax => 0,
        rcx => 1,
        rdx => 2,
        rbx => 3,
        rsp => 4,
        rbp => 5,
        rsi => 6,
        rdi => 7,
        map(("r$_" => $_), (8 .. 15)));

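    # Each unwind entry below is encoded as two-byte "slots": the first slot
    # packs the offset (within the prolog) of the end of the relevant
    # instruction with the unwind operation in the low nibble and its op-info
    # in the high nibble; ops such as SAVE_XMM128 and ALLOC_LARGE take an
    # extra slot holding a 16-bit operand (.value).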
    $code .= <<___;
.section .pdata
.align 4
        .rva .Lsetiv_seh_begin
        .rva .Lsetiv_seh_end
        .rva .Lsetiv_seh_info

        .rva .Lghash_seh_begin
        .rva .Lghash_seh_end
        .rva .Lghash_seh_info

        .rva .Lencrypt_seh_begin
        .rva .Lencrypt_seh_end
        .rva .Lencrypt_seh_info

        .rva .Ldecrypt_seh_begin
        .rva .Ldecrypt_seh_end
        .rva .Ldecrypt_seh_info

.section .xdata
___

    foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") {
        $code .= <<___;
.align 8
.L${func_name}_seh_info:
        .byte 1   # version 1, no flags
        .byte .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin
        .byte 31  # num_slots = 1*8 + 2 + 1 + 2*10
        # FR = rbp; Offset from RSP = $XMM_STORAGE scaled on 16
        .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]}
___

        # Metadata for %xmm15-%xmm6
        # Occupy 2 slots each
        for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {

            # Scaled-by-16 stack offset
            my $xmm_reg_offset = ($reg_idx - 6);
            $code .= <<___;
        .byte .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]}
        .value $xmm_reg_offset
___
        }

        $code .= <<___;
        # Frame pointer (occupy 1 slot)
        .byte .L${func_name}_seh_setfp-.L${func_name}_seh_begin
        .byte $UWOP_SET_FPREG

        # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
        .byte .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin
        .byte $UWOP_ALLOC_LARGE
        .value `($XMM_STORAGE + 8) / 8`
___

        # Metadata for GPR regs
        # Occupy 1 slot each
        foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") {
            $code .= <<___;
        .byte .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin
        .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]}
___
        }
    }
}

$code .= <<___;
.data
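# ;; Note on the constants below: POLY is the GCM reduction polynomial in its
# ;; bit-reflected form (the 0xC2... constant encodes x^128 + x^7 + x^2 + x + 1),
# ;; and POLY2 appears to be a pre-shifted variant of the same constant, repeated
# ;; across all four 128-bit lanes so it can feed 512-bit wide reductions.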
.align 16
POLY: .quad 0x0000000000000001, 0xC200000000000000

.align 64
POLY2:
        .quad 0x00000001C2000000, 0xC200000000000000
        .quad 0x00000001C2000000, 0xC200000000000000
        .quad 0x00000001C2000000, 0xC200000000000000
        .quad 0x00000001C2000000, 0xC200000000000000

.align 16
TWOONE: .quad 0x0000000000000001, 0x0000000100000000

# ;;; Order of these constants should not change.
# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
.align 64
SHUF_MASK:
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607

.align 16
SHIFT_MASK:
        .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908

ALL_F:
        .quad 0xffffffffffffffff, 0xffffffffffffffff

ZERO:
        .quad 0x0000000000000000, 0x0000000000000000

.align 16
ONE:
        .quad 0x0000000000000001, 0x0000000000000000

.align 16
ONEf:
        .quad 0x0000000000000000, 0x0100000000000000

.align 64
ddq_add_1234:
        .quad 0x0000000000000001, 0x0000000000000000
        .quad 0x0000000000000002, 0x0000000000000000
        .quad 0x0000000000000003, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000

.align 64
ddq_add_5678:
        .quad 0x0000000000000005, 0x0000000000000000
        .quad 0x0000000000000006, 0x0000000000000000
        .quad 0x0000000000000007, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000

.align 64
ddq_add_4444:
        .quad 0x0000000000000004, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000

.align 64
ddq_add_8888:
        .quad 0x0000000000000008, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000

.align 64
ddq_addbe_1234:
        .quad 0x0000000000000000, 0x0100000000000000
        .quad 0x0000000000000000, 0x0200000000000000
        .quad 0x0000000000000000, 0x0300000000000000
        .quad 0x0000000000000000, 0x0400000000000000

.align 64
ddq_addbe_4444:
        .quad 0x0000000000000000, 0x0400000000000000
        .quad 0x0000000000000000, 0x0400000000000000
        .quad 0x0000000000000000, 0x0400000000000000
        .quad 0x0000000000000000, 0x0400000000000000

.align 64
byte_len_to_mask_table:
        .value 0x0000, 0x0001, 0x0003, 0x0007
        .value 0x000f, 0x001f, 0x003f, 0x007f
        .value 0x00ff, 0x01ff, 0x03ff, 0x07ff
        .value 0x0fff, 0x1fff, 0x3fff, 0x7fff
        .value 0xffff
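# ;; A byte count n in 0..16 indexes a 16-bit mask with the low n bits set
# ;; (for example, n = 5 yields 0x001f), suitable for mask-register loads and
# ;; stores of partial 16-byte blocks; byte64_len_to_mask_table below plays the
# ;; same role for partial 64-byte (ZMM) blocks.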

.align 64
byte64_len_to_mask_table:
        .quad 0x0000000000000000, 0x0000000000000001
        .quad 0x0000000000000003, 0x0000000000000007
        .quad 0x000000000000000f, 0x000000000000001f
        .quad 0x000000000000003f, 0x000000000000007f
        .quad 0x00000000000000ff, 0x00000000000001ff
        .quad 0x00000000000003ff, 0x00000000000007ff
        .quad 0x0000000000000fff, 0x0000000000001fff
        .quad 0x0000000000003fff, 0x0000000000007fff
        .quad 0x000000000000ffff, 0x000000000001ffff
        .quad 0x000000000003ffff, 0x000000000007ffff
        .quad 0x00000000000fffff, 0x00000000001fffff
        .quad 0x00000000003fffff, 0x00000000007fffff
        .quad 0x0000000000ffffff, 0x0000000001ffffff
        .quad 0x0000000003ffffff, 0x0000000007ffffff
        .quad 0x000000000fffffff, 0x000000001fffffff
        .quad 0x000000003fffffff, 0x000000007fffffff
        .quad 0x00000000ffffffff, 0x00000001ffffffff
        .quad 0x00000003ffffffff, 0x00000007ffffffff
        .quad 0x0000000fffffffff, 0x0000001fffffffff
        .quad 0x0000003fffffffff, 0x0000007fffffffff
        .quad 0x000000ffffffffff, 0x000001ffffffffff
        .quad 0x000003ffffffffff, 0x000007ffffffffff
        .quad 0x00000fffffffffff, 0x00001fffffffffff
        .quad 0x00003fffffffffff, 0x00007fffffffffff
        .quad 0x0000ffffffffffff, 0x0001ffffffffffff
        .quad 0x0003ffffffffffff, 0x0007ffffffffffff
        .quad 0x000fffffffffffff, 0x001fffffffffffff
        .quad 0x003fffffffffffff, 0x007fffffffffffff
        .quad 0x00ffffffffffffff, 0x01ffffffffffffff
        .quad 0x03ffffffffffffff, 0x07ffffffffffffff
        .quad 0x0fffffffffffffff, 0x1fffffffffffffff
        .quad 0x3fffffffffffffff, 0x7fffffffffffffff
        .quad 0xffffffffffffffff
___

} else {
# Fallback for old assembler
$code .= <<___;
.text
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
ossl_vaes_vpclmulqdq_capable:
        xor %eax,%eax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable

.globl ossl_aes_gcm_init_avx512
.globl ossl_aes_gcm_setiv_avx512
.globl ossl_aes_gcm_update_aad_avx512
.globl ossl_aes_gcm_encrypt_avx512
.globl ossl_aes_gcm_decrypt_avx512
.globl ossl_aes_gcm_finalize_avx512
.globl ossl_gcm_gmult_avx512

.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
ossl_aes_gcm_init_avx512:
ossl_aes_gcm_setiv_avx512:
ossl_aes_gcm_update_aad_avx512:
ossl_aes_gcm_encrypt_avx512:
ossl_aes_gcm_decrypt_avx512:
ossl_aes_gcm_finalize_avx512:
ossl_gcm_gmult_avx512:
        .byte 0x0f,0x0b  # ud2
        ret
.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";