# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
# (https://github.com/intel/intel-ipsec-mb).
# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
#
# References:
# [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
#     Intel Architecture Processors. August, 2010.
# [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
#     Intel Architecture Processors. October, 2012.
# [3] Shay Gueron et al. Intel Carry-Less Multiplication Instruction and its
#     Usage for Computing the GCM Mode. May, 2010.
#
#
# December 2021
#
# Initial release.
#
# GCM128_CONTEXT structure has storage for 16 hkeys only, but this
# implementation can use up to 48. To avoid extending the context size,
# precompute and store only the first 16 hkeys in the context, and compute
# the rest on demand, keeping them in the local stack frame.
#
#======================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$avx512vaes = 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  $avx512vaes = ($1 >= 2.30);
}

if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}

if (!$avx512vaes && `$ENV{CC} -v 2>&1`
  =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
  my $ver = $3 + $4/100.0 + $5/10000.0;    # 3.1.0->3.01, 3.10.1->3.1001
  if ($1) {
    # Apple conditions: they use a different version series, see
    # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
    # clang 7.0.0 is Apple clang 10.0.1
    $avx512vaes = ($ver >= 10.0001);
  } else {
    $avx512vaes = ($ver >= 7.0);
  }
}

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;

#======================================================================
if ($avx512vaes>0) { #<<<

$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
        mov     OPENSSL_ia32cap_P+8(%rip), %rcx
        # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
        mov     \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
        xor     %eax,%eax
        and     %rdx,%rcx
        cmp     %rdx,%rcx
        cmove   %rcx,%rax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___
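
# ; Note on the mask above: the 64-bit word at OPENSSL_ia32cap_P+8 packs
# ; CPUID.(EAX=7,ECX=0):EBX in its low half and CPUID.(EAX=7,ECX=0):ECX in its
# ; high half, so the constant selects AVX512F (16), AVX512DQ (17),
# ; AVX512BW (30) and AVX512VL (31) from EBX, plus VAES (ECX bit 9 -> bit 41)
# ; and VPCLMULQDQ (ECX bit 10 -> bit 42). The routine returns the mask itself
# ; (non-zero) only when all six features are present, and zero otherwise.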

# ; Mapping key length -> AES rounds count
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements
my $HKEYS_STORAGE_CAPACITY = 48;
my $LOCAL_STORAGE_CAPACITY = 48;
my $HKEYS_CONTEXT_CAPACITY = 16;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from point (8)
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary

my $GP_STORAGE    = $win64 ? 8 * 8 : 8 * 6;                         # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE   = $win64 ? (10 * 16) : 0;                         # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for up to 48 AES blocks

my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
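
# ; Worked example of the layout (derived from the constants above, Linux
# ; build with both storages requested): after PROLOG aligns %rsp, HKEYS
# ; storage occupies 0..767(%rsp) and LOCAL storage 768..1535(%rsp); the
# ; alignment slack lies between LOCAL storage and the frame pointer %rbp,
# ; with the saved GP registers and stack arguments above it.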

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);

# ; This implementation follows the convention: for non-leaf functions (they
# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
# ; helps to facilitate SEH handlers writing.
#
# ; Leaf functions here do not use more than 4 input arguments.
if ($win64) {
  $arg1  = "%rcx";
  $arg2  = "%rdx";
  $arg3  = "%r8";
  $arg4  = "%r9";
  $arg5  = "`$GP_STORAGE + 8 + 8*5`(%rbp)";    # +8 - alignment bytes
  $arg6  = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  $arg7  = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
} else {
  $arg1  = "%rdi";
  $arg2  = "%rsi";
  $arg3  = "%rdx";
  $arg4  = "%rcx";
  $arg5  = "%r8";
  $arg6  = "%r9";
  $arg7  = "`$GP_STORAGE + 8*1`(%rbp)";
  $arg8  = "`$GP_STORAGE + 8*2`(%rbp)";
  $arg9  = "`$GP_STORAGE + 8*3`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
}

# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
my $CTX_OFFSET_CurCount  = (16 * 0);          # ; (Yi) Current counter for generation of encryption key
my $CTX_OFFSET_PEncBlock = (16 * 1);          # ; (repurposed EKi field) Partial block buffer
my $CTX_OFFSET_EK0       = (16 * 2);          # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
my $CTX_OFFSET_AadLen    = (16 * 3);          # ; (len.u[0]) Length of Hash which has been input
my $CTX_OFFSET_InLen     = ((16 * 3) + 8);    # ; (len.u[1]) Length of input data which will be encrypted or decrypted
my $CTX_OFFSET_AadHash   = (16 * 4);          # ; (Xi) Current hash
my $CTX_OFFSET_HTable    = (16 * 6);          # ; (Htable) Precomputed table (allows 16 values)

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Helper functions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; Generates "random" local labels
sub random_string() {
  my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  my $length = 15;
  my $str;
  map { $str .= $chars[rand(@chars)] } 1 .. $length;
  return $str;
}

sub BYTE {
  my ($reg) = @_;
  if ($reg =~ /%r[abcd]x/i) {
    $reg =~ s/%r([abcd])x/%${1}l/i;
  } elsif ($reg =~ /%r[sdb][ip]/i) {
    $reg =~ s/%r([sdb][ip])/%${1}l/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  } else {
    die "BYTE: unknown register: $reg\n";
  }
  return $reg;
}

sub WORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  } else {
    die "WORD: unknown register: $reg\n";
  }
  return $reg;
}

sub DWORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  } elsif ($reg =~ /%r[0-9]{1,2}/i) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  } else {
    die "DWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub XWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%xmm/i;
  } else {
    die "XWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub YWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%ymm/i;
  } else {
    die "YWORD: unknown register: $reg\n";
  }
  return $reg;
}

sub ZWORD {
  my ($reg) = @_;
  if ($reg =~ /%[xyz]mm/i) {
    $reg =~ s/%[xyz]mm/%zmm/i;
  } else {
    die "ZWORD: unknown register: $reg\n";
  }
  return $reg;
}
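
# ; Example: XWORD("%zmm5") yields "%xmm5", YWORD("%xmm5") yields "%ymm5",
# ; DWORD("%r10") yields "%r10d" and BYTE("%rax") yields "%al".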

# ; Helper function to construct effective address based on two kinds of
# ; offsets: numerical or located in the register
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement = 0 if (!$displacement);

  if ($offset =~ /^\d+\z/) {    # numerical offset
    return "`$offset + $displacement`($base)";
  } else {                      # offset resides in register
    return "$displacement($base,$offset,1)";
  }
}

# ; Provides memory location of corresponding HashKey power
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $base_str = ($base eq "%rsp") ? "frame" : "context";

  my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  return "$offset($base)";
}

# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  my $offset_base;
  my $offset_idx;
  if ($base eq "frame") {    # frame storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    $offset_base = $STACK_HKEYS_OFFSET;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  } else {                   # context storage
    die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
    $offset_base = $CTX_OFFSET_HTable;
    $offset_idx  = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  }
  return $offset_base + $offset_idx;
}
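
# ; Example: in frame storage, HashKey^48 (the highest power) sits at 0(%rsp)
# ; and HashKey^1 at 47*16 = 752(%rsp); in context storage, HashKey^16 is at
# ; Htable+0 and HashKey^1 at Htable+240. Keys are laid out with the highest
# ; power at the lowest address, so a single 64-byte ZMM load at
# ; HashKeyByIdx(i) fetches HashKey^i down to HashKey^(i-3).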

# ; Creates local frame and does back up of non-volatile registers.
# ; Holds stack unwinding directives.
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;

  my $DYNAMIC_STACK_ALLOC_SIZE            = 0;
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  $code .= <<___;
        push    %rbx
.cfi_push %rbx
.L${func_name}_seh_push_rbx:
        push    %rbp
.cfi_push %rbp
.L${func_name}_seh_push_rbp:
        push    %r12
.cfi_push %r12
.L${func_name}_seh_push_r12:
        push    %r13
.cfi_push %r13
.L${func_name}_seh_push_r13:
        push    %r14
.cfi_push %r14
.L${func_name}_seh_push_r14:
        push    %r15
.cfi_push %r15
.L${func_name}_seh_push_r15:
___

  if ($win64) {
    $code .= <<___;
        push    %rdi
.L${func_name}_seh_push_rdi:
        push    %rsi
.L${func_name}_seh_push_rsi:

        sub     \$`$XMM_STORAGE+8`,%rsp    # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }
  $code .= <<___;
        # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
        # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
        # ; handlers. The requirement for a frame pointer is that its offset from
        # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
        # ; itself seems to be reasonable to use here, because later we do 64-byte stack
        # ; alignment which gives us non-determinate offsets and complicates writing
        # ; SEH handlers.
        #
        # ; It also serves as an anchor for retrieving stack arguments on both Linux
        # ; and Windows.
        lea     `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
        # Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    $code .= <<___;
        sub     \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
        and     \$(-64),%rsp
___
  }
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Restore register content for the caller.
# ;;; And cleanup stack.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub EPILOG {
  my ($hkeys_storage_on_stack, $payload_len) = @_;

  my $rndsuffix = &random_string();

  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {

    # ; There is no need for hkeys cleanup if the payload length was small, i.e. no
    # ; hkeys were stored in the local frame storage
    $code .= <<___;
        cmpq    \$`16*16`,$payload_len
        jbe     .Lskip_hkeys_cleanup_${rndsuffix}
        vpxor   %xmm0,%xmm0,%xmm0
___
    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
      $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
    }
    $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
  }

  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }

  if ($win64) {

    # ; restore xmm15:xmm6
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
___
    }
  }

  if ($win64) {

    # Forming valid epilog for SEH with use of frame pointer.
    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
    $code .= "lea 8(%rbp),%rsp\n";
  } else {
    $code .= "lea (%rbp),%rsp\n";
    $code .= ".cfi_def_cfa_register %rsp\n";
  }

  if ($win64) {
    $code .= <<___;
        pop     %rsi
.cfi_pop %rsi
        pop     %rdi
.cfi_pop %rdi
___
  }
  $code .= <<___;
        pop     %r15
.cfi_pop %r15
        pop     %r14
.cfi_pop %r14
        pop     %r13
.cfi_pop %r13
        pop     %r12
.cfi_pop %r12
        pop     %rbp
.cfi_pop %rbp
        pop     %rbx
.cfi_pop %rbx
___
}

# ; Clears all scratch ZMM registers
# ;
# ; It should be called before restoring the XMM registers
# ; for Windows (XMM6-XMM15).
# ;
sub clear_scratch_zmms_asm {

  # ; On Linux, all ZMM registers are scratch registers
  if (!$win64) {
    $code .= "vzeroall\n";
  } else {
    foreach my $i (0 .. 5) {
      $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
    }
  }
  foreach my $i (16 .. 31) {
    $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  }
}

# Clears all scratch GP registers
sub clear_scratch_gps_asm {
  foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
    $code .= "xor $reg,$reg\n";
  }
  if (!$win64) {
    foreach my $reg ("%rsi", "%rdi") {
      $code .= "xor $reg,$reg\n";
    }
  }
}

sub precompute_hkeys_on_stack {
  my $GCM128_CTX  = $_[0];
  my $HKEYS_READY = $_[1];
  my $ZTMP0       = $_[2];
  my $ZTMP1       = $_[3];
  my $ZTMP2       = $_[4];
  my $ZTMP3       = $_[5];
  my $ZTMP4       = $_[6];
  my $ZTMP5       = $_[7];
  my $ZTMP6       = $_[8];
  my $HKEYS_RANGE = $_[9];    # ; "first16", "mid16", "all", "first32", "last32"

  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
    if ($HKEYS_RANGE ne "first16"
    && $HKEYS_RANGE ne "mid16"
    && $HKEYS_RANGE ne "all"
    && $HKEYS_RANGE ne "first32"
    && $HKEYS_RANGE ne "last32");

  my $rndsuffix = &random_string();

  $code .= <<___;
        test    $HKEYS_READY,$HKEYS_READY
        jnz     .L_skip_hkeys_precomputation_${rndsuffix}
___

  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {

    # ; Fill the stack with the first 16 hkeys from the context
    $code .= <<___;
        # ; Move 16 hkeys from the context to stack
        vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
        vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
        vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
        vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}

        vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
        vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
___
  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1

        # ; broadcast HashKey^8
        vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
        vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
___

  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=17..32
    my $i = 20;
    foreach (1 .. int((32 - 16) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
    my $i = 36;
    foreach (1 .. int((48 - 32) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
}
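
# ; Note: each GHASH_MUL above multiplies a ZMM holding four consecutive
# ; HashKey powers by the broadcast HashKey^8 kept in $ZTMP1, which advances
# ; all four powers by 8, e.g. {H^12..H^9} in $ZTMP2 becomes {H^20..H^17}
# ; and is stored at index 20.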

# ;; =============================================================================
# ;; Generic macro to produce code that executes $OPCODE instruction
# ;; on selected number of AES blocks (16 bytes long) between 0 and 16.
# ;; All three operands of the instruction come from registers.
# ;; Note: if 3 blocks are left at the end, an instruction is produced that
# ;; operates on all 4 blocks (full width of ZMM)
sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  my $NUM_BLOCKS = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OPCODE     = $_[1];    # [in] instruction name
  my @DST;
  $DST[0] = $_[2];           # [out] destination ZMM register
  $DST[1] = $_[3];           # [out] destination ZMM register
  $DST[2] = $_[4];           # [out] destination ZMM register
  $DST[3] = $_[5];           # [out] destination ZMM register
  my @SRC1;
  $SRC1[0] = $_[6];          # [in] source 1 ZMM register
  $SRC1[1] = $_[7];          # [in] source 1 ZMM register
  $SRC1[2] = $_[8];          # [in] source 1 ZMM register
  $SRC1[3] = $_[9];          # [in] source 1 ZMM register
  my @SRC2;
  $SRC2[0] = $_[10];         # [in] source 2 ZMM register
  $SRC2[1] = $_[11];         # [in] source 2 ZMM register
  $SRC2[2] = $_[12];         # [in] source 2 ZMM register
  $SRC2[3] = $_[13];         # [in] source 2 ZMM register

  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $reg_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  foreach (1 .. ($NUM_BLOCKS / 4)) {
    $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
    $reg_idx++;
    $blocks_left -= 4;
  }

  my $DSTREG  = $DST[$reg_idx];
  my $SRC1REG = $SRC1[$reg_idx];
  my $SRC2REG = $SRC2[$reg_idx];

  if ($blocks_left == 1) {
    $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 2) {
    $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  } elsif ($blocks_left == 3) {
    $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  }
}
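
# ; Example: NUM_BLOCKS=7 emits a full-ZMM $OPCODE for blocks 0-3 and, since
# ; 3 blocks remain, a second full-ZMM one covering lanes 4-7; NUM_BLOCKS=6
# ; would instead finish with a YMM-width instruction for blocks 4-5. The
# ; extra lane computed in the 3-block case is assumed to be masked out by
# ; the callers when the results are stored.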

# ;; =============================================================================
# ;; Loads specified number of AES blocks into ZMM registers using mask register
# ;; for the last loaded register (xmm, ymm or zmm).
# ;; Loads take place at 1 byte granularity.
sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $INP         = $_[1];    # [in] input data pointer to read from
  my $DATA_OFFSET = $_[2];    # [in] offset to the input pointer (GP or numerical)
  my @DST;
  $DST[0] = $_[3];            # [out] ZMM register with loaded data
  $DST[1] = $_[4];            # [out] ZMM register with loaded data
  $DST[2] = $_[5];            # [out] ZMM register with loaded data
  $DST[3] = $_[6];            # [out] ZMM register with loaded data
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $src_offset  = 0;
  my $dst_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
      $src_offset += 64;
      $dst_idx++;
      $blocks_left -= 4;
    }
  }

  my $DSTREG = $DST[$dst_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif (($blocks_left == 3 || $blocks_left == 4)) {
    $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  }
}

# ;; =============================================================================
# ;; Stores specified number of AES blocks from ZMM registers with mask register
# ;; for the last stored register (xmm, ymm or zmm).
# ;; Stores take place at 1 byte granularity.
sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  my $NUM_BLOCKS  = $_[0];    # [in] numerical value, number of AES blocks (0 to 16)
  my $OUTP        = $_[1];    # [in] output data pointer to write to
  my $DATA_OFFSET = $_[2];    # [in] offset to the output pointer (GP or numerical)
  my @SRC;
  $SRC[0] = $_[3];            # [in] ZMM register with data to store
  $SRC[1] = $_[4];            # [in] ZMM register with data to store
  $SRC[2] = $_[5];            # [in] ZMM register with data to store
  $SRC[3] = $_[6];            # [in] ZMM register with data to store
  my $MASK = $_[7];           # [in] mask register

  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $dst_offset  = 0;
  my $src_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  if ($NUM_BLOCKS > 0) {
    foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
      $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
      $dst_offset += 64;
      $src_idx++;
      $blocks_left -= 4;
    }
  }

  my $SRCREG = $SRC[$src_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  }
}

# ;;; ===========================================================================
# ;;; Handles AES encryption rounds
# ;;; It handles special cases: the first and last rounds
# ;;; Optionally, it performs XOR with data after the last AES round.
# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
# ;;; If 3 blocks are trailing, the operation is performed on the whole ZMM (4 blocks).
sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  my $L0B0_3   = $_[0];     # [in/out] zmm; blocks 0 to 3
  my $L0B4_7   = $_[1];     # [in/out] zmm; blocks 4 to 7
  my $L0B8_11  = $_[2];     # [in/out] zmm; blocks 8 to 11
  my $L0B12_15 = $_[3];     # [in/out] zmm; blocks 12 to 15
  my $KEY      = $_[4];     # [in] zmm containing round key
  my $ROUND    = $_[5];     # [in] round number
  my $D0_3     = $_[6];     # [in] zmm or no_data; plain/cipher text blocks 0-3
  my $D4_7     = $_[7];     # [in] zmm or no_data; plain/cipher text blocks 4-7
  my $D8_11    = $_[8];     # [in] zmm or no_data; plain/cipher text blocks 8-11
  my $D12_15   = $_[9];     # [in] zmm or no_data; plain/cipher text blocks 12-15
  my $NUMBL    = $_[10];    # [in] number of blocks; numerical value
  my $NROUNDS  = $_[11];    # [in] number of rounds; numerical value

  # ;;; === first AES round
  if ($ROUND < 1) {

    # ;; round 0
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL,  "vpxorq", $L0B0_3,   $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11, $L0B12_15, $KEY,    $KEY,     $KEY,      $KEY);
  }

  # ;;; === middle AES rounds
  if ($ROUND >= 1 && $ROUND <= $NROUNDS) {

    # ;; rounds 1 to 9/11/13
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL,  "vaesenc", $L0B0_3,   $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11,  $L0B12_15, $KEY,    $KEY,     $KEY,      $KEY);
  }

  # ;;; === last AES round
  if ($ROUND > $NROUNDS) {

    # ;; the last round - mix enclast with text xor's
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL,  "vaesenclast", $L0B0_3,   $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
      $L0B4_7, $L0B8_11,      $L0B12_15, $KEY,    $KEY,     $KEY,      $KEY);

    # ;;; === XOR with data
    if (   ($D0_3 ne "no_data")
        && ($D4_7 ne "no_data")
        && ($D8_11 ne "no_data")
        && ($D12_15 ne "no_data"))
    {
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUMBL,  "vpxorq", $L0B0_3,   $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
        $L0B4_7, $L0B8_11, $L0B12_15, $D0_3,   $D4_7,    $D8_11,    $D12_15);
    }
  }
}

# ;;; Horizontal XOR - 4 x 128bits xored together
sub VHPXORI4x128 {
  my $REG = $_[0];    # [in/out] ZMM with 4x128bits to xor; 128bit output
  my $TMP = $_[1];    # [clobbered] ZMM temporary register
  $code .= <<___;
        vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
        vpxorq  @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
        vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
        vpxorq  @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
___
}
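
# ; The fold is 512 -> 256 -> 128 bits: the upper 256 bits are extracted and
# ; xored onto the lower 256 bits, then the upper 128 bits of that result onto
# ; the lower 128 bits, leaving the XOR of all four lanes in the low xmm of $REG.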

# ;;; AVX512 reduction macro
sub VCLMUL_REDUCE {
  my $OUT   = $_[0];    # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  my $POLY  = $_[1];    # [in] zmm/ymm/xmm: polynomial
  my $HI128 = $_[2];    # [in] zmm/ymm/xmm: high 128b of hash to reduce
  my $LO128 = $_[3];    # [in] zmm/ymm/xmm: low 128b of hash to reduce
  my $TMP0  = $_[4];    # [clobbered] zmm/ymm/xmm: temporary register
  my $TMP1  = $_[5];    # [clobbered] zmm/ymm/xmm: temporary register

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; first phase of the reduction
        vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
        vpslldq \$8,$TMP0,$TMP0           # ; shift-L 2 DWs
        vpxorq  $TMP0,$LO128,$TMP0        # ; first phase of the reduction complete
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; second phase of the reduction
        vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
        vpsrldq \$4,$TMP1,$TMP1           # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
        vpslldq \$4,$OUT,$OUT             # ; shift-L 1-DW to obtain result with no shifts
        vpternlogq \$0x96,$HI128,$TMP1,$OUT    # ; OUT/GHASH = OUT xor TMP1 xor HI128
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}
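
# ; A note on the constants used above: POLY2 (defined with the other data
# ; constants elsewhere in this file) holds the bit-reflected GCM reduction
# ; polynomial used by both phases, and the vpternlogq immediate 0x96 is the
# ; truth table of a three-way XOR (a xor b xor c), merging the final terms
# ; in a single instruction.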

# ;; ===========================================================================
# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
# ;; - it is assumed that data read from $INPTR is already shuffled and
# ;;   $INPTR address is 64 byte aligned
# ;; - there is an option to pass ready blocks through ZMM registers too; in
# ;;   such a case the 4 extra parameters need to be passed and the 21st
# ;;   ($ZTMP9) argument can be empty
sub GHASH_16 {
  my $TYPE  = $_[0];     # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
                         #      end_reduce (end with reduction), start_reduce
  my $GH    = $_[1];     # [in/out] ZMM ghash sum: high 128-bits
  my $GM    = $_[2];     # [in/out] ZMM ghash sum: middle 128-bits
  my $GL    = $_[3];     # [in/out] ZMM ghash sum: low 128-bits
  my $INPTR = $_[4];     # [in] data input pointer
  my $INOFF = $_[5];     # [in] data input offset
  my $INDIS = $_[6];     # [in] data input displacement
  my $HKPTR = $_[7];     # [in] hash key pointer
  my $HKOFF = $_[8];     # [in] hash key offset (can be either numerical offset, or register containing offset)
  my $HKDIS = $_[9];     # [in] hash key displacement
  my $HASH  = $_[10];    # [in/out] ZMM hash value in/out
  my $ZTMP0 = $_[11];    # [clobbered] temporary ZMM
  my $ZTMP1 = $_[12];    # [clobbered] temporary ZMM
  my $ZTMP2 = $_[13];    # [clobbered] temporary ZMM
  my $ZTMP3 = $_[14];    # [clobbered] temporary ZMM
  my $ZTMP4 = $_[15];    # [clobbered] temporary ZMM
  my $ZTMP5 = $_[16];    # [clobbered] temporary ZMM
  my $ZTMP6 = $_[17];    # [clobbered] temporary ZMM
  my $ZTMP7 = $_[18];    # [clobbered] temporary ZMM
  my $ZTMP8 = $_[19];    # [clobbered] temporary ZMM
  my $ZTMP9 = $_[20];    # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
  my $DAT0  = $_[21];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT1  = $_[22];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT2  = $_[23];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT3  = $_[24];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)

  my $start_ghash  = 0;
  my $do_reduction = 0;
  if ($TYPE eq "start") {
    $start_ghash = 1;
  }

  if ($TYPE eq "start_reduce") {
    $start_ghash  = 1;
    $do_reduction = 1;
  }

  if ($TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

  # ;; ghash blocks 0-3
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT0;
  }

  if ($start_ghash != 0) {
    $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0    # ; T0H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1    # ; T0L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2    # ; T0M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3    # ; T0M2 = a0*b1
___

  # ;; ghash blocks 4-7
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT1;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4    # ; T1H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5    # ; T1L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6    # ; T1M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7    # ; T1M2 = a0*b1
___

  # ;; update sums
  if ($start_ghash != 0) {
    $code .= <<___;
        vpxorq  $ZTMP6,$ZTMP2,$GM    # ; GM = T0M1 + T1M1
        vpxorq  $ZTMP4,$ZTMP0,$GH    # ; GH = T0H + T1H
        vpxorq  $ZTMP5,$ZTMP1,$GL    # ; GL = T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM    # ; GM += T0M2 + T1M2
___
  } else {    # ;; mid, end, end_reduce
    $code .= <<___;
        vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM    # ; GM += T0M1 + T1M1
        vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH    # ; GH += T0H + T1H
        vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL    # ; GL += T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM    # ; GM += T0M2 + T1M2
___
  }

  # ;; ghash blocks 8-11
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT2;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0    # ; T0H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1    # ; T0L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2    # ; T0M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3    # ; T0M2 = a0*b1
___

  # ;; ghash blocks 12-15
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT3;
  }
  $code .= <<___;
        vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
        vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4    # ; T1H = a1*b1
        vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5    # ; T1L = a0*b0
        vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6    # ; T1M1 = a1*b0
        vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7    # ; T1M2 = a0*b1
        # ;; update sums
        vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM    # ; GM += T0M1 + T1M1
        vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH    # ; GH += T0H + T1H
        vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL    # ; GL += T0L + T1L
        vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM    # ; GM += T0M2 + T1M2
___
  if ($do_reduction != 0) {
    $code .= <<___;
        # ;; integrate GM into GH and GL
        vpsrldq \$8,$GM,$ZTMP0
        vpslldq \$8,$GM,$ZTMP1
        vpxorq  $ZTMP0,$GH,$GH
        vpxorq  $ZTMP1,$GL,$GL
___

    # ;; add GH and GL 128-bit words horizontally
    &VHPXORI4x128($GH, $ZTMP0);
    &VHPXORI4x128($GL, $ZTMP1);

    # ;; reduction
    $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
    &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
  }
}

# ;; ===========================================================================
# ;; GHASH 1 to 16 blocks of cipher text
# ;; - performs reduction at the end
# ;; - it doesn't load the data; it is assumed the data is already loaded and shuffled
sub GHASH_1_TO_16 {
  my $GCM128_CTX  = $_[0];     # [in] pointer to expanded keys
  my $GHASH       = $_[1];     # [out] ghash output
  my $T0H         = $_[2];     # [clobbered] temporary ZMM
  my $T0L         = $_[3];     # [clobbered] temporary ZMM
  my $T0M1        = $_[4];     # [clobbered] temporary ZMM
  my $T0M2        = $_[5];     # [clobbered] temporary ZMM
  my $T1H         = $_[6];     # [clobbered] temporary ZMM
  my $T1L         = $_[7];     # [clobbered] temporary ZMM
  my $T1M1        = $_[8];     # [clobbered] temporary ZMM
  my $T1M2        = $_[9];     # [clobbered] temporary ZMM
  my $HK          = $_[10];    # [clobbered] temporary ZMM
  my $AAD_HASH_IN = $_[11];    # [in] input hash value
  my @CIPHER_IN;
  $CIPHER_IN[0] = $_[12];      # [in] ZMM with cipher text blocks 0-3
  $CIPHER_IN[1] = $_[13];      # [in] ZMM with cipher text blocks 4-7
  $CIPHER_IN[2] = $_[14];      # [in] ZMM with cipher text blocks 8-11
  $CIPHER_IN[3] = $_[15];      # [in] ZMM with cipher text blocks 12-15
  my $NUM_BLOCKS = $_[16];     # [in] numerical value, number of blocks
  my $GH         = $_[17];     # [in] ZMM with hi product part
  my $GM         = $_[18];     # [in] ZMM with mid product part
  my $GL         = $_[19];     # [in] ZMM with lo product part

  die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  if (scalar(@_) == 17) {
    $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
  }

  if ($NUM_BLOCKS == 16) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; L = a0*b0
        vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H
        vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; M2 = a0*b1
        vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
        vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2    # ; M2 = a0*b1
        vpxorq  $T1H,$T0H,$T1H
        vpxorq  $T1L,$T0L,$T1L
        vpxorq  $T1M1,$T0M1,$T1M1
        vpxorq  $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 12) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; L = a0*b0
        vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H
        vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0]    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1]    # ; M2 = a0*b1
        vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
        vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
___
  } elsif ($NUM_BLOCKS >= 8) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2    # ; M2 = a0*b1
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2    # ; M2 = a0*b1
        vpxorq  $T1H,$T0H,$T1H
        vpxorq  $T1L,$T0L,$T1L
        vpxorq  $T1M1,$T0M1,$T1M1
        vpxorq  $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 4) {
    $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L     # ; L = a0*b0
        vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2    # ; M2 = a0*b1
___
  }

  # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
  my $blocks_left = ($NUM_BLOCKS % 4);
  if ($blocks_left > 0) {

    # ;; =====================================================
    # ;; There are 1, 2 or 3 blocks left to process.
    # ;; It may also be that they are the only blocks to process.

    # ;; Set hash key and register index position for the remaining 1 to 3 blocks
    my $reg_idx = int($NUM_BLOCKS / 4);
    my $REG_IN  = $CIPHER_IN[$reg_idx];

    if ($blocks_left == 1) {
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
        vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]}    # ; M1 = a1*b0
        vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]}    # ; M2 = a0*b1
        vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]}     # ; H = a1*b1
        vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]}     # ; L = a0*b0
___
    } elsif ($blocks_left == 2) {
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]}    # ; M1 = a1*b0
        vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]}    # ; M2 = a0*b1
        vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]}     # ; H = a1*b1
        vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]}     # ; L = a0*b0
___
    } else {    # ; blocks_left == 3
      $code .= <<___;
        vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
        vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1    # ; M1 = a1*b0
        vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2    # ; M2 = a0*b1
        vpclmulqdq \$0x11,$HK,$REG_IN,$T0H     # ; H = a1*b1
        vpclmulqdq \$0x00,$HK,$REG_IN,$T0L     # ; L = a0*b0
___
    }

    if (scalar(@_) == 20) {

      # ;; *** GH/GM/GL passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq  $T1M1,$T0M1,$T0M1
        vpternlogq \$0x96,$T1M2,$GM,$T0M2
        vpternlogq \$0x96,$T1H,$GH,$T0H
        vpternlogq \$0x96,$T1L,$GL,$T0L
___
      } else {
        $code .= <<___;
        vpxorq  $GM,$T0M1,$T0M1
        vpxorq  $GH,$T0H,$T0H
        vpxorq  $GL,$T0L,$T0L
___
      }
    } else {

      # ;; *** GH/GM/GL NOT passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq  $T1M1,$T0M1,$T0M1
        vpxorq  $T1M2,$T0M2,$T0M2
        vpxorq  $T1H,$T0H,$T0H
        vpxorq  $T1L,$T0L,$T0L
___
      }
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq  $T0M2,$T0M1,$T0M1
        vpsrldq \$8,$T0M1,$T1M1
        vpslldq \$8,$T0M1,$T1M2
        vpxorq  $T1M1,$T0H,$T0H
        vpxorq  $T1M2,$T0L,$T0L
___
  } else {

    # ;; =====================================================
    # ;; number of blocks is 4, 8, 12 or 16
    # ;; T1H/L/M1/M2 include the product sums, not T0H/L/M1/M2
    if (scalar(@_) == 20) {
      $code .= <<___;
        # ;; *** GH/GM/GL passed as arguments
        vpxorq  $GM,$T1M1,$T1M1
        vpxorq  $GH,$T1H,$T1H
        vpxorq  $GL,$T1L,$T1L
___
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq  $T1M2,$T1M1,$T1M1
        vpsrldq \$8,$T1M1,$T0M1
        vpslldq \$8,$T1M1,$T0M2
        vpxorq  $T0M1,$T1H,$T0H
        vpxorq  $T0M2,$T1L,$T0L
___
  }

  # ;; add TH and TL 128-bit words horizontally
  &VHPXORI4x128($T0H, $T1M1);
  &VHPXORI4x128($T0L, $T1M2);

  # ;; reduction
  $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n";
  &VCLMUL_REDUCE(
    @{[XWORD($GHASH)]},
    @{[XWORD($HK)]},
    @{[XWORD($T0H)]},
    @{[XWORD($T0L)]},
    @{[XWORD($T0M1)]},
    @{[XWORD($T0M2)]});
}
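
# ; Dispatch summary for GHASH_1_TO_16: full groups of 4 blocks are multiplied
# ; against the matching descending HashKey powers (the 16 / >=12 / >=8 / >=4
# ; tiers above), any 1-3 block remainder is handled at xmm/ymm width, the
# ; middle products are folded into the high/low halves, and VCLMUL_REDUCE
# ; emits the final 128-bit ghash value.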

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 + x^121 + 1)
# ;; Input: A and B (128-bits each, bit-reflected)
# ;; Output: C = A*B*x mod poly, (i.e. >>1 )
# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
# ;;
# ;; Refer to [3] for more details.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GHASH_MUL {
  my $GH = $_[0];    #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
  my $HK = $_[1];    #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
  my $T1 = $_[2];    #; [clobbered] xmm/ymm/zmm
  my $T2 = $_[3];    #; [clobbered] xmm/ymm/zmm
  my $T3 = $_[4];    #; [clobbered] xmm/ymm/zmm

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpclmulqdq \$0x11,$HK,$GH,$T1    # ; $T1 = a1*b1
        vpclmulqdq \$0x00,$HK,$GH,$T2    # ; $T2 = a0*b0
        vpclmulqdq \$0x01,$HK,$GH,$T3    # ; $T3 = a1*b0
        vpclmulqdq \$0x10,$HK,$GH,$GH    # ; $GH = a0*b1
        vpxorq  $T3,$GH,$GH

        vpsrldq \$8,$GH,$T3    # ; shift-R $GH 2 DWs
        vpslldq \$8,$GH,$GH    # ; shift-L $GH 2 DWs
        vpxorq  $T3,$T1,$T1
        vpxorq  $T2,$GH,$GH

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ; first phase of the reduction
        vmovdqu64 POLY2(%rip),$T3

        vpclmulqdq \$0x01,$GH,$T3,$T2
        vpslldq \$8,$T2,$T2    # ; shift-L $T2 2 DWs
        vpxorq  $T2,$GH,$GH    # ; first phase of the reduction complete

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ; second phase of the reduction
        vpclmulqdq \$0x00,$GH,$T3,$T2
        vpsrldq \$4,$T2,$T2    # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq \$0x10,$GH,$T3,$GH
        vpslldq \$4,$GH,$GH    # ; shift-L 1-DW to obtain result with no shifts
        # ; second phase of the reduction complete, the result is in $GH
        vpternlogq \$0x96,$T2,$T1,$GH    # ; GH = GH xor T1 xor T2
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; PRECOMPUTE computes HashKey_i
sub PRECOMPUTE {
  my $GCM128_CTX = $_[0];    #; [in/out] context pointer, hkeys content updated
  my $HK         = $_[1];    #; [in] xmm, hash key
  my $T1         = $_[2];    #; [clobbered] xmm
  my $T2         = $_[3];    #; [clobbered] xmm
  my $T3         = $_[4];    #; [clobbered] xmm
  my $T4         = $_[5];    #; [clobbered] xmm
  my $T5         = $_[6];    #; [clobbered] xmm
  my $T6         = $_[7];    #; [clobbered] xmm

  my $ZT1 = &ZWORD($T1);
  my $ZT2 = &ZWORD($T2);
  my $ZT3 = &ZWORD($T3);
  my $ZT4 = &ZWORD($T4);
  my $ZT5 = &ZWORD($T5);
  my $ZT6 = &ZWORD($T6);

  my $YT1 = &YWORD($T1);
  my $YT2 = &YWORD($T2);
  my $YT3 = &YWORD($T3);
  my $YT4 = &YWORD($T4);
  my $YT5 = &YWORD($T5);
  my $YT6 = &YWORD($T6);

  $code .= <<___;
        vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
        vmovdqa $YT5,$YT4
___

  # ;; calculate HashKey^2<<1 mod poly
  &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);

  $code .= <<___;
        vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
        vinserti64x2 \$1,$HK,$YT4,$YT5
        vmovdqa64 $YT5,$YT6    # ;; YT6 = HashKey | HashKey^2
___

  # ;; use 2x128-bit computation
  # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
  &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3);    # ;; YT5 = HashKey^3 | HashKey^4

  $code .= <<___;
        vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}

        vinserti64x4 \$1,$YT6,$ZT5,$ZT5    # ;; ZT5 = YT6 | YT5

        # ;; switch to 4x128-bit computations now
        vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4    # ;; broadcast HashKey^4 across all ZT4
        vmovdqa64 $ZT5,$ZT6                 # ;; save HashKey^4 to HashKey^1 in ZT6
___

  # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= <<___;
        vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]}    # ;; HashKey^8 to HashKey^5 in ZT5 now
        vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4    # ;; broadcast HashKey^8 across all ZT4
___

  # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
  # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution

  # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
  &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";

  # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";

  # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys
}
1353 | ||
1354 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1355 | # ;; READ_SMALL_DATA_INPUT | |
1356 | # ;; Packs xmm register with data when data input is less or equal to 16 bytes | |
1357 | # ;; Returns 0 if data has length 0 | |
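# ;; The load is done with a masked vmovdqu8: byte_len_to_mask_table maps a
# ;; length n in 0..16 to a 16-bit mask with the low n bits set (the entries
# ;; are 2 bytes wide, hence the scale-2 indexing below). For example, with
# ;; LENGTH = 5 the kmovw picks up mask 0x001f, so exactly 5 bytes are read
# ;; and the rest of the xmm destination is zeroed by the {z} qualifier.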
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub READ_SMALL_DATA_INPUT {
  my $OUTPUT = $_[0];    # [out] xmm register
  my $INPUT = $_[1];     # [in] buffer pointer to read from
  my $LENGTH = $_[2];    # [in] number of bytes to read
  my $TMP1 = $_[3];      # [clobbered]
  my $TMP2 = $_[4];      # [clobbered]
  my $MASK = $_[5];      # [out] k1 to k7 register to store the partial block mask

  $code .= <<___;
        mov             \$16,@{[DWORD($TMP2)]}
        lea             byte_len_to_mask_table(%rip),$TMP1
        cmp             $TMP2,$LENGTH
        cmovc           $LENGTH,$TMP2
___
  if ($win64) {
    $code .= <<___;
        add             $TMP2,$TMP1
        add             $TMP2,$TMP1
        kmovw           ($TMP1),$MASK
___
  } else {
    $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n";
  }
  $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n";
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
# Output: The hash of the data (AAD_HASH).
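# Strategy: while at least 48 blocks (768 bytes) of AAD remain, hash them in
# three GHASH_16 passes against HashKey^48..HashKey^1 (keys 17..48 live in the
# stack frame, see precompute_hkeys_on_stack; $HKEYS_READY tracks that they
# are valid). Smaller leftovers take 32x16 and 16x16 passes, and the final
# 1..16 blocks are dispatched to GHASH_1_TO_16 via the branch ladder below.
# Only the last GHASH_16 pass over a chunk performs the reduction.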
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub CALC_AAD_HASH {
  my $A_IN = $_[0];          # [in] AAD text pointer
  my $A_LEN = $_[1];         # [in] AAD length
  my $AAD_HASH = $_[2];      # [in/out] xmm ghash value
  my $GCM128_CTX = $_[3];    # [in] pointer to context
  my $ZT0 = $_[4];           # [clobbered] ZMM register
  my $ZT1 = $_[5];           # [clobbered] ZMM register
  my $ZT2 = $_[6];           # [clobbered] ZMM register
  my $ZT3 = $_[7];           # [clobbered] ZMM register
  my $ZT4 = $_[8];           # [clobbered] ZMM register
  my $ZT5 = $_[9];           # [clobbered] ZMM register
  my $ZT6 = $_[10];          # [clobbered] ZMM register
  my $ZT7 = $_[11];          # [clobbered] ZMM register
  my $ZT8 = $_[12];          # [clobbered] ZMM register
  my $ZT9 = $_[13];          # [clobbered] ZMM register
  my $ZT10 = $_[14];         # [clobbered] ZMM register
  my $ZT11 = $_[15];         # [clobbered] ZMM register
  my $ZT12 = $_[16];         # [clobbered] ZMM register
  my $ZT13 = $_[17];         # [clobbered] ZMM register
  my $ZT14 = $_[18];         # [clobbered] ZMM register
  my $ZT15 = $_[19];         # [clobbered] ZMM register
  my $ZT16 = $_[20];         # [clobbered] ZMM register
  my $T1 = $_[21];           # [clobbered] GP register
  my $T2 = $_[22];           # [clobbered] GP register
  my $T3 = $_[23];           # [clobbered] GP register
  my $MASKREG = $_[24];      # [clobbered] mask register

  my $HKEYS_READY = "%rbx";

  my $SHFMSK = $ZT13;

  my $rndsuffix = &random_string();

  $code .= <<___;
        mov             $A_IN,$T1      # ; T1 = AAD
        mov             $A_LEN,$T2     # ; T2 = aadLen
        or              $T2,$T2
        jz              .L_CALC_AAD_done_${rndsuffix}

        xor             $HKEYS_READY,$HKEYS_READY
        vmovdqa64       SHUF_MASK(%rip),$SHFMSK

.L_get_AAD_loop48x16_${rndsuffix}:
        cmp             \$`(48*16)`,$T2
        jl              .L_exit_AAD_loop48x16_${rndsuffix}
___

  $code .= <<___;
        vmovdqu64       `64*0`($T1),$ZT1       # ; Blocks 0-3
        vmovdqu64       `64*1`($T1),$ZT2       # ; Blocks 4-7
        vmovdqu64       `64*2`($T1),$ZT3       # ; Blocks 8-11
        vmovdqu64       `64*3`($T1),$ZT4       # ; Blocks 12-15
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
  $code .= "mov \$1,$HKEYS_READY\n";

  &GHASH_16(
    "start", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        vmovdqu64       `16*16 + 64*0`($T1),$ZT1    # ; Blocks 16-19
        vmovdqu64       `16*16 + 64*1`($T1),$ZT2    # ; Blocks 20-23
        vmovdqu64       `16*16 + 64*2`($T1),$ZT3    # ; Blocks 24-27
        vmovdqu64       `16*16 + 64*3`($T1),$ZT4    # ; Blocks 28-31
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &GHASH_16(
    "mid", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        vmovdqu64       `32*16 + 64*0`($T1),$ZT1    # ; Blocks 32-35
        vmovdqu64       `32*16 + 64*1`($T1),$ZT2    # ; Blocks 36-39
        vmovdqu64       `32*16 + 64*2`($T1),$ZT3    # ; Blocks 40-43
        vmovdqu64       `32*16 + 64*3`($T1),$ZT4    # ; Blocks 44-47
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &GHASH_16(
    "end_reduce", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        sub             \$`(48*16)`,$T2
        je              .L_CALC_AAD_done_${rndsuffix}

        add             \$`(48*16)`,$T1
        jmp             .L_get_AAD_loop48x16_${rndsuffix}

.L_exit_AAD_loop48x16_${rndsuffix}:
        # ; Less than 48x16 bytes remaining
        cmp             \$`(32*16)`,$T2
        jl              .L_less_than_32x16_${rndsuffix}
___

  $code .= <<___;
        # ; Get next 16 blocks
        vmovdqu64       `64*0`($T1),$ZT1
        vmovdqu64       `64*1`($T1),$ZT2
        vmovdqu64       `64*2`($T1),$ZT3
        vmovdqu64       `64*3`($T1),$ZT4
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
  $code .= "mov \$1,$HKEYS_READY\n";

  &GHASH_16(
    "start", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        vmovdqu64       `16*16 + 64*0`($T1),$ZT1
        vmovdqu64       `16*16 + 64*1`($T1),$ZT2
        vmovdqu64       `16*16 + 64*2`($T1),$ZT3
        vmovdqu64       `16*16 + 64*3`($T1),$ZT4
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  &GHASH_16(
    "end_reduce", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
    &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        sub             \$`(32*16)`,$T2
        je              .L_CALC_AAD_done_${rndsuffix}

        add             \$`(32*16)`,$T1
        jmp             .L_less_than_16x16_${rndsuffix}

.L_less_than_32x16_${rndsuffix}:
        cmp             \$`(16*16)`,$T2
        jl              .L_less_than_16x16_${rndsuffix}
        # ; Get next 16 blocks
        vmovdqu64       `64*0`($T1),$ZT1
        vmovdqu64       `64*1`($T1),$ZT2
        vmovdqu64       `64*2`($T1),$ZT3
        vmovdqu64       `64*3`($T1),$ZT4
        vpshufb         $SHFMSK,$ZT1,$ZT1
        vpshufb         $SHFMSK,$ZT2,$ZT2
        vpshufb         $SHFMSK,$ZT3,$ZT3
        vpshufb         $SHFMSK,$ZT4,$ZT4
___

  # ; This code path does not use more than 16 hkeys, so they can be taken from the context
  # ; (not from the stack storage)
  &GHASH_16(
    "start_reduce", $ZT5, $ZT6, $ZT7,
    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
    &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
    $ZT8, $ZT9, $ZT10, $ZT11,
    $ZT12, $ZT14, $ZT15, $ZT16,
    "NO_ZMM", $ZT1, $ZT2, $ZT3,
    $ZT4);

  $code .= <<___;
        sub             \$`(16*16)`,$T2
        je              .L_CALC_AAD_done_${rndsuffix}

        add             \$`(16*16)`,$T1
        # ; Less than 16x16 bytes remaining
.L_less_than_16x16_${rndsuffix}:
        # ;; prep mask source address
        lea             byte64_len_to_mask_table(%rip),$T3
        lea             ($T3,$T2,8),$T3

        # ;; calculate number of blocks to ghash (including partial bytes)
        add             \$15,@{[DWORD($T2)]}
        shr             \$4,@{[DWORD($T2)]}
        cmp             \$2,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_1_${rndsuffix}
        je              .L_AAD_blocks_2_${rndsuffix}
        cmp             \$4,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_3_${rndsuffix}
        je              .L_AAD_blocks_4_${rndsuffix}
        cmp             \$6,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_5_${rndsuffix}
        je              .L_AAD_blocks_6_${rndsuffix}
        cmp             \$8,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_7_${rndsuffix}
        je              .L_AAD_blocks_8_${rndsuffix}
        cmp             \$10,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_9_${rndsuffix}
        je              .L_AAD_blocks_10_${rndsuffix}
        cmp             \$12,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_11_${rndsuffix}
        je              .L_AAD_blocks_12_${rndsuffix}
        cmp             \$14,@{[DWORD($T2)]}
        jb              .L_AAD_blocks_13_${rndsuffix}
        je              .L_AAD_blocks_14_${rndsuffix}
        cmp             \$15,@{[DWORD($T2)]}
        je              .L_AAD_blocks_15_${rndsuffix}
___

  # ;; fall through for 16 blocks

  # ;; The flow of each of these cases is identical:
  # ;; - load blocks of plain text
  # ;; - shuffle loaded blocks
  # ;; - xor the current hash value into block 0
  # ;; - perform multiplications with ghash keys
  # ;; - jump to reduction code
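  # ;; For example, 70 bytes of residual AAD give $T2 = (70 + 15) >> 4 = 5
  # ;; blocks, so control lands on .L_AAD_blocks_5: the matching load mask is
  # ;; fetched from byte64_len_to_mask_table, the blocks are read with that
  # ;; mask, byte-swapped, and hashed against HashKey^5..HashKey^1 by
  # ;; GHASH_1_TO_16.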

  for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
    $code .= ".L_AAD_blocks_${aad_blocks}_${rndsuffix}:\n";
    if ($aad_blocks > 12) {
      $code .= "sub \$`12*16*8`, $T3\n";
    } elsif ($aad_blocks > 8) {
      $code .= "sub \$`8*16*8`, $T3\n";
    } elsif ($aad_blocks > 4) {
      $code .= "sub \$`4*16*8`, $T3\n";
    }
    $code .= "kmovq ($T3),$MASKREG\n";

    &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
      $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);

    &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
      $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);

    if ($aad_blocks > 1) {
      # ;; the 1 block case falls through to .L_CALC_AAD_done, no jmp needed
      $code .= "jmp .L_CALC_AAD_done_${rndsuffix}\n";
    }
  }
  $code .= ".L_CALC_AAD_done_${rndsuffix}:\n";

  # ;; result in AAD_HASH
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; PARTIAL_BLOCK
# ;; Handles encryption/decryption and the tag (GHASH) of partial blocks
# ;; between update calls.
# ;; Requires the input data to be at least 1 byte long.
# ;; Output:
# ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
# ;; AAD_HASH and updated GCM128_CTX
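# ;; Example flow: if a previous update left 3 bytes pending (PBlockLen = 3),
# ;; the saved keystream block E(K,Yn) from the context is shifted so the new
# ;; input lines up behind those 3 bytes; up to 13 bytes of this call then
# ;; complete the block. Once 16 bytes have accumulated the block is folded
# ;; into AAD_HASH with GHASH_MUL and PBlockLen is reset to 0; otherwise
# ;; PBlockLen just grows and hashing is deferred to a later call.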
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub PARTIAL_BLOCK {
  my $GCM128_CTX = $_[0];        # [in] context pointer
  my $PBLOCK_LEN = $_[1];        # [in] pointer to partial block length
  my $CIPH_PLAIN_OUT = $_[2];    # [in] output buffer
  my $PLAIN_CIPH_IN = $_[3];     # [in] input buffer
  my $PLAIN_CIPH_LEN = $_[4];    # [in] buffer length
  my $DATA_OFFSET = $_[5];       # [out] data offset (gets set)
  my $AAD_HASH = $_[6];          # [out] updated GHASH value
  my $ENC_DEC = $_[7];           # [in] cipher direction
  my $GPTMP0 = $_[8];            # [clobbered] GP temporary register
  my $GPTMP1 = $_[9];            # [clobbered] GP temporary register
  my $GPTMP2 = $_[10];           # [clobbered] GP temporary register
  my $ZTMP0 = $_[11];            # [clobbered] ZMM temporary register
  my $ZTMP1 = $_[12];            # [clobbered] ZMM temporary register
  my $ZTMP2 = $_[13];            # [clobbered] ZMM temporary register
  my $ZTMP3 = $_[14];            # [clobbered] ZMM temporary register
  my $ZTMP4 = $_[15];            # [clobbered] ZMM temporary register
  my $ZTMP5 = $_[16];            # [clobbered] ZMM temporary register
  my $ZTMP6 = $_[17];            # [clobbered] ZMM temporary register
  my $ZTMP7 = $_[18];            # [clobbered] ZMM temporary register
  my $MASKREG = $_[19];          # [clobbered] mask temporary register

  my $XTMP0 = &XWORD($ZTMP0);
  my $XTMP1 = &XWORD($ZTMP1);
  my $XTMP2 = &XWORD($ZTMP2);
  my $XTMP3 = &XWORD($ZTMP3);
  my $XTMP4 = &XWORD($ZTMP4);
  my $XTMP5 = &XWORD($ZTMP5);
  my $XTMP6 = &XWORD($ZTMP6);
  my $XTMP7 = &XWORD($ZTMP7);

  my $LENGTH = $DATA_OFFSET;
  my $IA0 = $GPTMP1;
  my $IA1 = $GPTMP2;
  my $IA2 = $GPTMP0;

  my $rndsuffix = &random_string();

  $code .= <<___;
        # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero
        mov             ($PBLOCK_LEN),$LENGTH
        or              $LENGTH,$LENGTH
        je              .L_partial_block_done_${rndsuffix}    # ; leave macro if no partial blocks
___

  &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);

  $code .= <<___;
        # ;; XTMP1 = my_ctx_data.partial_block_enc_key
        vmovdqu64       $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1
        vmovdqu64       @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2

        # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes
        # ;; ((16 - $LENGTH) is the number of bytes in plaintext mod 16)
        lea             SHIFT_MASK(%rip),$IA0
        add             $LENGTH,$IA0
        vmovdqu64       ($IA0),$XTMP3    # ; shift right shuffle mask
        vpshufb         $XTMP3,$XTMP1,$XTMP1
___

  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        # ;; keep a copy of the cipher text in $XTMP4
        vmovdqa64       $XTMP0,$XTMP4
___
  }
  $code .= <<___;
        vpxorq          $XTMP0,$XTMP1,$XTMP1    # ; Ciphertext XOR E(K, Yn)
        # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block
        # ;; Determine if the partial block is not being filled and shift the mask accordingly
___
  if ($win64) {
    $code .= <<___;
        mov             $PLAIN_CIPH_LEN,$IA1
        add             $LENGTH,$IA1
___
  } else {
    $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n";
  }
  $code .= <<___;
        sub             \$16,$IA1
        jge             .L_no_extra_mask_${rndsuffix}
        sub             $IA1,$IA0
.L_no_extra_mask_${rndsuffix}:
        # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1
        # ;; - mask out bottom $LENGTH bytes of $XTMP1
        # ;; sizeof(SHIFT_MASK) == 16 bytes
        vmovdqu64       16($IA0),$XTMP0
        vpand           $XTMP0,$XTMP1,$XTMP1
___

  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        vpand           $XTMP0,$XTMP4,$XTMP4
        vpshufb         SHUF_MASK(%rip),$XTMP4,$XTMP4
        vpshufb         $XTMP3,$XTMP4,$XTMP4
        vpxorq          $XTMP4,$AAD_HASH,$AAD_HASH
___
  } else {
    $code .= <<___;
        vpshufb         SHUF_MASK(%rip),$XTMP1,$XTMP1
        vpshufb         $XTMP3,$XTMP1,$XTMP1
        vpxorq          $XTMP1,$AAD_HASH,$AAD_HASH
___
  }
  $code .= <<___;
        cmp             \$0,$IA1
        jl              .L_partial_incomplete_${rndsuffix}
___

  # ;; GHASH computation for the last <16 byte block
  &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);

  $code .= <<___;
        movq            \$0, ($PBLOCK_LEN)
        # ;; Set $LENGTH to be the number of bytes to write out
        mov             $LENGTH,$IA0
        mov             \$16,$LENGTH
        sub             $IA0,$LENGTH
        jmp             .L_enc_dec_done_${rndsuffix}

.L_partial_incomplete_${rndsuffix}:
___
  if ($win64) {
    $code .= <<___;
        mov             $PLAIN_CIPH_LEN,$IA0
        add             $IA0,($PBLOCK_LEN)
___
  } else {
    $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n";
  }
  $code .= <<___;
        mov             $PLAIN_CIPH_LEN,$LENGTH

.L_enc_dec_done_${rndsuffix}:
        # ;; output encrypted bytes

        lea             byte_len_to_mask_table(%rip),$IA0
        kmovw           ($IA0,$LENGTH,2),$MASKREG
        vmovdqu64       $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)
___

  if ($ENC_DEC eq "ENC") {
    $code .= <<___;
        # ;; shuffle XTMP1 back to output as ciphertext
        vpshufb         SHUF_MASK(%rip),$XTMP1,$XTMP1
        vpshufb         $XTMP3,$XTMP1,$XTMP1
___
  }
  $code .= <<___;
        mov             $CIPH_PLAIN_OUT,$IA0
        vmovdqu8        $XTMP1,($IA0){$MASKREG}
.L_partial_block_done_${rndsuffix}:
___
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; Ciphers 1 to 16 blocks and prepares them for the later GHASH compute operation
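# ;; This is the cipher half of INITIAL_BLOCKS_PARTIAL (the GHASH half
# ;; follows below); splitting the two lets the AES-CTR work and the GHASH
# ;; work be scheduled independently. Outputs are the shuffled cipher text
# ;; blocks in DAT0-DAT3 plus the last counter/cipher text blocks needed for
# ;; the partial block bookkeeping.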
sub INITIAL_BLOCKS_PARTIAL_CIPHER {
  my $AES_KEYS = $_[0];           # [in] key pointer
  my $GCM128_CTX = $_[1];         # [in] context pointer
  my $CIPH_PLAIN_OUT = $_[2];     # [in] text output pointer
  my $PLAIN_CIPH_IN = $_[3];      # [in] text input pointer
  my $LENGTH = $_[4];             # [in/clobbered] length in bytes
  my $DATA_OFFSET = $_[5];        # [in/out] current data offset (updated)
  my $NUM_BLOCKS = $_[6];         # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $CTR = $_[7];                # [in/out] current counter value
  my $ENC_DEC = $_[8];            # [in] cipher direction (ENC/DEC)
  my $DAT0 = $_[9];               # [out] ZMM with cipher text shuffled for GHASH
  my $DAT1 = $_[10];              # [out] ZMM with cipher text shuffled for GHASH
  my $DAT2 = $_[11];              # [out] ZMM with cipher text shuffled for GHASH
  my $DAT3 = $_[12];              # [out] ZMM with cipher text shuffled for GHASH
  my $LAST_CIPHER_BLK = $_[13];   # [out] XMM to put ciphered counter block partially xor'ed with text
  my $LAST_GHASH_BLK = $_[14];    # [out] XMM to put last cipher text block shuffled for GHASH
  my $CTR0 = $_[15];              # [clobbered] ZMM temporary
  my $CTR1 = $_[16];              # [clobbered] ZMM temporary
  my $CTR2 = $_[17];              # [clobbered] ZMM temporary
  my $CTR3 = $_[18];              # [clobbered] ZMM temporary
  my $ZT1 = $_[19];               # [clobbered] ZMM temporary
  my $IA0 = $_[20];               # [clobbered] GP temporary
  my $IA1 = $_[21];               # [clobbered] GP temporary
  my $MASKREG = $_[22];           # [clobbered] mask register
  my $SHUFMASK = $_[23];          # [out] ZMM loaded with BE/LE shuffle mask

  if ($NUM_BLOCKS == 1) {
    $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n";
  } elsif ($NUM_BLOCKS == 2) {
    $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n";
  } else {
    $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n";
  }

  # ;; prepare AES counter blocks
  if ($NUM_BLOCKS == 1) {
    $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n";
  } elsif ($NUM_BLOCKS == 2) {
    $code .= <<___;
        vshufi64x2      \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]}
        vpaddd          ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]}
___
  } else {
    $code .= <<___;
        vshufi64x2      \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]}
        vpaddd          ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0
___
    if ($NUM_BLOCKS > 4) {
      $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n";
    }
    if ($NUM_BLOCKS > 8) {
      $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n";
    }
    if ($NUM_BLOCKS > 12) {
      $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n";
    }
  }
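  # ;; In outline: the current counter is broadcast to every 128-bit lane and
  # ;; per-lane constants are added, so with counter value n the blocks become
  # ;;   CTR0 = [n+1, n+2, n+3, n+4]       (ddq_add_1234)
  # ;;   CTR1 = [n+5, n+6, n+7, n+8]       (ddq_add_5678)
  # ;;   CTR2 = CTR0 + 8, CTR3 = CTR1 + 8  (ddq_add_8888)
  # ;; (lane order as defined by the ddq_add_* tables elsewhere in this file).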

  # ;; get load/store mask
  $code .= <<___;
        lea             byte64_len_to_mask_table(%rip),$IA0
        mov             $LENGTH,$IA1
___
  if ($NUM_BLOCKS > 12) {
    $code .= "sub \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub \$`1*64`,$IA1\n";
  }
  $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";

  # ;; extract new counter value
  # ;; shuffle the counters for AES rounds
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n";
  }
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0,
    $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);

  # ;; load plain/cipher text
  &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG);

  # ;; AES rounds and XOR with plain/cipher text
  foreach my $j (0 .. ($NROUNDS + 1)) {
    $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n";
    &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j,
      $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS);
  }

  # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  # ;; - this is needed for partial block cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n";
  }

  # ;; write cipher/plain text back to output
  $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
  &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG);

  # ;; zero bytes outside the mask before hashing
  if ($NUM_BLOCKS <= 4) {
    $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n";
  } else {
    $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n";
  }

  # ;; Shuffle the cipher text blocks for the hashing part
  # ;; DAT0-DAT3 are the expected outputs with blocks for hashing
  if ($ENC_DEC eq "DEC") {
    # ;; Decrypt case
    # ;; - cipher blocks are already in DAT0-DAT3
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0,
      $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  } else {
    # ;; Encrypt case
    # ;; - cipher blocks are in CTR0-CTR3
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0,
      $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  }

  # ;; Extract the last block for partials and multi_call cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n";
  } else {
    $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n";
  }
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; Computes GHASH on 1 to 16 blocks
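# ;; This is the GHASH half of INITIAL_BLOCKS_PARTIAL: it hashes the blocks
# ;; prepared by the cipher half, stashes a final sub-16-byte block in the
# ;; context (PBlockLen / PEncBlock) for a later update or finalize call, and
# ;; leaves the updated hash in HASH_IN_OUT. It is invoked either with 22
# ;; arguments (fresh GHASH) or 25 arguments (GH/GM/GL carry an unreduced
# ;; accumulator from earlier GHASH_16 work that must be folded in).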
sub INITIAL_BLOCKS_PARTIAL_GHASH {
  my $AES_KEYS = $_[0];           # [in] key pointer
  my $GCM128_CTX = $_[1];         # [in] context pointer
  my $LENGTH = $_[2];             # [in/clobbered] length in bytes
  my $NUM_BLOCKS = $_[3];         # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $HASH_IN_OUT = $_[4];        # [in/out] XMM ghash in/out value
  my $ENC_DEC = $_[5];            # [in] cipher direction (ENC/DEC)
  my $DAT0 = $_[6];               # [in] ZMM with cipher text shuffled for GHASH
  my $DAT1 = $_[7];               # [in] ZMM with cipher text shuffled for GHASH
  my $DAT2 = $_[8];               # [in] ZMM with cipher text shuffled for GHASH
  my $DAT3 = $_[9];               # [in] ZMM with cipher text shuffled for GHASH
  my $LAST_CIPHER_BLK = $_[10];   # [in] XMM with ciphered counter block partially xor'ed with text
  my $LAST_GHASH_BLK = $_[11];    # [in] XMM with last cipher text block shuffled for GHASH
  my $ZT0 = $_[12];               # [clobbered] ZMM temporary
  my $ZT1 = $_[13];               # [clobbered] ZMM temporary
  my $ZT2 = $_[14];               # [clobbered] ZMM temporary
  my $ZT3 = $_[15];               # [clobbered] ZMM temporary
  my $ZT4 = $_[16];               # [clobbered] ZMM temporary
  my $ZT5 = $_[17];               # [clobbered] ZMM temporary
  my $ZT6 = $_[18];               # [clobbered] ZMM temporary
  my $ZT7 = $_[19];               # [clobbered] ZMM temporary
  my $ZT8 = $_[20];               # [clobbered] ZMM temporary
  my $PBLOCK_LEN = $_[21];        # [in] partial block length
  my $GH = $_[22];                # [in] ZMM with hi product part
  my $GM = $_[23];                # [in] ZMM with mid product part
  my $GL = $_[24];                # [in] ZMM with lo product part

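  # ;; GH/GM/GL, when present, hold an as-yet-unreduced GHASH accumulator in
  # ;; the split form used throughout this file:
  # ;;   GH - xor-sum of the high (a1*b1) 128-bit partial products
  # ;;   GM - xor-sum of the middle (a1*b0 xor a0*b1) partial products
  # ;;   GL - xor-sum of the low (a0*b0) partial products
  # ;; Reduction splits GM across GH/GL (vpslldq/vpsrldq by 8), xors the four
  # ;; 128-bit lanes together (VHPXORI4x128) and folds the resulting 256 bits
  # ;; modulo the field polynomial (VCLMUL_REDUCE with POLY2).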
  my $rndsuffix = &random_string();

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; - Hash all but the last partial block of data
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  # ;; update data offset
  if ($NUM_BLOCKS > 1) {
    # ;; The final block of data may be <16B
    $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n";
  }

  if ($NUM_BLOCKS < 16) {
    $code .= <<___;
        # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
        # ;;       This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
        cmp             \$16,$LENGTH
        jl              .L_small_initial_partial_block_${rndsuffix}

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;;; Handle a full length final block - encrypt and hash all blocks
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        sub             \$16,$LENGTH
        movq            \$0,($PBLOCK_LEN)
___

    # ;; Hash all of the data
    if (scalar(@_) == 22) {
      # ;; start GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS);
    } elsif (scalar(@_) == 25) {
      # ;; continue GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL);
    }
    $code .= "jmp .L_small_initial_compute_done_${rndsuffix}\n";
  }

  $code .= <<___;
.L_small_initial_partial_block_${rndsuffix}:

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;;; Handle ghash for a <16B final block
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        # ;; As it's an init / update / finalize series we need to leave the
        # ;; last block if it's less than a full block of data.

        mov             $LENGTH,($PBLOCK_LEN)
        vmovdqu64       $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)
___

  my $k = ($NUM_BLOCKS - 1);
  my $last_block_to_hash = 1;
  if ($NUM_BLOCKS > $last_block_to_hash) {
    # ;; ZT12-ZT20 - temporary registers
    if (scalar(@_) == 22) {
      # ;; start GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k);
    } elsif (scalar(@_) == 25) {
      # ;; continue GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL);
    }

    # ;; just fall through, no jmp needed
  } else {
    if (scalar(@_) == 25) {
      $code .= <<___;
        # ;; Reduction is required in this case.
        # ;; Integrate GM into GH and GL.
        vpsrldq         \$8,$GM,$ZT0
        vpslldq         \$8,$GM,$ZT1
        vpxorq          $ZT0,$GH,$GH
        vpxorq          $ZT1,$GL,$GL
___

      # ;; Add GH and GL 128-bit words horizontally
      &VHPXORI4x128($GH, $ZT0);
      &VHPXORI4x128($GL, $ZT1);

      # ;; 256-bit to 128-bit reduction
      $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n";
      &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
    }
    $code .= <<___;
        # ;; Record that a reduction is not needed -
        # ;; In this case no hashes are computed because there
        # ;; is only one initial block and it is < 16B in length.
        # ;; We only need to check if a reduction is needed if
        # ;; initial_blocks == 1 and init/update/final is being used.
        # ;; In this case we may just have a partial block, and that
        # ;; gets hashed in finalize.

        # ;; The hash should end up in HASH_IN_OUT.
        # ;; The only way we should get here is if there is
        # ;; a partial block of data, so xor that into the hash.
        vpxorq          $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT
        # ;; The result is in $HASH_IN_OUT
        jmp             .L_after_reduction_${rndsuffix}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; After GHASH reduction
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  $code .= ".L_small_initial_compute_done_${rndsuffix}:\n";

  # ;; If using init/update/finalize, we need to xor any partial block data
  # ;; into the hash.
  if ($NUM_BLOCKS > 1) {
    # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place
    if ($NUM_BLOCKS != 16) {
      $code .= <<___;
        # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen], is never zero
        or              $LENGTH,$LENGTH
        je              .L_after_reduction_${rndsuffix}
___
    }
    $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n";
  }

  $code .= ".L_after_reduction_${rndsuffix}:\n";

  # ;; Final hash is now in HASH_IN_OUT
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
# ;; It may look similar to INITIAL_BLOCKS but its usage is different:
# ;; - first encrypts/decrypts the required number of blocks and then
# ;;   ghashes these blocks
# ;; - Small packets or leftover data chunks (<256 bytes)
# ;; - Remaining data chunks below 256 bytes (multi buffer code)
# ;;
# ;; num_initial_blocks is expected to include the partial final block
# ;; in the count.
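# ;; Illustrative walk-through: a 100-byte GCM_ENC_DEC_SMALL call maps to
# ;; num_initial_blocks = 7; the cipher step processes all 7 counter blocks
# ;; (the 7th covering only 4 bytes via the load/store mask), then the GHASH
# ;; step hashes blocks 1-6 and parks the 4-byte tail in the context until
# ;; more data or the finalize call arrives.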
sub INITIAL_BLOCKS_PARTIAL {
  my $AES_KEYS = $_[0];           # [in] key pointer
  my $GCM128_CTX = $_[1];         # [in] context pointer
  my $CIPH_PLAIN_OUT = $_[2];     # [in] text output pointer
  my $PLAIN_CIPH_IN = $_[3];      # [in] text input pointer
  my $LENGTH = $_[4];             # [in/clobbered] length in bytes
  my $DATA_OFFSET = $_[5];        # [in/out] current data offset (updated)
  my $NUM_BLOCKS = $_[6];         # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $CTR = $_[7];                # [in/out] current counter value
  my $HASH_IN_OUT = $_[8];        # [in/out] XMM ghash in/out value
  my $ENC_DEC = $_[9];            # [in] cipher direction (ENC/DEC)
  my $CTR0 = $_[10];              # [clobbered] ZMM temporary
  my $CTR1 = $_[11];              # [clobbered] ZMM temporary
  my $CTR2 = $_[12];              # [clobbered] ZMM temporary
  my $CTR3 = $_[13];              # [clobbered] ZMM temporary
  my $DAT0 = $_[14];              # [clobbered] ZMM temporary
  my $DAT1 = $_[15];              # [clobbered] ZMM temporary
  my $DAT2 = $_[16];              # [clobbered] ZMM temporary
  my $DAT3 = $_[17];              # [clobbered] ZMM temporary
  my $LAST_CIPHER_BLK = $_[18];   # [clobbered] ZMM temporary
  my $LAST_GHASH_BLK = $_[19];    # [clobbered] ZMM temporary
  my $ZT0 = $_[20];               # [clobbered] ZMM temporary
  my $ZT1 = $_[21];               # [clobbered] ZMM temporary
  my $ZT2 = $_[22];               # [clobbered] ZMM temporary
  my $ZT3 = $_[23];               # [clobbered] ZMM temporary
  my $ZT4 = $_[24];               # [clobbered] ZMM temporary
  my $IA0 = $_[25];               # [clobbered] GP temporary
  my $IA1 = $_[26];               # [clobbered] GP temporary
  my $MASKREG = $_[27];           # [clobbered] mask register
  my $SHUFMASK = $_[28];          # [clobbered] ZMM for BE/LE shuffle mask
  my $PBLOCK_LEN = $_[29];        # [in] partial block length

  &INITIAL_BLOCKS_PARTIAL_CIPHER(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
    $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR,
    $ENC_DEC, $DAT0, $DAT1, $DAT2,
    $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0,
    $CTR1, $CTR2, $CTR3, $ZT0,
    $IA0, $IA1, $MASKREG, $SHUFMASK);

  &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0,
    $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK),
    &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN);
}

# ;; ===========================================================================
# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
# ;; followed with GHASH of the N blocks.
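# ;; The stitching hides latency: each AES round of the N new counter blocks
# ;; is interleaved with a slice of the GHASH work for the previous 16 blocks
# ;; (read from the stack frame at GHASHIN_BLK_OFFSET), so the VPCLMULQDQ
# ;; multiplies and the optional reduction retire under the VAESENC chain
# ;; instead of serializing after it.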
sub GHASH_16_ENCRYPT_N_GHASH_N {
  my $AES_KEYS = $_[0];             # [in] key pointer
  my $GCM128_CTX = $_[1];           # [in] context pointer
  my $CIPH_PLAIN_OUT = $_[2];       # [in] pointer to output buffer
  my $PLAIN_CIPH_IN = $_[3];        # [in] pointer to input buffer
  my $DATA_OFFSET = $_[4];          # [in] data offset
  my $LENGTH = $_[5];               # [in] data length
  my $CTR_BE = $_[6];               # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK = $_[7];            # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET = $_[8];       # [in] numerical offset for the highest hash key
                                    #      (can be in form of register or numerical value)
  my $GHASHIN_BLK_OFFSET = $_[9];   # [in] numerical offset for GHASH blocks in
  my $SHFMSK = $_[10];              # [in] ZMM with byte swap mask for pshufb
  my $B00_03 = $_[11];              # [clobbered] temporary ZMM
  my $B04_07 = $_[12];              # [clobbered] temporary ZMM
  my $B08_11 = $_[13];              # [clobbered] temporary ZMM
  my $B12_15 = $_[14];              # [clobbered] temporary ZMM
  my $GH1H_UNUSED = $_[15];         # [clobbered] temporary ZMM
  my $GH1L = $_[16];                # [clobbered] temporary ZMM
  my $GH1M = $_[17];                # [clobbered] temporary ZMM
  my $GH1T = $_[18];                # [clobbered] temporary ZMM
  my $GH2H = $_[19];                # [clobbered] temporary ZMM
  my $GH2L = $_[20];                # [clobbered] temporary ZMM
  my $GH2M = $_[21];                # [clobbered] temporary ZMM
  my $GH2T = $_[22];                # [clobbered] temporary ZMM
  my $GH3H = $_[23];                # [clobbered] temporary ZMM
  my $GH3L = $_[24];                # [clobbered] temporary ZMM
  my $GH3M = $_[25];                # [clobbered] temporary ZMM
  my $GH3T = $_[26];                # [clobbered] temporary ZMM
  my $AESKEY1 = $_[27];             # [clobbered] temporary ZMM
  my $AESKEY2 = $_[28];             # [clobbered] temporary ZMM
  my $GHKEY1 = $_[29];              # [clobbered] temporary ZMM
  my $GHKEY2 = $_[30];              # [clobbered] temporary ZMM
  my $GHDAT1 = $_[31];              # [clobbered] temporary ZMM
  my $GHDAT2 = $_[32];              # [clobbered] temporary ZMM
  my $ZT01 = $_[33];                # [clobbered] temporary ZMM
  my $ADDBE_4x4 = $_[34];           # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234 = $_[35];          # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $GHASH_TYPE = $_[36];          # [in] "start", "start_reduce", "mid", "end_reduce"
  my $TO_REDUCE_L = $_[37];         # [in] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H = $_[38];         # [in] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M = $_[39];         # [in] ZMM for medium 4x128-bit GHASH sum
  my $ENC_DEC = $_[40];             # [in] cipher direction
  my $HASH_IN_OUT = $_[41];         # [in/out] XMM ghash in/out value
  my $IA0 = $_[42];                 # [clobbered] GP temporary
  my $IA1 = $_[43];                 # [clobbered] GP temporary
  my $MASKREG = $_[44];             # [clobbered] mask register
  my $NUM_BLOCKS = $_[45];          # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
  my $PBLOCK_LEN = $_[46];          # [in] partial block length

  die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $rndsuffix = &random_string();

  # ; $GH1H is the same register as $HASH_IN_OUT to avoid an additional move
  # ; in the do_reduction case
  my $GH1H = $HASH_IN_OUT;

  my $LAST_GHASH_BLK = $GH1L;
  my $LAST_CIPHER_BLK = $GH1T;

  my $RED_POLY = $GH2T;
  my $RED_P1 = $GH2L;
  my $RED_T1 = $GH2H;
  my $RED_T2 = $GH2M;

  my $DATA1 = $GH3H;
  my $DATA2 = $GH3L;
  my $DATA3 = $GH3M;
  my $DATA4 = $GH3T;

  # ;; do reduction after the 16 blocks?
  my $do_reduction = 0;

  # ;; is this 16-block chunk a start?
  my $is_start = 0;

  if ($GHASH_TYPE eq "start_reduce") {
    $is_start = 1;
    $do_reduction = 1;
  }

  if ($GHASH_TYPE eq "start") {
    $is_start = 1;
  }

  if ($GHASH_TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

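  # ;; Mode summary:
  # ;;   "start"        - first chunk: seed TO_REDUCE_{H,M,L}, no reduction
  # ;;   "start_reduce" - only chunk: seed the sums and reduce at the end
  # ;;   "mid"          - middle chunk: accumulate into TO_REDUCE_{H,M,L}
  # ;;   "end_reduce"   - last chunk: accumulate, then do the final reduction
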
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; - get load/store mask
  # ;; - load plain/cipher text
  # ;; get load/store mask
  $code .= <<___;
        lea             byte64_len_to_mask_table(%rip),$IA0
        mov             $LENGTH,$IA1
___
  if ($NUM_BLOCKS > 12) {
    $code .= "sub \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub \$`1*64`,$IA1\n";
  }
  $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; prepare counter blocks

  $code .= <<___;
        cmp             \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
        jae             .L_16_blocks_overflow_${rndsuffix}
___
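  # ;; The fast path adds the big-endian per-lane constants directly, which is
  # ;; only safe while the additions cannot carry out of the counter's low
  # ;; byte. $CTR_CHECK tracks that byte; e.g. with its value at 250 and
  # ;; NUM_BLOCKS = 8 the test 250 >= 256 - 8 routes to the overflow path,
  # ;; which byte-swaps to little-endian, does full 32-bit adds and swaps back.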
2331 | ||
2332 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2333 | $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE, | |
2334 | $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4); | |
2335 | $code .= <<___; | |
2336 | jmp .L_16_blocks_ok_${rndsuffix} | |
2337 | ||
2338 | .L_16_blocks_overflow_${rndsuffix}: | |
2339 | vpshufb $SHFMSK,$CTR_BE,$CTR_BE | |
2340 | vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03 | |
2341 | ___ | |
2342 | if ($NUM_BLOCKS > 4) { | |
2343 | $code .= <<___; | |
2344 | vmovdqa64 ddq_add_4444(%rip),$B12_15 | |
2345 | vpaddd $B12_15,$B00_03,$B04_07 | |
2346 | ___ | |
2347 | } | |
2348 | if ($NUM_BLOCKS > 8) { | |
2349 | $code .= "vpaddd $B12_15,$B04_07,$B08_11\n"; | |
2350 | } | |
2351 | if ($NUM_BLOCKS > 12) { | |
2352 | $code .= "vpaddd $B12_15,$B08_11,$B12_15\n"; | |
2353 | } | |
2354 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2355 | $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2356 | $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); | |
2357 | $code .= <<___; | |
2358 | .L_16_blocks_ok_${rndsuffix}: | |
2359 | ||
2360 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2361 | # ;; - pre-load constants | |
2362 | # ;; - add current hash into the 1st block | |
2363 | vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1 | |
2364 | ___ | |
2365 | if ($is_start != 0) { | |
2366 | $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n"; | |
2367 | } else { | |
2368 | $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n"; | |
2369 | } | |
2370 | ||
2371 | $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n"; | |
2372 | ||
2373 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2374 | # ;; save counter for the next round | |
2375 | # ;; increment counter overflow check register | |
2376 | if ($NUM_BLOCKS <= 4) { | |
2377 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n"; | |
2378 | } elsif ($NUM_BLOCKS <= 8) { | |
2379 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n"; | |
2380 | } elsif ($NUM_BLOCKS <= 12) { | |
2381 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n"; | |
2382 | } else { | |
2383 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n"; | |
2384 | } | |
2385 | $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n"; | |
2386 | ||
2387 | $code .= <<___; | |
2388 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2389 | # ;; pre-load constants | |
2390 | vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2 | |
2391 | vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2 | |
2392 | vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2 | |
2393 | ___ | |
2394 | ||
2395 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2396 | # ;; stitch AES rounds with GHASH | |
2397 | ||
2398 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2399 | # ;; AES round 0 - ARK | |
2400 | ||
2401 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2402 | $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2403 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2404 | $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n"; | |
2405 | ||
2406 | $code .= <<___; | |
2407 | # ;;================================================== | |
2408 | # ;; GHASH 4 blocks (15 to 12) | |
2409 | vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1 | |
2410 | vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0 | |
2411 | vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0 | |
2412 | vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1 | |
2413 | vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1 | |
2414 | vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1 | |
2415 | ___ | |
2416 | ||
2417 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2418 | # ;; AES round 1 | |
2419 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2420 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2421 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2422 | $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n"; | |
2423 | ||
2424 | $code .= <<___; | |
2425 | # ;; ================================================= | |
2426 | # ;; GHASH 4 blocks (11 to 8) | |
2427 | vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 | |
2428 | vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 | |
2429 | vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 | |
2430 | vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 | |
2431 | vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2 | |
2432 | vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2 | |
2433 | ___ | |
2434 | ||
2435 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2436 | # ;; AES round 2 | |
2437 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2438 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2439 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2440 | $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n"; | |
2441 | ||
2442 | $code .= <<___; | |
2443 | # ;; ================================================= | |
2444 | # ;; GHASH 4 blocks (7 to 4) | |
2445 | vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1 | |
2446 | vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0 | |
2447 | vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1 | |
2448 | vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0 | |
2449 | ___ | |
2450 | ||
2451 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2452 | # ;; AES rounds 3 | |
2453 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2454 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2455 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2456 | $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n"; | |
2457 | ||
2458 | $code .= <<___; | |
2459 | # ;; ================================================= | |
2460 | # ;; Gather (XOR) GHASH for 12 blocks | |
2461 | vpternlogq \$0x96,$GH3H,$GH2H,$GH1H | |
2462 | vpternlogq \$0x96,$GH3L,$GH2L,$GH1L | |
2463 | vpternlogq \$0x96,$GH3T,$GH2T,$GH1T | |
2464 | vpternlogq \$0x96,$GH3M,$GH2M,$GH1M | |
2465 | ___ | |
2466 | ||
2467 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2468 | # ;; AES rounds 4 | |
2469 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2470 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2471 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2472 | $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n"; | |
2473 | ||
2474 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2475 | # ;; load plain/cipher text | |
2476 | &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG); | |
2477 | ||
2478 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2479 | # ;; AES rounds 5 | |
2480 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2481 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2482 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2483 | $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n"; | |
2484 | ||
2485 | $code .= <<___; | |
2486 | # ;; ================================================= | |
2487 | # ;; GHASH 4 blocks (3 to 0) | |
2488 | vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 | |
2489 | vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 | |
2490 | vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 | |
2491 | vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 | |
2492 | ___ | |
2493 | ||
2494 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2495 | # ;; AES round 6 | |
2496 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2497 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2498 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2499 | $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n"; | |
2500 | ||
2501 | # ;; ================================================= | |
2502 | # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid) | |
2503 | # ;; - add GH2[MTLH] to GH1[MTLH] | |
2504 | $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n"; | |
2505 | if ($do_reduction != 0) { | |
2506 | ||
2507 | if ($is_start != 0) { | |
2508 | $code .= "vpxorq $GH2M,$GH1M,$GH1M\n"; | |
2509 | } else { | |
2510 | $code .= <<___; | |
2511 | vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H | |
2512 | vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L | |
2513 | vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M | |
2514 | ___ | |
2515 | } | |
2516 | ||
2517 | } else { | |
2518 | ||
2519 | # ;; Update H/M/L hash sums if not carrying reduction | |
2520 | if ($is_start != 0) { | |
2521 | $code .= <<___; | |
2522 | vpxorq $GH2H,$GH1H,$TO_REDUCE_H | |
2523 | vpxorq $GH2L,$GH1L,$TO_REDUCE_L | |
2524 | vpxorq $GH2M,$GH1M,$TO_REDUCE_M | |
2525 | ___ | |
2526 | } else { | |
2527 | $code .= <<___; | |
2528 | vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H | |
2529 | vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L | |
2530 | vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M | |
2531 | ___ | |
2532 | } | |
2533 | ||
2534 | } | |
2535 | ||
2536 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2537 | # ;; AES round 7 | |
2538 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2539 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2540 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2541 | $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n"; | |
2542 | ||
2543 | # ;; ================================================= | |
2544 | # ;; prepare mid sum for adding to high & low | |
2545 | # ;; load polynomial constant for reduction | |
2546 | if ($do_reduction != 0) { | |
2547 | $code .= <<___; | |
2548 | vpsrldq \$8,$GH1M,$GH2M | |
2549 | vpslldq \$8,$GH1M,$GH1M | |
2550 | ||
2551 | vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]} | |
2552 | ___ | |
2553 | } | |
2554 | ||
2555 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2556 | # ;; AES round 8 | |
2557 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2558 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2559 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2560 | $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n"; | |
2561 | ||
2562 | # ;; ================================================= | |
2563 | # ;; Add mid product to high and low | |
2564 | if ($do_reduction != 0) { | |
2565 | if ($is_start != 0) { | |
2566 | $code .= <<___; | |
2567 | vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64 | |
2568 | vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64 | |
2569 | ___ | |
2570 | } else { | |
2571 | $code .= <<___; | |
2572 | vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64 | |
2573 | vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64 | |
2574 | ___ | |
2575 | } | |
2576 | } | |
2577 | ||
2578 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2579 | # ;; AES round 9 | |
2580 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2581 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2582 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2583 | ||
2584 | # ;; ================================================= | |
2585 | # ;; horizontal xor of low and high 4x128 | |
2586 | if ($do_reduction != 0) { | |
2587 | &VHPXORI4x128($GH1H, $GH2H); | |
2588 | &VHPXORI4x128($GH1L, $GH2L); | |
2589 | } | |
2590 | ||
2591 | if (($NROUNDS >= 11)) { | |
2592 | $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n"; | |
2593 | } | |
2594 | ||
2595 | # ;; ================================================= | |
2596 | # ;; first phase of reduction | |
2597 | if ($do_reduction != 0) { | |
2598 | $code .= <<___; | |
2599 | vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]} | |
2600 | vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs | |
2601 | vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction | |
2602 | ___ | |
2603 | } | |
2604 | ||
2605 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2606 | # ;; AES rounds up to 11 (AES192) or 13 (AES256) | |
2607 | # ;; AES128 is done | |
2608 | if (($NROUNDS >= 11)) { | |
2609 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2610 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2611 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2612 | $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n"; | |
2613 | ||
2614 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2615 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2616 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2617 | if (($NROUNDS == 13)) { | |
2618 | $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n"; | |
2619 | ||
2620 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2621 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2622 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2623 | $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n"; | |
2624 | ||
2625 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2626 | $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2627 | $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); | |
2628 | } | |
2629 | } | |
2630 | ||
2631 | # ;; ================================================= | |
2632 | # ;; second phase of the reduction | |
2633 | if ($do_reduction != 0) { | |
2634 | $code .= <<___; | |
2635 | vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]} | |
2636 | vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R | |
2637 | vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]} | |
2638 | vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts | |
2639 | # ;; GH1H = GH1H + RED_T1 + RED_T2 | |
2640 | vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]} | |
2641 | ___ | |
2642 | } | |
2643 | ||
2644 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2645 | # ;; the last AES round | |
2646 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2647 | $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2648 | $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); | |
2649 | ||
2650 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2651 | # ;; XOR against plain/cipher text | |
2652 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2653 | $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, | |
2654 | $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4); | |
2655 | ||
2656 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2657 | # ;; retrieve the last cipher counter block (partially XOR'ed with text) | |
2658 | # ;; - this is needed for partial block cases | |
2659 | if ($NUM_BLOCKS <= 4) { | |
2660 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n"; | |
2661 | } elsif ($NUM_BLOCKS <= 8) { | |
2662 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n"; | |
2663 | } elsif ($NUM_BLOCKS <= 12) { | |
2664 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n"; | |
2665 | } else { | |
2666 | $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n"; | |
2667 | } | |
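| # ;; e.g. NUM_BLOCKS = 7 takes the second branch and extracts lane | |
| # ;; 7 - 5 = 2 of B04_07, i.e. the 7th counter block overall | |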
2668 | ||
2669 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2670 | # ;; store cipher/plain text | |
2671 | $code .= "mov $CIPH_PLAIN_OUT,$IA0\n"; | |
2672 | &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG); | |
2673 | ||
2674 | # ;; ================================================= | |
2675 | # ;; shuffle cipher text blocks for GHASH computation | |
2676 | if ($ENC_DEC eq "ENC") { | |
2677 | ||
2678 | # ;; zero bytes outside the mask before hashing | |
2679 | if ($NUM_BLOCKS <= 4) { | |
2680 | $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n"; | |
2681 | } elsif ($NUM_BLOCKS <= 8) { | |
2682 | $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n"; | |
2683 | } elsif ($NUM_BLOCKS <= 12) { | |
2684 | $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n"; | |
2685 | } else { | |
2686 | $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n"; | |
2687 | } | |
2688 | ||
2689 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2690 | $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03, | |
2691 | $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); | |
2692 | } else { | |
2693 | ||
2694 | # ;; zero bytes outside the mask before hashing | |
2695 | if ($NUM_BLOCKS <= 4) { | |
2696 | $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n"; | |
2697 | } elsif ($NUM_BLOCKS <= 8) { | |
2698 | $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n"; | |
2699 | } elsif ($NUM_BLOCKS <= 12) { | |
2700 | $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n"; | |
2701 | } else { | |
2702 | $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n"; | |
2703 | } | |
2704 | ||
2705 | &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( | |
2706 | $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1, | |
2707 | $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); | |
2708 | } | |
2709 | ||
2710 | # ;; ================================================= | |
2711 | # ;; Extract the last block for partial / multi_call cases | |
2712 | if ($NUM_BLOCKS <= 4) { | |
2713 | $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n"; | |
2714 | } elsif ($NUM_BLOCKS <= 8) { | |
2715 | $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n"; | |
2716 | } elsif ($NUM_BLOCKS <= 12) { | |
2717 | $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n"; | |
2718 | } else { | |
2719 | $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n"; | |
2720 | } | |
2721 | ||
2722 | if ($do_reduction != 0) { | |
2723 | ||
2724 | # ;; GH1H holds reduced hash value | |
2725 | # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)" | |
2726 | # ;; - register rename trick obsoletes the above move | |
2727 | } | |
2728 | ||
2729 | # ;; ================================================= | |
2730 | # ;; GHASH last N blocks | |
2731 | # ;; - current hash value in HASH_IN_OUT or | |
2732 | # ;; product parts in TO_REDUCE_H/M/L | |
2733 | # ;; - DATA1-DATA4 include blocks for GHASH | |
2734 | ||
2735 | if ($do_reduction == 0) { | |
2736 | &INITIAL_BLOCKS_PARTIAL_GHASH( | |
2737 | $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, | |
2738 | &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2, | |
2739 | $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), | |
2740 | $B00_03, $B04_07, $B08_11, $B12_15, | |
2741 | $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2, | |
2742 | $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M, | |
2743 | $TO_REDUCE_L); | |
2744 | } else { | |
2745 | &INITIAL_BLOCKS_PARTIAL_GHASH( | |
2746 | $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, | |
2747 | &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2, | |
2748 | $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), | |
2749 | $B00_03, $B04_07, $B08_11, $B12_15, | |
2750 | $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2, | |
2751 | $GHKEY1, $PBLOCK_LEN); | |
2752 | } | |
2753 | } | |
2754 | ||
2755 | # ;; =========================================================================== | |
2756 | # ;; =========================================================================== | |
2757 | # ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks | |
2758 | # ;; followed with GHASH of the N blocks. | |
2759 | sub GCM_ENC_DEC_LAST { | |
2760 | my $AES_KEYS = $_[0]; # [in] key pointer | |
2761 | my $GCM128_CTX = $_[1]; # [in] context pointer | |
2762 | my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer | |
2763 | my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer | |
2764 | my $DATA_OFFSET = $_[4]; # [in] data offset | |
2765 | my $LENGTH = $_[5]; # [in/clobbered] data length | |
2766 | my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian | |
2767 | my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check | |
2768 | my $HASHKEY_OFFSET = $_[8]; # [in] offset to the highest hash key | |
2769 | # (can be passed as a register or a numerical constant) | |
2770 | my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH input blocks | |
2771 | my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb | |
2772 | my $ZT00 = $_[11]; # [clobbered] temporary ZMM | |
2773 | my $ZT01 = $_[12]; # [clobbered] temporary ZMM | |
2774 | my $ZT02 = $_[13]; # [clobbered] temporary ZMM | |
2775 | my $ZT03 = $_[14]; # [clobbered] temporary ZMM | |
2776 | my $ZT04 = $_[15]; # [clobbered] temporary ZMM | |
2777 | my $ZT05 = $_[16]; # [clobbered] temporary ZMM | |
2778 | my $ZT06 = $_[17]; # [clobbered] temporary ZMM | |
2779 | my $ZT07 = $_[18]; # [clobbered] temporary ZMM | |
2780 | my $ZT08 = $_[19]; # [clobbered] temporary ZMM | |
2781 | my $ZT09 = $_[20]; # [clobbered] temporary ZMM | |
2782 | my $ZT10 = $_[21]; # [clobbered] temporary ZMM | |
2783 | my $ZT11 = $_[22]; # [clobbered] temporary ZMM | |
2784 | my $ZT12 = $_[23]; # [clobbered] temporary ZMM | |
2785 | my $ZT13 = $_[24]; # [clobbered] temporary ZMM | |
2786 | my $ZT14 = $_[25]; # [clobbered] temporary ZMM | |
2787 | my $ZT15 = $_[26]; # [clobbered] temporary ZMM | |
2788 | my $ZT16 = $_[27]; # [clobbered] temporary ZMM | |
2789 | my $ZT17 = $_[28]; # [clobbered] temporary ZMM | |
2790 | my $ZT18 = $_[29]; # [clobbered] temporary ZMM | |
2791 | my $ZT19 = $_[30]; # [clobbered] temporary ZMM | |
2792 | my $ZT20 = $_[31]; # [clobbered] temporary ZMM | |
2793 | my $ZT21 = $_[32]; # [clobbered] temporary ZMM | |
2794 | my $ZT22 = $_[33]; # [clobbered] temporary ZMM | |
2795 | my $ADDBE_4x4 = $_[34]; # [in] ZMM with the value 4 in each 128-bit lane, big-endian | |
2796 | my $ADDBE_1234 = $_[35]; # [in] ZMM with values 1, 2, 3 and 4 in its 128-bit lanes, big-endian | |
2797 | my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce" | |
2798 | my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum | |
2799 | my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum | |
2800 | my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum | |
2801 | my $ENC_DEC = $_[40]; # [in] cipher direction | |
2802 | my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value | |
2803 | my $IA0 = $_[42]; # [clobbered] GP temporary | |
2804 | my $IA1 = $_[43]; # [clobbered] GP temporary | |
2805 | my $MASKREG = $_[44]; # [clobbered] mask register | |
2806 | my $PBLOCK_LEN = $_[45]; # [in] partial block length | |
2807 | ||
2808 | my $rndsuffix = &random_string(); | |
2809 | ||
2810 | $code .= <<___; | |
2811 | mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]} | |
2812 | add \$15,@{[DWORD($IA0)]} | |
2813 | shr \$4,@{[DWORD($IA0)]} | |
2814 | je .L_last_num_blocks_is_0_${rndsuffix} | |
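| # ;; IA0 = ceil(LENGTH / 16), e.g. LENGTH = 33 gives (33+15)>>4 = 3; | |
| # ;; shr sets ZF, so the 'je' above catches LENGTH = 0 | |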
2815 | ||
2816 | cmp \$8,@{[DWORD($IA0)]} | |
2817 | je .L_last_num_blocks_is_8_${rndsuffix} | |
2818 | jb .L_last_num_blocks_is_7_1_${rndsuffix} | |
2819 | ||
2820 | ||
2821 | cmp \$12,@{[DWORD($IA0)]} | |
2822 | je .L_last_num_blocks_is_12_${rndsuffix} | |
2823 | jb .L_last_num_blocks_is_11_9_${rndsuffix} | |
2824 | ||
2825 | # ;; 16, 15, 14 or 13 | |
2826 | cmp \$15,@{[DWORD($IA0)]} | |
2827 | je .L_last_num_blocks_is_15_${rndsuffix} | |
2828 | ja .L_last_num_blocks_is_16_${rndsuffix} | |
2829 | cmp \$14,@{[DWORD($IA0)]} | |
2830 | je .L_last_num_blocks_is_14_${rndsuffix} | |
2831 | jmp .L_last_num_blocks_is_13_${rndsuffix} | |
2832 | ||
2833 | .L_last_num_blocks_is_11_9_${rndsuffix}: | |
2834 | # ;; 11, 10 or 9 | |
2835 | cmp \$10,@{[DWORD($IA0)]} | |
2836 | je .L_last_num_blocks_is_10_${rndsuffix} | |
2837 | ja .L_last_num_blocks_is_11_${rndsuffix} | |
2838 | jmp .L_last_num_blocks_is_9_${rndsuffix} | |
2839 | ||
2840 | .L_last_num_blocks_is_7_1_${rndsuffix}: | |
2841 | cmp \$4,@{[DWORD($IA0)]} | |
2842 | je .L_last_num_blocks_is_4_${rndsuffix} | |
2843 | jb .L_last_num_blocks_is_3_1_${rndsuffix} | |
2844 | # ;; 7, 6 or 5 | |
2845 | cmp \$6,@{[DWORD($IA0)]} | |
2846 | ja .L_last_num_blocks_is_7_${rndsuffix} | |
2847 | je .L_last_num_blocks_is_6_${rndsuffix} | |
2848 | jmp .L_last_num_blocks_is_5_${rndsuffix} | |
2849 | ||
2850 | .L_last_num_blocks_is_3_1_${rndsuffix}: | |
2851 | # ;; 3, 2 or 1 | |
2852 | cmp \$2,@{[DWORD($IA0)]} | |
2853 | ja .L_last_num_blocks_is_3_${rndsuffix} | |
2854 | je .L_last_num_blocks_is_2_${rndsuffix} | |
2855 | ___ | |
2856 | ||
2857 | # ;; fall through to the 1-block case, so no 'jmp .L_last_num_blocks_is_1' is needed | |
2858 | ||
2859 | # ;; Use a loop to generate the different block size variants | |
2860 | # ;; - one block size has to be the first one | |
2861 | for my $num_blocks (1 .. 16) { | |
2862 | $code .= ".L_last_num_blocks_is_${num_blocks}_${rndsuffix}:\n"; | |
2863 | &GHASH_16_ENCRYPT_N_GHASH_N( | |
2864 | $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, | |
2865 | $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET, | |
2866 | $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03, | |
2867 | $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, | |
2868 | $ZT09, $ZT10, $ZT11, $ZT12, $ZT13, | |
2869 | $ZT14, $ZT15, $ZT16, $ZT17, $ZT18, | |
2870 | $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4, | |
2871 | $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M, | |
2872 | $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG, | |
2873 | $num_blocks, $PBLOCK_LEN); | |
2874 | ||
2875 | $code .= "jmp .L_last_blocks_done_${rndsuffix}\n"; | |
2876 | } | |
2877 | ||
2878 | $code .= ".L_last_num_blocks_is_0_${rndsuffix}:\n"; | |
2879 | ||
2880 | # ;; if there are no blocks to cipher then there are only 16 blocks for ghash and reduction | |
2881 | # ;; - convert mid into end_reduce | |
2882 | # ;; - convert start into start_reduce | |
2883 | if ($GHASH_TYPE eq "mid") { | |
2884 | $GHASH_TYPE = "end_reduce"; | |
2885 | } | |
2886 | if ($GHASH_TYPE eq "start") { | |
2887 | $GHASH_TYPE = "start_reduce"; | |
2888 | } | |
2889 | ||
2890 | &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp", | |
2891 | $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01, | |
2892 | $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09); | |
2893 | ||
2894 | $code .= ".L_last_blocks_done_${rndsuffix}:\n"; | |
2895 | } | |
2896 | ||
2897 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2898 | # ;; Main GCM macro stitching cipher with GHASH | |
2899 | # ;; - operates on single stream | |
2900 | # ;; - encrypts 16 blocks at a time | |
2901 | # ;; - ghash the 16 previously encrypted ciphertext blocks | |
2902 | # ;; - no partial block or multi_call handling here | |
2903 | sub GHASH_16_ENCRYPT_16_PARALLEL { | |
2904 | my $AES_KEYS = $_[0]; # [in] key pointer | |
2905 | my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer | |
2906 | my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer | |
2907 | my $DATA_OFFSET = $_[3]; # [in] data offset | |
2908 | my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian | |
2909 | my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check | |
2910 | my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value) | |
2911 | my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR output blocks | |
2912 | my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH input blocks | |
2913 | my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb | |
2914 | my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher) | |
2915 | my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher) | |
2916 | my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher) | |
2917 | my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher) | |
2918 | my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction) | |
2919 | my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher) | |
2920 | my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher) | |
2921 | my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher) | |
2922 | my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher) | |
2923 | my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash) | |
2924 | my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash) | |
2925 | my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash) | |
2926 | my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash) | |
2927 | my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash) | |
2928 | my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash) | |
2929 | my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash) | |
2930 | my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash) | |
2931 | my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash) | |
2932 | my $ZT19 = $_[28]; # [clobbered] temporary ZMM | |
2933 | my $ZT20 = $_[29]; # [clobbered] temporary ZMM | |
2934 | my $ZT21 = $_[30]; # [clobbered] temporary ZMM | |
2935 | my $ZT22 = $_[31]; # [clobbered] temporary ZMM | |
2936 | my $ZT23 = $_[32]; # [clobbered] temporary ZMM | |
2937 | my $ADDBE_4x4 = $_[33]; # [in] ZMM with the value 4 in each 128-bit lane, big-endian | |
2938 | my $ADDBE_1234 = $_[34]; # [in] ZMM with values 1, 2, 3 and 4 in its 128-bit lanes, big-endian | |
2939 | my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum | |
2940 | my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum | |
2941 | my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum | |
2942 | my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time" | |
2943 | my $ENC_DEC = $_[39]; # [in] cipher direction | |
2944 | my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset | |
2945 | my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in" | |
2946 | my $IA0 = $_[42]; # [clobbered] temporary GPR | |
2947 | ||
2948 | my $B00_03 = $ZT1; | |
2949 | my $B04_07 = $ZT2; | |
2950 | my $B08_11 = $ZT3; | |
2951 | my $B12_15 = $ZT4; | |
2952 | ||
2953 | my $GH1H = $ZT5; | |
2954 | ||
2955 | # ; @note: do not change this mapping | |
2956 | my $GH1L = $ZT6; | |
2957 | my $GH1M = $ZT7; | |
2958 | my $GH1T = $ZT8; | |
2959 | ||
2960 | my $GH2H = $ZT9; | |
2961 | my $GH2L = $ZT10; | |
2962 | my $GH2M = $ZT11; | |
2963 | my $GH2T = $ZT12; | |
2964 | ||
2965 | my $RED_POLY = $GH2T; | |
2966 | my $RED_P1 = $GH2L; | |
2967 | my $RED_T1 = $GH2H; | |
2968 | my $RED_T2 = $GH2M; | |
2969 | ||
2970 | my $GH3H = $ZT13; | |
2971 | my $GH3L = $ZT14; | |
2972 | my $GH3M = $ZT15; | |
2973 | my $GH3T = $ZT16; | |
2974 | ||
2975 | my $DATA1 = $ZT13; | |
2976 | my $DATA2 = $ZT14; | |
2977 | my $DATA3 = $ZT15; | |
2978 | my $DATA4 = $ZT16; | |
2979 | ||
2980 | my $AESKEY1 = $ZT17; | |
2981 | my $AESKEY2 = $ZT18; | |
2982 | ||
2983 | my $GHKEY1 = $ZT19; | |
2984 | my $GHKEY2 = $ZT20; | |
2985 | my $GHDAT1 = $ZT21; | |
2986 | my $GHDAT2 = $ZT22; | |
2987 | ||
2988 | my $rndsuffix = &random_string(); | |
2989 | ||
2990 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
2991 | # ;; prepare counter blocks | |
2992 | ||
2993 | $code .= <<___; | |
2994 | cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]} | |
2995 | jae .L_16_blocks_overflow_${rndsuffix} | |
2996 | vpaddd $ADDBE_1234,$CTR_BE,$B00_03 | |
2997 | vpaddd $ADDBE_4x4,$B00_03,$B04_07 | |
2998 | vpaddd $ADDBE_4x4,$B04_07,$B08_11 | |
2999 | vpaddd $ADDBE_4x4,$B08_11,$B12_15 | |
3000 | jmp .L_16_blocks_ok_${rndsuffix} | |
3001 | .L_16_blocks_overflow_${rndsuffix}: | |
3002 | vpshufb $SHFMSK,$CTR_BE,$CTR_BE | |
3003 | vmovdqa64 ddq_add_4444(%rip),$B12_15 | |
3004 | vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03 | |
3005 | vpaddd $B12_15,$B00_03,$B04_07 | |
3006 | vpaddd $B12_15,$B04_07,$B08_11 | |
3007 | vpaddd $B12_15,$B08_11,$B12_15 | |
3008 | vpshufb $SHFMSK,$B00_03,$B00_03 | |
3009 | vpshufb $SHFMSK,$B04_07,$B04_07 | |
3010 | vpshufb $SHFMSK,$B08_11,$B08_11 | |
3011 | vpshufb $SHFMSK,$B12_15,$B12_15 | |
3012 | .L_16_blocks_ok_${rndsuffix}: | |
3013 | ___ | |
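| # ;; Counter handling note: the counter blocks are kept byte-swapped | |
| # ;; (big-endian), so plain vpaddd with the ADDBE_* constants is | |
| # ;; correct only while increments stay within the lowest counter | |
| # ;; byte. Once BYTE(CTR_CHECK) >= 256 - 16, adding 16 would carry | |
| # ;; out of that byte, so the overflow path above swaps to little | |
| # ;; endian, adds with a full 32-bit carry, and swaps back. | |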
3014 | ||
3015 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3016 | # ;; pre-load constants | |
3017 | $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n"; | |
3018 | if ($GHASH_IN ne "no_ghash_in") { | |
3019 | $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n"; | |
3020 | } else { | |
3021 | $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n"; | |
3022 | } | |
3023 | ||
3024 | $code .= <<___; | |
3025 | vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1 | |
3026 | ||
3027 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3028 | # ;; save counter for the next round | |
3029 | # ;; increment counter overflow check register | |
3030 | vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE | |
3031 | addb \$16,@{[BYTE($CTR_CHECK)]} | |
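| # ;; (immediate 0b11111111 replicates the highest 128-bit lane, | |
| # ;; i.e. counter block 15, into all four lanes of $CTR_BE) | |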
3032 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3033 | # ;; pre-load constants | |
3034 | vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2 | |
3035 | vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2 | |
3036 | vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2 | |
3037 | ||
3038 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3039 | # ;; stitch AES rounds with GHASH | |
3040 | ||
3041 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3042 | # ;; AES round 0 - ARK | |
3043 | ||
3044 | vpxorq $AESKEY1,$B00_03,$B00_03 | |
3045 | vpxorq $AESKEY1,$B04_07,$B04_07 | |
3046 | vpxorq $AESKEY1,$B08_11,$B08_11 | |
3047 | vpxorq $AESKEY1,$B12_15,$B12_15 | |
3048 | vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1 | |
3049 | ||
3050 | # ;;================================================== | |
3051 | # ;; GHASH 4 blocks (15 to 12) | |
3052 | vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1 | |
3053 | vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0 | |
3054 | vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0 | |
3055 | vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1 | |
3056 | vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1 | |
3057 | vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1 | |
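| # ;; The four vpclmulqdq above multiply four ciphertext blocks by | |
| # ;; four hash key powers at once; across all 16 blocks this forms | |
| # ;; GHASH(X1..X16) = X1*H^16 + X2*H^15 + ... + X16*H, which equals | |
| # ;; 16 serial hash updates but needs only one final reduction. | |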
3058 | ||
3059 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3060 | # ;; AES round 1 | |
3061 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3062 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3063 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3064 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3065 | vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2 | |
3066 | ||
3067 | # ;; ================================================= | |
3068 | # ;; GHASH 4 blocks (11 to 8) | |
3069 | vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 | |
3070 | vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 | |
3071 | vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 | |
3072 | vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 | |
3073 | vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2 | |
3074 | vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2 | |
3075 | ||
3076 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3077 | # ;; AES round 2 | |
3078 | vaesenc $AESKEY1,$B00_03,$B00_03 | |
3079 | vaesenc $AESKEY1,$B04_07,$B04_07 | |
3080 | vaesenc $AESKEY1,$B08_11,$B08_11 | |
3081 | vaesenc $AESKEY1,$B12_15,$B12_15 | |
3082 | vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1 | |
3083 | ||
3084 | # ;; ================================================= | |
3085 | # ;; GHASH 4 blocks (7 to 4) | |
3086 | vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1 | |
3087 | vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0 | |
3088 | vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1 | |
3089 | vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0 | |
3090 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3091 | # ;; AES round 3 | |
3092 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3093 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3094 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3095 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3096 | vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2 | |
3097 | ||
3098 | # ;; ================================================= | |
3099 | # ;; Gather (XOR) GHASH for 12 blocks | |
3100 | vpternlogq \$0x96,$GH3H,$GH2H,$GH1H | |
3101 | vpternlogq \$0x96,$GH3L,$GH2L,$GH1L | |
3102 | vpternlogq \$0x96,$GH3T,$GH2T,$GH1T | |
3103 | vpternlogq \$0x96,$GH3M,$GH2M,$GH1M | |
3104 | ||
3105 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3106 | # ;; AES round 4 | |
3107 | vaesenc $AESKEY1,$B00_03,$B00_03 | |
3108 | vaesenc $AESKEY1,$B04_07,$B04_07 | |
3109 | vaesenc $AESKEY1,$B08_11,$B08_11 | |
3110 | vaesenc $AESKEY1,$B12_15,$B12_15 | |
3111 | vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1 | |
3112 | ||
3113 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3114 | # ;; load plain/cipher text (recycle GH3xx registers) | |
3115 | vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1 | |
3116 | vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2 | |
3117 | vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3 | |
3118 | vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4 | |
3119 | ||
3120 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3121 | # ;; AES round 5 | |
3122 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3123 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3124 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3125 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3126 | vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2 | |
3127 | ||
3128 | # ;; ================================================= | |
3129 | # ;; GHASH 4 blocks (3 to 0) | |
3130 | vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 | |
3131 | vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 | |
3132 | vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 | |
3133 | vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 | |
3134 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3135 | # ;; AES round 6 | |
3136 | vaesenc $AESKEY1,$B00_03,$B00_03 | |
3137 | vaesenc $AESKEY1,$B04_07,$B04_07 | |
3138 | vaesenc $AESKEY1,$B08_11,$B08_11 | |
3139 | vaesenc $AESKEY1,$B12_15,$B12_15 | |
3140 | vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1 | |
3141 | ___ | |
3142 | ||
3143 | # ;; ================================================= | |
3144 | # ;; gather GHASH in GH1L (low) and GH1H (high) | |
3145 | if ($DO_REDUCTION eq "first_time") { | |
3146 | $code .= <<___; | |
3147 | vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM | |
3148 | vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM | |
3149 | vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH | |
3150 | vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL | |
3151 | ___ | |
3152 | } | |
3153 | if ($DO_REDUCTION eq "no_reduction") { | |
3154 | $code .= <<___; | |
3155 | vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM | |
3156 | vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM | |
3157 | vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH | |
3158 | vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL | |
3159 | ___ | |
3160 | } | |
3161 | if ($DO_REDUCTION eq "final_reduction") { | |
3162 | $code .= <<___; | |
3163 | # ;; phase 1: add mid products together | |
3164 | # ;; also load polynomial constant for reduction | |
3165 | vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM | |
3166 | vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M | |
3167 | ||
3168 | vpsrldq \$8,$GH1M,$GH2M | |
3169 | vpslldq \$8,$GH1M,$GH1M | |
3170 | ||
3171 | vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]} | |
3172 | ___ | |
3173 | } | |
3174 | ||
3175 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3176 | # ;; AES round 7 | |
3177 | $code .= <<___; | |
3178 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3179 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3180 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3181 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3182 | vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2 | |
3183 | ___ | |
3184 | ||
3185 | # ;; ================================================= | |
3186 | # ;; Add mid product to high and low | |
3187 | if ($DO_REDUCTION eq "final_reduction") { | |
3188 | $code .= <<___; | |
3189 | vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64 | |
3190 | vpxorq $TO_REDUCE_H,$GH1H,$GH1H | |
3191 | vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64 | |
3192 | vpxorq $TO_REDUCE_L,$GH1L,$GH1L | |
3193 | ___ | |
3194 | } | |
3195 | ||
3196 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3197 | # ;; AES round 8 | |
3198 | $code .= <<___; | |
3199 | vaesenc $AESKEY1,$B00_03,$B00_03 | |
3200 | vaesenc $AESKEY1,$B04_07,$B04_07 | |
3201 | vaesenc $AESKEY1,$B08_11,$B08_11 | |
3202 | vaesenc $AESKEY1,$B12_15,$B12_15 | |
3203 | vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1 | |
3204 | ___ | |
3205 | ||
3206 | # ;; ================================================= | |
3207 | # ;; horizontal xor of low and high 4x128 | |
3208 | if ($DO_REDUCTION eq "final_reduction") { | |
3209 | &VHPXORI4x128($GH1H, $GH2H); | |
3210 | &VHPXORI4x128($GH1L, $GH2L); | |
3211 | } | |
3212 | ||
3213 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3214 | # ;; AES round 9 | |
3215 | $code .= <<___; | |
3216 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3217 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3218 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3219 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3220 | ___ | |
3221 | if (($NROUNDS >= 11)) { | |
3222 | $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n"; | |
3223 | } | |
3224 | ||
3225 | # ;; ================================================= | |
3226 | # ;; first phase of reduction | |
3227 | if ($DO_REDUCTION eq "final_reduction") { | |
3228 | $code .= <<___; | |
3229 | vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]} | |
3230 | vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs | |
3231 | vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction | |
3232 | ___ | |
3233 | } | |
3234 | ||
3235 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3236 | # ;; AES rounds up to 11 (AES192) or 13 (AES256) | |
3237 | # ;; AES128 is done | |
3238 | if (($NROUNDS >= 11)) { | |
3239 | $code .= <<___; | |
3240 | vaesenc $AESKEY1,$B00_03,$B00_03 | |
3241 | vaesenc $AESKEY1,$B04_07,$B04_07 | |
3242 | vaesenc $AESKEY1,$B08_11,$B08_11 | |
3243 | vaesenc $AESKEY1,$B12_15,$B12_15 | |
3244 | vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1 | |
3245 | ||
3246 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3247 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3248 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3249 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3250 | ___ | |
3251 | if (($NROUNDS == 13)) { | |
3252 | $code .= <<___; | |
3253 | vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2 | |
3254 | ||
3255 | vaesenc $AESKEY1,$B00_03,$B00_03 | |
3256 | vaesenc $AESKEY1,$B04_07,$B04_07 | |
3257 | vaesenc $AESKEY1,$B08_11,$B08_11 | |
3258 | vaesenc $AESKEY1,$B12_15,$B12_15 | |
3259 | vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1 | |
3260 | ||
3261 | vaesenc $AESKEY2,$B00_03,$B00_03 | |
3262 | vaesenc $AESKEY2,$B04_07,$B04_07 | |
3263 | vaesenc $AESKEY2,$B08_11,$B08_11 | |
3264 | vaesenc $AESKEY2,$B12_15,$B12_15 | |
3265 | ___ | |
3266 | } | |
3267 | } | |
3268 | ||
3269 | # ;; ================================================= | |
3270 | # ;; second phase of the reduction | |
3271 | if ($DO_REDUCTION eq "final_reduction") { | |
3272 | $code .= <<___; | |
3273 | vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]} | |
3274 | vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R | |
3275 | vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]} | |
3276 | vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts | |
3277 | # ;; GH1H = GH1H + RED_T1 + RED_T2 | |
3278 | vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]} | |
3279 | ___ | |
3280 | } | |
3281 | ||
3282 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3283 | # ;; the last AES round | |
3284 | $code .= <<___; | |
3285 | vaesenclast $AESKEY1,$B00_03,$B00_03 | |
3286 | vaesenclast $AESKEY1,$B04_07,$B04_07 | |
3287 | vaesenclast $AESKEY1,$B08_11,$B08_11 | |
3288 | vaesenclast $AESKEY1,$B12_15,$B12_15 | |
3289 | ||
3290 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3291 | # ;; XOR against plain/cipher text | |
3292 | vpxorq $DATA1,$B00_03,$B00_03 | |
3293 | vpxorq $DATA2,$B04_07,$B04_07 | |
3294 | vpxorq $DATA3,$B08_11,$B08_11 | |
3295 | vpxorq $DATA4,$B12_15,$B12_15 | |
3296 | ||
3297 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3298 | # ;; store cipher/plain text | |
3299 | mov $CIPH_PLAIN_OUT,$IA0 | |
3300 | vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1) | |
3301 | vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1) | |
3302 | vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1) | |
3303 | vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1) | |
3304 | ___ | |
3305 | ||
3306 | # ;; ================================================= | |
3307 | # ;; shuffle cipher text blocks for GHASH computation | |
3308 | if ($ENC_DEC eq "ENC") { | |
3309 | $code .= <<___; | |
3310 | vpshufb $SHFMSK,$B00_03,$B00_03 | |
3311 | vpshufb $SHFMSK,$B04_07,$B04_07 | |
3312 | vpshufb $SHFMSK,$B08_11,$B08_11 | |
3313 | vpshufb $SHFMSK,$B12_15,$B12_15 | |
3314 | ___ | |
3315 | } else { | |
3316 | $code .= <<___; | |
3317 | vpshufb $SHFMSK,$DATA1,$B00_03 | |
3318 | vpshufb $SHFMSK,$DATA2,$B04_07 | |
3319 | vpshufb $SHFMSK,$DATA3,$B08_11 | |
3320 | vpshufb $SHFMSK,$DATA4,$B12_15 | |
3321 | ___ | |
3322 | } | |
3323 | ||
3324 | # ;; ================================================= | |
3325 | # ;; store shuffled cipher text for ghashing | |
3326 | $code .= <<___; | |
3327 | vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp) | |
3328 | vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp) | |
3329 | vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp) | |
3330 | vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp) | |
3331 | ___ | |
3332 | } | |
3333 | ||
3334 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3335 | # ;;; Encryption of a single block | |
3336 | sub ENCRYPT_SINGLE_BLOCK { | |
3337 | my $AES_KEY = $_[0]; # ; [in] | |
3338 | my $XMM0 = $_[1]; # ; [in/out] | |
3339 | my $GPR1 = $_[2]; # ; [clobbered] | |
3340 | ||
3341 | my $rndsuffix = &random_string(); | |
3342 | ||
3343 | $code .= <<___; | |
3344 | # ; load number of rounds from AES_KEY structure (offset in bytes is | |
3345 | # ; size of the |rd_key| buffer) | |
3346 | mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]} | |
3347 | cmp \$9,@{[DWORD($GPR1)]} | |
3348 | je .Laes_128_${rndsuffix} | |
3349 | cmp \$11,@{[DWORD($GPR1)]} | |
3350 | je .Laes_192_${rndsuffix} | |
3351 | cmp \$13,@{[DWORD($GPR1)]} | |
3352 | je .Laes_256_${rndsuffix} | |
3353 | jmp .Lexit_aes_${rndsuffix} | |
3354 | ___ | |
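| # ; The compare values 9/11/13 are assumed to be the %aes_rounds | |
| # ; entries (128 => 9, 192 => 11, 256 => 13): together with the | |
| # ; initial vpxorq and the final vaesenclast, $nr middle vaesenc | |
| # ; rounds give the full 10/12/14 rounds of AES-128/192/256. | |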
3355 | for my $keylen (sort keys %aes_rounds) { | |
3356 | my $nr = $aes_rounds{$keylen}; | |
3357 | $code .= <<___; | |
3358 | .align 32 | |
3359 | .Laes_${keylen}_${rndsuffix}: | |
3360 | ___ | |
3361 | $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n"; | |
3362 | for (my $i = 1; $i <= $nr; $i++) { | |
3363 | $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n"; | |
3364 | } | |
3365 | $code .= <<___; | |
3366 | vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0 | |
3367 | jmp .Lexit_aes_${rndsuffix} | |
3368 | ___ | |
3369 | } | |
3370 | $code .= ".Lexit_aes_${rndsuffix}:\n\n"; | |
3371 | } | |
3372 | ||
3373 | sub CALC_J0 { | |
3374 | my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context | |
3375 | my $IV = $_[1]; #; [in] Pointer to IV | |
3376 | my $IV_LEN = $_[2]; #; [in] IV length | |
3377 | my $J0 = $_[3]; #; [out] XMM reg to contain J0 | |
3378 | my $ZT0 = $_[4]; #; [clobbered] ZMM register | |
3379 | my $ZT1 = $_[5]; #; [clobbered] ZMM register | |
3380 | my $ZT2 = $_[6]; #; [clobbered] ZMM register | |
3381 | my $ZT3 = $_[7]; #; [clobbered] ZMM register | |
3382 | my $ZT4 = $_[8]; #; [clobbered] ZMM register | |
3383 | my $ZT5 = $_[9]; #; [clobbered] ZMM register | |
3384 | my $ZT6 = $_[10]; #; [clobbered] ZMM register | |
3385 | my $ZT7 = $_[11]; #; [clobbered] ZMM register | |
3386 | my $ZT8 = $_[12]; #; [clobbered] ZMM register | |
3387 | my $ZT9 = $_[13]; #; [clobbered] ZMM register | |
3388 | my $ZT10 = $_[14]; #; [clobbered] ZMM register | |
3389 | my $ZT11 = $_[15]; #; [clobbered] ZMM register | |
3390 | my $ZT12 = $_[16]; #; [clobbered] ZMM register | |
3391 | my $ZT13 = $_[17]; #; [clobbered] ZMM register | |
3392 | my $ZT14 = $_[18]; #; [clobbered] ZMM register | |
3393 | my $ZT15 = $_[19]; #; [clobbered] ZMM register | |
3394 | my $ZT16 = $_[20]; #; [clobbered] ZMM register | |
3395 | my $T1 = $_[21]; #; [clobbered] GP register | |
3396 | my $T2 = $_[22]; #; [clobbered] GP register | |
3397 | my $T3 = $_[23]; #; [clobbered] GP register | |
3398 | my $MASKREG = $_[24]; #; [clobbered] mask register | |
3399 | ||
3400 | # ;; J0 = GHASH(IV || 0^(s+64) || [len(IV)]_64) | |
3401 | # ;; where s = 16 * RoundUp(len(IV)/16) - len(IV) | |
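| # ;; e.g. a 10-byte IV gives s = 16*1 - 10 = 6 bytes, so J0 hashes | |
| # ;; the block (IV || 0^48) followed by (0^64 || [80]_64) | |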
3402 | ||
3403 | # ;; Calculate GHASH of (IV || 0s) | |
3404 | $code .= "vpxor $J0,$J0,$J0\n"; | |
3405 | &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, | |
3406 | $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG); | |
3407 | ||
3408 | # ;; Calculate GHASH of the last 16-byte block (0^64 || [len(IV)]_64) | |
3409 | $code .= <<___; | |
3410 | mov $IV_LEN,$T1 | |
3411 | shl \$3,$T1 # ; IV length in bits | |
3412 | vmovq $T1,@{[XWORD($ZT2)]} | |
3413 | ||
3414 | # ;; Might need shuffle of ZT2 | |
3415 | vpxorq $J0,@{[XWORD($ZT2)]},$J0 | |
3416 | ||
3417 | vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]} | |
3418 | ___ | |
3419 | &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]}); | |
3420 | ||
3421 | $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n"; | |
3422 | } | |
3423 | ||
3424 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3425 | # ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for | |
3426 | # ;;; encoding/decoding. | |
3427 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3428 | sub GCM_INIT_IV { | |
3429 | my $AES_KEYS = $_[0]; # [in] AES key schedule | |
3430 | my $GCM128_CTX = $_[1]; # [in/out] GCM context | |
3431 | my $IV = $_[2]; # [in] IV pointer | |
3432 | my $IV_LEN = $_[3]; # [in] IV length | |
3433 | my $GPR1 = $_[4]; # [clobbered] GP register | |
3434 | my $GPR2 = $_[5]; # [clobbered] GP register | |
3435 | my $GPR3 = $_[6]; # [clobbered] GP register | |
3436 | my $MASKREG = $_[7]; # [clobbered] mask register | |
3437 | my $CUR_COUNT = $_[8]; # [out] XMM with current counter | |
3438 | my $ZT0 = $_[9]; # [clobbered] ZMM register | |
3439 | my $ZT1 = $_[10]; # [clobbered] ZMM register | |
3440 | my $ZT2 = $_[11]; # [clobbered] ZMM register | |
3441 | my $ZT3 = $_[12]; # [clobbered] ZMM register | |
3442 | my $ZT4 = $_[13]; # [clobbered] ZMM register | |
3443 | my $ZT5 = $_[14]; # [clobbered] ZMM register | |
3444 | my $ZT6 = $_[15]; # [clobbered] ZMM register | |
3445 | my $ZT7 = $_[16]; # [clobbered] ZMM register | |
3446 | my $ZT8 = $_[17]; # [clobbered] ZMM register | |
3447 | my $ZT9 = $_[18]; # [clobbered] ZMM register | |
3448 | my $ZT10 = $_[19]; # [clobbered] ZMM register | |
3449 | my $ZT11 = $_[20]; # [clobbered] ZMM register | |
3450 | my $ZT12 = $_[21]; # [clobbered] ZMM register | |
3451 | my $ZT13 = $_[22]; # [clobbered] ZMM register | |
3452 | my $ZT14 = $_[23]; # [clobbered] ZMM register | |
3453 | my $ZT15 = $_[24]; # [clobbered] ZMM register | |
3454 | my $ZT16 = $_[25]; # [clobbered] ZMM register | |
3455 | ||
3456 | my $ZT0x = $ZT0; | |
3457 | $ZT0x =~ s/zmm/xmm/; | |
3458 | ||
3459 | $code .= <<___; | |
3460 | cmp \$12,$IV_LEN | |
3461 | je iv_len_12_init_IV | |
3462 | ___ | |
3463 | ||
3464 | # ;; IV is different than 12 bytes | |
3465 | &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, | |
3466 | $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG); | |
3467 | $code .= <<___; | |
3468 | jmp skip_iv_len_12_init_IV | |
3469 | iv_len_12_init_IV: # ;; IV is 12 bytes | |
3470 | # ;; read 12 IV bytes and pad with 0x00000001 | |
3471 | vmovdqu8 ONEf(%rip),$CUR_COUNT | |
3472 | mov $IV,$GPR2 | |
3473 | mov \$0x0000000000000fff,@{[DWORD($GPR1)]} | |
3474 | kmovq $GPR1,$MASKREG | |
3475 | vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1 | |
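| # ;; (0xfff sets the low 12 mask bits, so the masked vmovdqu8 | |
| # ;; overwrites only bytes 0-11 of CUR_COUNT with the IV, keeping | |
| # ;; ONEf's 0x00000001 tail in bytes 12-15) | |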
3476 | skip_iv_len_12_init_IV: | |
3477 | vmovdqu $CUR_COUNT,$ZT0x | |
3478 | ___ | |
3479 | &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0) | |
3480 | $code .= <<___; | |
3481 | vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage | |
3482 | ||
3483 | # ;; store IV as counter in LE format | |
3484 | vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT | |
3485 | vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi | |
3486 | ___ | |
3487 | } | |
3488 | ||
3489 | sub GCM_UPDATE_AAD { | |
3490 | my $GCM128_CTX = $_[0]; # [in] GCM context pointer | |
3491 | my $A_IN = $_[1]; # [in] AAD pointer | |
3492 | my $A_LEN = $_[2]; # [in] AAD length in bytes | |
3493 | my $GPR1 = $_[3]; # [clobbered] GP register | |
3494 | my $GPR2 = $_[4]; # [clobbered] GP register | |
3495 | my $GPR3 = $_[5]; # [clobbered] GP register | |
3496 | my $MASKREG = $_[6]; # [clobbered] mask register | |
3497 | my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value | |
3498 | my $ZT0 = $_[8]; # [clobbered] ZMM register | |
3499 | my $ZT1 = $_[9]; # [clobbered] ZMM register | |
3500 | my $ZT2 = $_[10]; # [clobbered] ZMM register | |
3501 | my $ZT3 = $_[11]; # [clobbered] ZMM register | |
3502 | my $ZT4 = $_[12]; # [clobbered] ZMM register | |
3503 | my $ZT5 = $_[13]; # [clobbered] ZMM register | |
3504 | my $ZT6 = $_[14]; # [clobbered] ZMM register | |
3505 | my $ZT7 = $_[15]; # [clobbered] ZMM register | |
3506 | my $ZT8 = $_[16]; # [clobbered] ZMM register | |
3507 | my $ZT9 = $_[17]; # [clobbered] ZMM register | |
3508 | my $ZT10 = $_[18]; # [clobbered] ZMM register | |
3509 | my $ZT11 = $_[19]; # [clobbered] ZMM register | |
3510 | my $ZT12 = $_[20]; # [clobbered] ZMM register | |
3511 | my $ZT13 = $_[21]; # [clobbered] ZMM register | |
3512 | my $ZT14 = $_[22]; # [clobbered] ZMM register | |
3513 | my $ZT15 = $_[23]; # [clobbered] ZMM register | |
3514 | my $ZT16 = $_[24]; # [clobbered] ZMM register | |
3515 | ||
3516 | # ; load current hash | |
3517 | $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n"; | |
3518 | ||
3519 | &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2, | |
3520 | $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, | |
3521 | $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG); | |
3522 | ||
3523 | # ; load current hash | |
3524 | $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n"; | |
3525 | } | |
3526 | ||
3527 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3528 | # ;;; Cipher and ghash of payloads shorter than 256 bytes | |
3529 | # ;;; - number of blocks in the message comes as argument | |
3530 | # ;;; - depending on the number of blocks an optimized variant of | |
3531 | # ;;; INITIAL_BLOCKS_PARTIAL is invoked | |
3532 | sub GCM_ENC_DEC_SMALL { | |
3533 | my $AES_KEYS = $_[0]; # [in] key pointer | |
3534 | my $GCM128_CTX = $_[1]; # [in] context pointer | |
3535 | my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer | |
3536 | my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer | |
3537 | my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length | |
3538 | my $ENC_DEC = $_[5]; # [in] cipher direction | |
3539 | my $DATA_OFFSET = $_[6]; # [in] data offset | |
3540 | my $LENGTH = $_[7]; # [in] data length | |
3541 | my $NUM_BLOCKS = $_[8]; # [in] number of blocks to process 1 to 16 | |
3542 | my $CTR = $_[9]; # [in/out] XMM counter block | |
3543 | my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value | |
3544 | my $ZTMP0 = $_[11]; # [clobbered] ZMM register | |
3545 | my $ZTMP1 = $_[12]; # [clobbered] ZMM register | |
3546 | my $ZTMP2 = $_[13]; # [clobbered] ZMM register | |
3547 | my $ZTMP3 = $_[14]; # [clobbered] ZMM register | |
3548 | my $ZTMP4 = $_[15]; # [clobbered] ZMM register | |
3549 | my $ZTMP5 = $_[16]; # [clobbered] ZMM register | |
3550 | my $ZTMP6 = $_[17]; # [clobbered] ZMM register | |
3551 | my $ZTMP7 = $_[18]; # [clobbered] ZMM register | |
3552 | my $ZTMP8 = $_[19]; # [clobbered] ZMM register | |
3553 | my $ZTMP9 = $_[20]; # [clobbered] ZMM register | |
3554 | my $ZTMP10 = $_[21]; # [clobbered] ZMM register | |
3555 | my $ZTMP11 = $_[22]; # [clobbered] ZMM register | |
3556 | my $ZTMP12 = $_[23]; # [clobbered] ZMM register | |
3557 | my $ZTMP13 = $_[24]; # [clobbered] ZMM register | |
3558 | my $ZTMP14 = $_[25]; # [clobbered] ZMM register | |
3559 | my $IA0 = $_[26]; # [clobbered] GP register | |
3560 | my $IA1 = $_[27]; # [clobbered] GP register | |
3561 | my $MASKREG = $_[28]; # [clobbered] mask register | |
3562 | my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask | |
3563 | my $PBLOCK_LEN = $_[30]; # [in] partial block length | |
3564 | ||
3565 | my $rndsuffix = &random_string(); | |
3566 | ||
3567 | $code .= <<___; | |
3568 | cmp \$8,$NUM_BLOCKS | |
3569 | je .L_small_initial_num_blocks_is_8_${rndsuffix} | |
3570 | jl .L_small_initial_num_blocks_is_7_1_${rndsuffix} | |
3571 | ||
3572 | ||
3573 | cmp \$12,$NUM_BLOCKS | |
3574 | je .L_small_initial_num_blocks_is_12_${rndsuffix} | |
3575 | jl .L_small_initial_num_blocks_is_11_9_${rndsuffix} | |
3576 | ||
3577 | # ;; 16, 15, 14 or 13 | |
3578 | cmp \$16,$NUM_BLOCKS | |
3579 | je .L_small_initial_num_blocks_is_16_${rndsuffix} | |
3580 | cmp \$15,$NUM_BLOCKS | |
3581 | je .L_small_initial_num_blocks_is_15_${rndsuffix} | |
3582 | cmp \$14,$NUM_BLOCKS | |
3583 | je .L_small_initial_num_blocks_is_14_${rndsuffix} | |
3584 | jmp .L_small_initial_num_blocks_is_13_${rndsuffix} | |
3585 | ||
3586 | .L_small_initial_num_blocks_is_11_9_${rndsuffix}: | |
3587 | # ;; 11, 10 or 9 | |
3588 | cmp \$11,$NUM_BLOCKS | |
3589 | je .L_small_initial_num_blocks_is_11_${rndsuffix} | |
3590 | cmp \$10,$NUM_BLOCKS | |
3591 | je .L_small_initial_num_blocks_is_10_${rndsuffix} | |
3592 | jmp .L_small_initial_num_blocks_is_9_${rndsuffix} | |
3593 | ||
3594 | .L_small_initial_num_blocks_is_7_1_${rndsuffix}: | |
3595 | cmp \$4,$NUM_BLOCKS | |
3596 | je .L_small_initial_num_blocks_is_4_${rndsuffix} | |
3597 | jl .L_small_initial_num_blocks_is_3_1_${rndsuffix} | |
3598 | # ;; 7, 6 or 5 | |
3599 | cmp \$7,$NUM_BLOCKS | |
3600 | je .L_small_initial_num_blocks_is_7_${rndsuffix} | |
3601 | cmp \$6,$NUM_BLOCKS | |
3602 | je .L_small_initial_num_blocks_is_6_${rndsuffix} | |
3603 | jmp .L_small_initial_num_blocks_is_5_${rndsuffix} | |
3604 | ||
3605 | .L_small_initial_num_blocks_is_3_1_${rndsuffix}: | |
3606 | # ;; 3, 2 or 1 | |
3607 | cmp \$3,$NUM_BLOCKS | |
3608 | je .L_small_initial_num_blocks_is_3_${rndsuffix} | |
3609 | cmp \$2,$NUM_BLOCKS | |
3610 | je .L_small_initial_num_blocks_is_2_${rndsuffix} | |
3611 | ||
3612 | # ;; for $NUM_BLOCKS == 1, just fall through; no 'jmp' needed | |
3613 | ||
3614 | # ;; Generation of different block size variants | |
3615 | # ;; - one block size has to be the first one | |
3616 | ___ | |
3617 | ||
3618 | for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) { | |
3619 | $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${rndsuffix}:\n"; | |
3620 | &INITIAL_BLOCKS_PARTIAL( | |
3621 | $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET, | |
3622 | $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1, | |
3623 | $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, | |
3624 | $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, | |
3625 | $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN); | |
3626 | ||
3627 | if ($num_blocks != 16) { | |
3628 | $code .= "jmp .L_small_initial_blocks_encrypted_${rndsuffix}\n"; | |
3629 | } | |
3630 | } | |
3631 | ||
3632 | $code .= ".L_small_initial_blocks_encrypted_${rndsuffix}:\n"; | |
3633 | } | |
3634 | ||
3635 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3636 | # ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context | |
3637 | # ; struct has been initialized by GCM_INIT_IV | |
3638 | # ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. | |
3639 | # ; Clobbers rax, r10-r15, and zmm0-zmm31, k1 | |
3640 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
3641 | sub GCM_ENC_DEC { | |
3642 | my $AES_KEYS = $_[0]; # [in] AES Key schedule | |
3643 | my $GCM128_CTX = $_[1]; # [in] context pointer | |
3644 | my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update | |
3645 | my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer | |
3646 | my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length | |
3647 | my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer | |
3648 | my $ENC_DEC = $_[6]; # [in] cipher direction | |
3649 | ||
3650 | my $IA0 = "%r10"; | |
3651 | my $IA1 = "%r12"; | |
3652 | my $IA2 = "%r13"; | |
3653 | my $IA3 = "%r15"; | |
3654 | my $IA4 = "%r11"; | |
3655 | my $IA5 = "%rax"; | |
3656 | my $IA6 = "%rbx"; | |
3657 | my $IA7 = "%r14"; | |
3658 | ||
3659 | my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN; | |
3660 | ||
3661 | my $CTR_CHECK = $IA3; | |
3662 | my $DATA_OFFSET = $IA4; | |
3663 | my $HASHK_PTR = $IA6; | |
3664 | ||
3665 | my $HKEYS_READY = $IA7; | |
3666 | ||
3667 | my $CTR_BLOCKz = "%zmm2"; | |
3668 | my $CTR_BLOCKx = "%xmm2"; | |
3669 | ||
3670 | # ; hardcoded in GCM_INIT | |
3671 | ||
3672 | my $AAD_HASHz = "%zmm14"; | |
3673 | my $AAD_HASHx = "%xmm14"; | |
3674 | ||
3675 | # ; hardcoded in GCM_COMPLETE | |
3676 | ||
3677 | my $ZTMP0 = "%zmm0"; | |
3678 | my $ZTMP1 = "%zmm3"; | |
3679 | my $ZTMP2 = "%zmm4"; | |
3680 | my $ZTMP3 = "%zmm5"; | |
3681 | my $ZTMP4 = "%zmm6"; | |
3682 | my $ZTMP5 = "%zmm7"; | |
3683 | my $ZTMP6 = "%zmm10"; | |
3684 | my $ZTMP7 = "%zmm11"; | |
3685 | my $ZTMP8 = "%zmm12"; | |
3686 | my $ZTMP9 = "%zmm13"; | |
3687 | my $ZTMP10 = "%zmm15"; | |
3688 | my $ZTMP11 = "%zmm16"; | |
3689 | my $ZTMP12 = "%zmm17"; | |
3690 | ||
3691 | my $ZTMP13 = "%zmm19"; | |
3692 | my $ZTMP14 = "%zmm20"; | |
3693 | my $ZTMP15 = "%zmm21"; | |
3694 | my $ZTMP16 = "%zmm30"; | |
3695 | my $ZTMP17 = "%zmm31"; | |
3696 | my $ZTMP18 = "%zmm1"; | |
3697 | my $ZTMP19 = "%zmm18"; | |
3698 | my $ZTMP20 = "%zmm8"; | |
3699 | my $ZTMP21 = "%zmm22"; | |
3700 | my $ZTMP22 = "%zmm23"; | |
3701 | ||
3702 | my $GH = "%zmm24"; | |
3703 | my $GL = "%zmm25"; | |
3704 | my $GM = "%zmm26"; | |
3705 | my $SHUF_MASK = "%zmm29"; | |
3706 | ||
3707 | # ; Unused in the small packet path | |
3708 | my $ADDBE_4x4 = "%zmm27"; | |
3709 | my $ADDBE_1234 = "%zmm28"; | |
3710 | ||
3711 | my $MASKREG = "%k1"; | |
3712 | ||
3713 | my $rndsuffix = &random_string(); | |
3714 | ||
3715 | # ;; reduction every 48 blocks, depth 32 blocks | |
3716 | # ;; @note 48 blocks is the maximum capacity of the stack frame | |
3717 | my $big_loop_nblocks = 48; | |
3718 | my $big_loop_depth = 32; | |
3719 | ||
3720 | # ;;; Macro flow depending on packet size | |
3721 | # ;;; - LENGTH <= 16 blocks | |
3722 | # ;;; - cipher followed by hashing (reduction) | |
3723 | # ;;; - 16 blocks < LENGTH < 32 blocks | |
3724 | # ;;; - cipher 16 blocks | |
3725 | # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) | |
3726 | # ;;; - 32 blocks < LENGTH < 48 blocks | |
3727 | # ;;; - cipher 2 x 16 blocks | |
3728 | # ;;; - hash 16 blocks | |
3729 | # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) | |
3730 | # ;;; - LENGTH >= 48 blocks | |
3731 | # ;;; - cipher 2 x 16 blocks | |
3732 | # ;;; - while (data_to_cipher >= 48 blocks): | |
3733 | # ;;; - cipher 16 blocks & hash 16 blocks | |
3734 | # ;;; - cipher 16 blocks & hash 16 blocks | |
3735 | # ;;; - cipher 16 blocks & hash 16 blocks (reduction) | |
3736 | # ;;; - if (data_to_cipher >= 32 blocks): | |
3737 | # ;;; - cipher 16 blocks & hash 16 blocks | |
3738 | # ;;; - cipher 16 blocks & hash 16 blocks | |
3739 | # ;;; - hash 16 blocks (reduction) | |
3740 | # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) | |
3741 | # ;;; - elif (data_to_cipher >= 16 blocks): | |
3742 | # ;;; - cipher 16 blocks & hash 16 blocks | |
3743 | # ;;; - hash 16 blocks | |
3744 | # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) | |
3745 | # ;;; - else: | |
3746 | # ;;; - hash 16 blocks | |
3747 | # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) | |
3748 | ||
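| # ;;; Worked example for a 100-block message: 32 blocks are ciphered | |
| # ;;; up front to fill the pipeline, one pass of the 48-block loop | |
| # ;;; ciphers and hashes, and the remaining 20 blocks take the | |
| # ;;; '>= 16 blocks' tail: 16 more ciphered and hashed, then the | |
| # ;;; last 4 ciphered with the outstanding hashing and the final | |
| # ;;; reduction (32 + 48 + 16 + 4 = 100). | |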
3749 | if ($win64) { | |
3750 | $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n"; | |
3751 | } else { | |
3752 | $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n"; | |
3753 | } | |
3754 | $code .= "je .L_enc_dec_done_${rndsuffix}\n"; | |
3755 | ||
3756 | # Length value from context $CTX_OFFSET_InLen`($GCM128_CTX) is updated in | |
3757 | # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc' | |
3758 | ||
3759 | $code .= "xor $HKEYS_READY, $HKEYS_READY\n"; | |
3760 | $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n"; | |
3761 | ||
3762 | # ;; Used for the update flow - if there was a previous partial | |
3763 | # ;; block fill the remaining bytes here. | |
3764 | &PARTIAL_BLOCK( | |
3765 | $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, | |
3766 | $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1, | |
3767 | $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, | |
3768 | $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG); | |
3769 | ||
3770 | $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n"; | |
3771 | ||
3772 | # ;; Save the amount of data left to process in $LENGTH | |
3773 | # ;; NOTE: PLAIN_CIPH_LEN is already a register on Linux, so no copy is needed there | |
3774 | if ($win64) { | |
3775 | $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n"; | |
3776 | } | |
3777 | ||
3778 | # ;; There may be no more data if it was consumed in the partial block. | |
3779 | $code .= <<___; | |
3780 | sub $DATA_OFFSET,$LENGTH | |
3781 | je .L_enc_dec_done_${rndsuffix} | |
3782 | ___ | |
3783 | ||
3784 | $code .= <<___; | |
3785 | cmp \$`(16 * 16)`,$LENGTH | |
3786 | jbe .L_message_below_equal_16_blocks_${rndsuffix} | |
3787 | ||
3788 | vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK | |
3789 | vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4 | |
3790 | vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234 | |
3791 | ||
3792 | # ;; start the pipeline | |
3793 | # ;; - 32 blocks aes-ctr | |
3794 | # ;; - 16 blocks ghash + aes-ctr | |
3795 | ||
3796 | # ;; set up CTR_CHECK | |
3797 | vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]} | |
3798 | and \$255,@{[DWORD($CTR_CHECK)]} | |
3799 | # ;; in LE format after init, convert to BE | |
3800 | vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz | |
3801 | vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz | |
3802 | ___ | |
3803 | ||
3804 | # ;; ==== AES-CTR - first 16 blocks | |
3805 | my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); | |
3806 | my $data_in_out_offset = 0; | |
3807 | &INITIAL_BLOCKS_16( | |
3808 | $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz, | |
3809 | $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2, | |
3810 | $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, | |
3811 | $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0); | |

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    "first16");

  $code .= <<___;
        cmp \$`(32 * 16)`,$LENGTH
        jb .L_message_below_32_blocks_${rndsuffix}
___

  # ;; ==== AES-CTR - next 16 blocks
  $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (16 * 16);
  &INITIAL_BLOCKS_16(
    $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
    $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
    $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
    $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    "last32");
  $code .= "mov \$1,$HKEYS_READY\n";

  $code .= <<___;
        add \$`(32 * 16)`,$DATA_OFFSET
        sub \$`(32 * 16)`,$LENGTH

        cmp \$`($big_loop_nblocks * 16)`,$LENGTH
        jb .L_no_more_big_nblocks_${rndsuffix}
___

  # ;; ====
  # ;; ==== AES-CTR + GHASH - 48 blocks loop
  # ;; ====
  $code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  $data_in_out_offset = (0 * 16);
  my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
    48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
    $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
    $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
    $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
    $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
    $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (16 * 16);
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
    32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
    $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
    $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
    $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
    $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
    $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
  $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (32 * 16);
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
    16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
    $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
    $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
    $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
    $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
    $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; === xor cipher block 0 with GHASH (ZT4)
  $code .= <<___;
        vmovdqa64 $ZTMP4,$AAD_HASHz

        add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
        sub \$`($big_loop_nblocks * 16)`,$LENGTH
        cmp \$`($big_loop_nblocks * 16)`,$LENGTH
        jae .L_encrypt_big_nblocks_${rndsuffix}

.L_no_more_big_nblocks_${rndsuffix}:

        cmp \$`(32 * 16)`,$LENGTH
        jae .L_encrypt_32_blocks_${rndsuffix}

        cmp \$`(16 * 16)`,$LENGTH
        jae .L_encrypt_16_blocks_${rndsuffix}
___

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ==== then GHASH N blocks
  $code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n";

  # ;; calculate offset to the right hash key
  $code .= <<___;
        mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
        and \$~15,@{[DWORD($IA0)]}
        mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
        sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
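
  # ;; Note: ($LENGTH & ~15) is the byte count of the remaining full blocks;
  # ;; backing the key pointer off the hash key #32 slot by that amount makes
  # ;; the GHASH of the buffered blocks start at a key power that also covers
  # ;; the blocks still to be hashed afterwards.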

  # ;; ==== GHASH 32 blocks and follow with reduction
  &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
    "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
  &GCM_ENC_DEC_LAST(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
    $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
    $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
    $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
    "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
    $IA0, $IA5, $MASKREG, $PBLOCK_LEN);

  $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= "jmp .L_ghash_done_${rndsuffix}\n";

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH & encrypt 1 x 16 blocks
  # ;; ==== GHASH & encrypt 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction)
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ==== then GHASH N blocks
  $code .= ".L_encrypt_32_blocks_${rndsuffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (0 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
    48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
    $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
    $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
    $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
    $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
    $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (16 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
    32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
    $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
    $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
    $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
    $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
    $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; ==== GHASH 16 blocks with reduction
  &GHASH_16(
    "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
    "%rsp", &HashKeyOffsetByIdx(16, "frame"),
    0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $code .= <<___;
        sub \$`(32 * 16)`,$LENGTH
        add \$`(32 * 16)`,$DATA_OFFSET
___

  # ;; calculate offset to the right hash key
  $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  $code .= <<___;
        and \$~15,@{[DWORD($IA0)]}
        mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
        sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
  &GCM_ENC_DEC_LAST(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
    $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
    $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
    $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
    "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
    $IA0, $IA5, $MASKREG, $PBLOCK_LEN);

  $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= "jmp .L_ghash_done_${rndsuffix}\n";

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH & encrypt 16 blocks (done before)
  # ;; ==== GHASH 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ==== then GHASH N blocks
  $code .= ".L_encrypt_16_blocks_${rndsuffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (0 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
    48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
    $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
    $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
    $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
    $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
    $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== GHASH 1 x 16 blocks
  &GHASH_16(
    "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
    "%rsp", &HashKeyOffsetByIdx(32, "frame"),
    0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  $code .= <<___;
        sub \$`(16 * 16)`,$LENGTH
        add \$`(16 * 16)`,$DATA_OFFSET
___
  &GCM_ENC_DEC_LAST(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
    $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
    &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
    $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
    $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
    $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
    $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
    $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
    $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
    "end_reduce", $GL, $GH, $GM,
    $ENC_DEC, $AAD_HASHz, $IA0, $IA5,
    $MASKREG, $PBLOCK_LEN);

  $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= <<___;
        jmp .L_ghash_done_${rndsuffix}

.L_message_below_32_blocks_${rndsuffix}:
        # ;; 32 > number of blocks > 16

        sub \$`(16 * 16)`,$LENGTH
        add \$`(16 * 16)`,$DATA_OFFSET
___
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));

  # ;; calculate offset to the right hash key
  $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    "mid16");
  $code .= "mov \$1,$HKEYS_READY\n";

  $code .= <<___;
        and \$~15,@{[DWORD($IA0)]}
        mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
        sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___

  &GCM_ENC_DEC_LAST(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
    $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
    $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
    $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
    "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
    $IA0, $IA5, $MASKREG, $PBLOCK_LEN);

  $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= <<___;
        jmp .L_ghash_done_${rndsuffix}

.L_message_below_equal_16_blocks_${rndsuffix}:
        # ;; Determine how many blocks to process
        # ;; - process one additional block if there is a partial block
        mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
        add \$15,@{[DWORD($IA1)]}
        shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 0 to 16
___
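
  # ;; Note: (len + 15) >> 4 is ceil(len / 16), i.e. the block count rounded
  # ;; up so that a trailing partial block is counted as one more block.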
  &GCM_ENC_DEC_SMALL(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
    $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
    $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
    $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
    $PBLOCK_LEN);

  # ;; fall through to exit

  $code .= ".L_ghash_done_${rndsuffix}:\n";

  # ;; save the last counter block
  $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
  $code .= <<___;
        vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
.L_enc_dec_done_${rndsuffix}:
___
}

# ;;; ===========================================================================
# ;;; Encrypt/decrypt the initial 16 blocks
sub INITIAL_BLOCKS_16 {
  my $IN = $_[0];           # [in] input buffer
  my $OUT = $_[1];          # [in] output buffer
  my $AES_KEYS = $_[2];     # [in] pointer to expanded keys
  my $DATA_OFFSET = $_[3];  # [in] data offset
  my $GHASH = $_[4];        # [in] ZMM with AAD (low 128 bits)
  my $CTR = $_[5];          # [in] ZMM with CTR BE blocks 4x128 bits
  my $CTR_CHECK = $_[6];    # [in/out] GPR with counter overflow check
  my $ADDBE_4x4 = $_[7];    # [in] ZMM 4x128bits with value 4 (big endian)
  my $ADDBE_1234 = $_[8];   # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
  my $T0 = $_[9];           # [clobbered] temporary ZMM register
  my $T1 = $_[10];          # [clobbered] temporary ZMM register
  my $T2 = $_[11];          # [clobbered] temporary ZMM register
  my $T3 = $_[12];          # [clobbered] temporary ZMM register
  my $T4 = $_[13];          # [clobbered] temporary ZMM register
  my $T5 = $_[14];          # [clobbered] temporary ZMM register
  my $T6 = $_[15];          # [clobbered] temporary ZMM register
  my $T7 = $_[16];          # [clobbered] temporary ZMM register
  my $T8 = $_[17];          # [clobbered] temporary ZMM register
  my $SHUF_MASK = $_[18];   # [in] ZMM with BE/LE shuffle mask
  my $ENC_DEC = $_[19];     # [in] ENC (encrypt) or DEC (decrypt) selector
  my $BLK_OFFSET = $_[20];  # [in] stack frame offset to ciphered blocks
  my $DATA_DISPL = $_[21];  # [in] fixed numerical data displacement/offset
  my $IA0 = $_[22];         # [clobbered] temporary GP register

  my $B00_03 = $T5;
  my $B04_07 = $T6;
  my $B08_11 = $T7;
  my $B12_15 = $T8;

  my $rndsuffix = &random_string();

  my $stack_offset = $BLK_OFFSET;
  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; prepare counter blocks

        cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
        jae .L_next_16_overflow_${rndsuffix}
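        # ;; the low counter byte cannot wrap within the next 16 blocks,
        # ;; so the counters can be incremented directly in big-endian form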
        vpaddd $ADDBE_1234,$CTR,$B00_03
        vpaddd $ADDBE_4x4,$B00_03,$B04_07
        vpaddd $ADDBE_4x4,$B04_07,$B08_11
        vpaddd $ADDBE_4x4,$B08_11,$B12_15
        jmp .L_next_16_ok_${rndsuffix}
.L_next_16_overflow_${rndsuffix}:
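        # ;; the low byte may wrap: shuffle to little-endian, do a full
        # ;; 32-bit add, then shuffle the blocks back to big-endian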
        vpshufb $SHUF_MASK,$CTR,$CTR
        vmovdqa64 ddq_add_4444(%rip),$B12_15
        vpaddd ddq_add_1234(%rip),$CTR,$B00_03
        vpaddd $B12_15,$B00_03,$B04_07
        vpaddd $B12_15,$B04_07,$B08_11
        vpaddd $B12_15,$B08_11,$B12_15
        vpshufb $SHUF_MASK,$B00_03,$B00_03
        vpshufb $SHUF_MASK,$B04_07,$B04_07
        vpshufb $SHUF_MASK,$B08_11,$B08_11
        vpshufb $SHUF_MASK,$B12_15,$B12_15
.L_next_16_ok_${rndsuffix}:
        vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR
        addb \$16,@{[BYTE($CTR_CHECK)]}
        # ;; === load 16 blocks of data
        vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0
        vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1
        vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2
        vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3

        # ;; move to AES encryption rounds
        vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4
        vpxorq $T4,$B00_03,$B00_03
        vpxorq $T4,$B04_07,$B04_07
        vpxorq $T4,$B08_11,$B08_11
        vpxorq $T4,$B12_15,$B12_15
___
  foreach (1 .. ($NROUNDS)) {
    $code .= <<___;
        vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4
        vaesenc $T4,$B00_03,$B00_03
        vaesenc $T4,$B04_07,$B04_07
        vaesenc $T4,$B08_11,$B08_11
        vaesenc $T4,$B12_15,$B12_15
___
  }
  $code .= <<___;
        vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4
        vaesenclast $T4,$B00_03,$B00_03
        vaesenclast $T4,$B04_07,$B04_07
        vaesenclast $T4,$B08_11,$B08_11
        vaesenclast $T4,$B12_15,$B12_15

        # ;; xor against text
        vpxorq $T0,$B00_03,$B00_03
        vpxorq $T1,$B04_07,$B04_07
        vpxorq $T2,$B08_11,$B08_11
        vpxorq $T3,$B12_15,$B12_15

        # ;; store
        mov $OUT, $IA0
        vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1)
        vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1)
        vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1)
        vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1)
___
  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        # ;; decryption - cipher text needs to go to GHASH phase
        vpshufb $SHUF_MASK,$T0,$B00_03
        vpshufb $SHUF_MASK,$T1,$B04_07
        vpshufb $SHUF_MASK,$T2,$B08_11
        vpshufb $SHUF_MASK,$T3,$B12_15
___
  } else {
    $code .= <<___;
        # ;; encryption
        vpshufb $SHUF_MASK,$B00_03,$B00_03
        vpshufb $SHUF_MASK,$B04_07,$B04_07
        vpshufb $SHUF_MASK,$B08_11,$B08_11
        vpshufb $SHUF_MASK,$B12_15,$B12_15
___
  }

  if ($GHASH ne "no_ghash") {
    $code .= <<___;
        # ;; === xor cipher block 0 with GHASH for the next GHASH round
        vpxorq $GHASH,$B00_03,$B00_03
___
  }
  $code .= <<___;
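        # ;; stash the byte-reflected ciphertext blocks on the stack frame;
        # ;; they are picked up later as the GHASH input for these 16 blocks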
        vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp)
        vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp)
        vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp)
        vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp)
___
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ; GCM_COMPLETE Finishes ghash calculation
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GCM_COMPLETE {
  my $GCM128_CTX = $_[0];
  my $PBLOCK_LEN = $_[1];

  my $rndsuffix = &random_string();

  $code .= <<___;
        vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2
        vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0)
___

  $code .= <<___;
        vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4

        # ;; Process the final partial block.
        cmp \$0,$PBLOCK_LEN
        je .L_partial_done_${rndsuffix}
___

  # ;GHASH computation for the last <16 Byte block
  &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");

  $code .= <<___;
.L_partial_done_${rndsuffix}:
        vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5
        vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C)
        vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits
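        # ;; xmm5 now holds the GCM length block (bit lengths of the AAD and
        # ;; the ciphertext); it is the final block mixed into GHASH below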

        vpxor %xmm5,%xmm4,%xmm4
___

  &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");

  $code .= <<___;
        vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16Byte swap
        vpxor %xmm4,%xmm3,%xmm3

.L_return_T_${rndsuffix}:
        vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX)
___
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Functions definitions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

$code .= ".text\n";
{
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_aes_gcm_init_avx512
  # ; (const void *aes_keys,
  # ;  void *gcm128ctx)
  # ;
  # ; Precomputes hashkey table for GHASH optimization.
  # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_aes_gcm_init_avx512
.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_init_avx512:
.cfi_startproc
        endbranch
___
  if ($CHECK_FUNCTION_ARGUMENTS) {
    $code .= <<___;
        # ;; Check aes_keys != NULL
        test $arg1,$arg1
        jz .Labort_init

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Labort_init
___
  }
  $code .= "vpxorq %xmm16,%xmm16,%xmm16\n";
  &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey
  $code .= <<___;
        vpshufb SHUF_MASK(%rip),%xmm16,%xmm16
        # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
        vmovdqa64 %xmm16,%xmm2
        vpsllq \$1,%xmm16,%xmm16
        vpsrlq \$63,%xmm2,%xmm2
        vmovdqa %xmm2,%xmm1
        vpslldq \$8,%xmm2,%xmm2
        vpsrldq \$8,%xmm1,%xmm1
        vporq %xmm2,%xmm16,%xmm16
        # ;reduction
        vpshufd \$0b00100100,%xmm1,%xmm2
        vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
        vpand POLY(%rip),%xmm2,%xmm2
        vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly
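        # ;; the TWOONE/POLY compare-and-xor above adds the reduction
        # ;; polynomial back in only when the bit shifted out of the high
        # ;; quadword of H was set, completing HashKey<<1 mod poly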
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
___
  &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }
  $code .= <<___;
.Labort_init:
        ret
.cfi_endproc
.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
___
}

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_setiv_avx512
# ; (const void *aes_keys,
# ;  void *gcm128ctx,
# ;  const unsigned char *iv,
# ;  size_t ivlen)
# ;
# ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_setiv_avx512
.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_setiv_avx512:
.cfi_startproc
.Lsetiv_seh_begin:
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
  $code .= <<___;
        # ;; Check aes_keys != NULL
        test $arg1,$arg1
        jz .Labort_setiv

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Labort_setiv

        # ;; Check iv != NULL
        test $arg3,$arg3
        jz .Labort_setiv

        # ;; Check ivlen != 0
        test $arg4,$arg4
        jz .Labort_setiv
___
}

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
  1,    # allocate stack space for hkeys
  0,    # do not allocate stack space for AES blocks
  "setiv");
&GCM_INIT_IV(
  "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1",
  "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12",
  "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
&EPILOG(
  1,    # hkeys were allocated
  $arg4);
$code .= <<___;
.Labort_setiv:
        ret
.Lsetiv_seh_end:
.cfi_endproc
.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_update_aad_avx512
# ; (unsigned char *gcm128ctx,
# ;  const unsigned char *aad,
# ;  size_t aadlen)
# ;
# ; Updates AAD hash in gcm128_context structure.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_update_aad_avx512
.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_update_aad_avx512:
.cfi_startproc
.Lghash_seh_begin:
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
  $code .= <<___;
        # ;; Check gcm128ctx != NULL
        test $arg1,$arg1
        jz .Lexit_update_aad

        # ;; Check aad != NULL
        test $arg2,$arg2
        jz .Lexit_update_aad

        # ;; Check aadlen != 0
        test $arg3,$arg3
        jz .Lexit_update_aad
___
}

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
  1,    # allocate stack space for hkeys
  0,    # do not allocate stack space for AES blocks
  "ghash");
&GCM_UPDATE_AAD(
  "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11",
  "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13",
  "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
&EPILOG(
  1,    # hkeys were allocated
  $arg3);
$code .= <<___;
.Lexit_update_aad:
        ret
.Lghash_seh_end:
.cfi_endproc
.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_encrypt_avx512
# ; (const void* aes_keys,
# ;  void *gcm128ctx,
# ;  unsigned int *pblocklen,
# ;  const unsigned char *in,
# ;  size_t len,
# ;  unsigned char *out);
# ;
# ; Performs encryption of data |in| of len |len|, and stores the output in |out|.
# ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_encrypt_avx512
.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_encrypt_avx512:
.cfi_startproc
.Lencrypt_seh_begin:
        endbranch
___

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
  1,    # allocate stack space for hkeys
  1,    # allocate stack space for AES blocks
  "encrypt");
if ($CHECK_FUNCTION_ARGUMENTS) {
  $code .= <<___;
        # ;; Check aes_keys != NULL
        test $arg1,$arg1
        jz .Lexit_gcm_encrypt

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Lexit_gcm_encrypt

        # ;; Check pblocklen != NULL
        test $arg3,$arg3
        jz .Lexit_gcm_encrypt

        # ;; Check in != NULL
        test $arg4,$arg4
        jz .Lexit_gcm_encrypt

        # ;; Check if len != 0
        cmp \$0,$arg5
        jz .Lexit_gcm_encrypt

        # ;; Check out != NULL
        cmp \$0,$arg6
        jz .Lexit_gcm_encrypt
___
}
$code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov `4*15*4`($arg1),%eax
        cmp \$9,%eax
        je .Laes_gcm_encrypt_128_avx512
        cmp \$11,%eax
        je .Laes_gcm_encrypt_192_avx512
        cmp \$13,%eax
        je .Laes_gcm_encrypt_256_avx512
        xor %eax,%eax
        jmp .Lexit_gcm_encrypt
___
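
# ; Note: 9, 11 and 13 full AES rounds (plus the final round) correspond to
# ; 128-, 192- and 256-bit keys respectively, selecting the matching path.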
for my $keylen (sort keys %aes_rounds) {
  $NROUNDS = $aes_rounds{$keylen};
  $code .= <<___;
.align 32
.Laes_gcm_encrypt_${keylen}_avx512:
___
  &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
  $code .= "jmp .Lexit_gcm_encrypt\n";
}
$code .= ".Lexit_gcm_encrypt:\n";
&EPILOG(1, $arg5);
$code .= <<___;
        ret
.Lencrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_decrypt_avx512
# ; (const void* keys,
# ;  void *gcm128ctx,
# ;  unsigned int *pblocklen,
# ;  const unsigned char *in,
# ;  size_t len,
# ;  unsigned char *out);
# ;
# ; Performs decryption of data |in| of len |len|, and stores the output in |out|.
# ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_decrypt_avx512
.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_decrypt_avx512:
.cfi_startproc
.Ldecrypt_seh_begin:
        endbranch
___

# ; NOTE: code before PROLOG() must not modify any registers
&PROLOG(
  1,    # allocate stack space for hkeys
  1,    # allocate stack space for AES blocks
  "decrypt");
if ($CHECK_FUNCTION_ARGUMENTS) {
  $code .= <<___;
        # ;; Check keys != NULL
        test $arg1,$arg1
        jz .Lexit_gcm_decrypt

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Lexit_gcm_decrypt

        # ;; Check pblocklen != NULL
        test $arg3,$arg3
        jz .Lexit_gcm_decrypt

        # ;; Check in != NULL
        test $arg4,$arg4
        jz .Lexit_gcm_decrypt

        # ;; Check if len != 0
        cmp \$0,$arg5
        jz .Lexit_gcm_decrypt

        # ;; Check out != NULL
        cmp \$0,$arg6
        jz .Lexit_gcm_decrypt
___
}
$code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov `4*15*4`($arg1),%eax
        cmp \$9,%eax
        je .Laes_gcm_decrypt_128_avx512
        cmp \$11,%eax
        je .Laes_gcm_decrypt_192_avx512
        cmp \$13,%eax
        je .Laes_gcm_decrypt_256_avx512
        xor %eax,%eax
        jmp .Lexit_gcm_decrypt
___
for my $keylen (sort keys %aes_rounds) {
  $NROUNDS = $aes_rounds{$keylen};
  $code .= <<___;
.align 32
.Laes_gcm_decrypt_${keylen}_avx512:
___
  &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
  $code .= "jmp .Lexit_gcm_decrypt\n";
}
$code .= ".Lexit_gcm_decrypt:\n";
&EPILOG(1, $arg5);
$code .= <<___;
        ret
.Ldecrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_aes_gcm_finalize_avx512
# ; (void *gcm128ctx,
# ;  unsigned int pblocklen);
# ;
# ; Finalizes encryption / decryption
# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_aes_gcm_finalize_avx512
.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_finalize_avx512:
.cfi_startproc
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
  $code .= <<___;
        # ;; Check gcm128ctx != NULL
        test $arg1,$arg1
        jz .Labort_finalize
___
}

&GCM_COMPLETE("$arg1", "$arg2");

$code .= <<___;
.Labort_finalize:
        ret
.cfi_endproc
.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
___

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void ossl_gcm_gmult_avx512(u64 Xi[2],
# ;                           const void* gcm128ctx)
# ;
# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
$code .= <<___;
.globl ossl_gcm_gmult_avx512
.hidden ossl_gcm_gmult_avx512
.type ossl_gcm_gmult_avx512,\@abi-omnipotent
.align 32
ossl_gcm_gmult_avx512:
.cfi_startproc
        endbranch
___
if ($CHECK_FUNCTION_ARGUMENTS) {
  $code .= <<___;
        # ;; Check Xi != NULL
        test $arg1,$arg1
        jz .Labort_gmult

        # ;; Check gcm128ctx != NULL
        test $arg2,$arg2
        jz .Labort_gmult
___
}
$code .= "vmovdqu64 ($arg1),%xmm1\n";
$code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";

&GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");

$code .= "vmovdqu64 %xmm1,($arg1)\n";
if ($CLEAR_SCRATCH_REGISTERS) {
  &clear_scratch_gps_asm();
  &clear_scratch_zmms_asm();
} else {
  $code .= "vzeroupper\n";
}
$code .= <<___;
.Labort_gmult:
        ret
.cfi_endproc
.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
___

if ($win64) {

  # Add unwind metadata for SEH.

  # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
  my $UWOP_PUSH_NONVOL = 0;
  my $UWOP_ALLOC_LARGE = 1;
  my $UWOP_SET_FPREG = 3;
  my $UWOP_SAVE_XMM128 = 8;
  my %UWOP_REG_NUMBER = (
    rax => 0,
    rcx => 1,
    rdx => 2,
    rbx => 3,
    rsp => 4,
    rbp => 5,
    rsi => 6,
    rdi => 7,
    map(("r$_" => $_), (8 .. 15)));

  $code .= <<___;
.section .pdata
.align 4
        .rva .Lsetiv_seh_begin
        .rva .Lsetiv_seh_end
        .rva .Lsetiv_seh_info

        .rva .Lghash_seh_begin
        .rva .Lghash_seh_end
        .rva .Lghash_seh_info

        .rva .Lencrypt_seh_begin
        .rva .Lencrypt_seh_end
        .rva .Lencrypt_seh_info

        .rva .Ldecrypt_seh_begin
        .rva .Ldecrypt_seh_end
        .rva .Ldecrypt_seh_info

.section .xdata
___

  foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") {
    $code .= <<___;
.align 8
.L${func_name}_seh_info:
        .byte 1 # version 1, no flags
        .byte .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin
        .byte 31 # num_slots = 1*8 + 2 + 1 + 2*10
        # FR = rbp; Offset from RSP = $XMM_STORAGE scaled on 16
        .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]}
___

    # Metadata for %xmm15-%xmm6
    # Occupy 2 slots each
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {

      # Scaled-by-16 stack offset
      my $xmm_reg_offset = ($reg_idx - 6);
      $code .= <<___;
        .byte .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin
        .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]}
        .value $xmm_reg_offset
___
    }

    $code .= <<___;
        # Frame pointer (occupy 1 slot)
        .byte .L${func_name}_seh_setfp-.L${func_name}_seh_begin
        .byte $UWOP_SET_FPREG

        # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
        .byte .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin
        .byte $UWOP_ALLOC_LARGE
        .value `($XMM_STORAGE + 8) / 8`
___

    # Metadata for GPR regs
    # Occupy 1 slot each
    foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") {
      $code .= <<___;
        .byte .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin
        .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]}
___
    }
  }
}

$code .= <<___;
.data
.align 16
POLY:   .quad 0x0000000000000001, 0xC200000000000000

.align 64
POLY2:
        .quad 0x00000001C2000000, 0xC200000000000000
        .quad 0x00000001C2000000, 0xC200000000000000
        .quad 0x00000001C2000000, 0xC200000000000000
        .quad 0x00000001C2000000, 0xC200000000000000

.align 16
TWOONE: .quad 0x0000000000000001, 0x0000000100000000

# ;;; Order of these constants should not change.
# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
.align 64
SHUF_MASK:
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
        .quad 0x08090A0B0C0D0E0F, 0x0001020304050607

.align 16
SHIFT_MASK:
        .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908

ALL_F:
        .quad 0xffffffffffffffff, 0xffffffffffffffff

ZERO:
        .quad 0x0000000000000000, 0x0000000000000000

.align 16
ONE:
        .quad 0x0000000000000001, 0x0000000000000000

.align 16
ONEf:
        .quad 0x0000000000000000, 0x0100000000000000

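# ;; counter increment constants: the ddq_add_* tables below add 1..4,
# ;; 5..8, 4 or 8 to counters kept in little-endian form; the ddq_addbe_*
# ;; tables hold the same increments byte-reflected for big-endian counters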
.align 64
ddq_add_1234:
        .quad 0x0000000000000001, 0x0000000000000000
        .quad 0x0000000000000002, 0x0000000000000000
        .quad 0x0000000000000003, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000

.align 64
ddq_add_5678:
        .quad 0x0000000000000005, 0x0000000000000000
        .quad 0x0000000000000006, 0x0000000000000000
        .quad 0x0000000000000007, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000

.align 64
ddq_add_4444:
        .quad 0x0000000000000004, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000
        .quad 0x0000000000000004, 0x0000000000000000

.align 64
ddq_add_8888:
        .quad 0x0000000000000008, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000
        .quad 0x0000000000000008, 0x0000000000000000

.align 64
ddq_addbe_1234:
        .quad 0x0000000000000000, 0x0100000000000000
        .quad 0x0000000000000000, 0x0200000000000000
        .quad 0x0000000000000000, 0x0300000000000000
        .quad 0x0000000000000000, 0x0400000000000000

.align 64
ddq_addbe_4444:
        .quad 0x0000000000000000, 0x0400000000000000
        .quad 0x0000000000000000, 0x0400000000000000
        .quad 0x0000000000000000, 0x0400000000000000
        .quad 0x0000000000000000, 0x0400000000000000

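# ;; entry n of the two tables below is the mask (1 << n) - 1; the values
# ;; are loaded into k-registers for masked loads/stores of partial blocks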
.align 64
byte_len_to_mask_table:
        .value 0x0000, 0x0001, 0x0003, 0x0007
        .value 0x000f, 0x001f, 0x003f, 0x007f
        .value 0x00ff, 0x01ff, 0x03ff, 0x07ff
        .value 0x0fff, 0x1fff, 0x3fff, 0x7fff
        .value 0xffff

.align 64
byte64_len_to_mask_table:
        .quad 0x0000000000000000, 0x0000000000000001
        .quad 0x0000000000000003, 0x0000000000000007
        .quad 0x000000000000000f, 0x000000000000001f
        .quad 0x000000000000003f, 0x000000000000007f
        .quad 0x00000000000000ff, 0x00000000000001ff
        .quad 0x00000000000003ff, 0x00000000000007ff
        .quad 0x0000000000000fff, 0x0000000000001fff
        .quad 0x0000000000003fff, 0x0000000000007fff
        .quad 0x000000000000ffff, 0x000000000001ffff
        .quad 0x000000000003ffff, 0x000000000007ffff
        .quad 0x00000000000fffff, 0x00000000001fffff
        .quad 0x00000000003fffff, 0x00000000007fffff
        .quad 0x0000000000ffffff, 0x0000000001ffffff
        .quad 0x0000000003ffffff, 0x0000000007ffffff
        .quad 0x000000000fffffff, 0x000000001fffffff
        .quad 0x000000003fffffff, 0x000000007fffffff
        .quad 0x00000000ffffffff, 0x00000001ffffffff
        .quad 0x00000003ffffffff, 0x00000007ffffffff
        .quad 0x0000000fffffffff, 0x0000001fffffffff
        .quad 0x0000003fffffffff, 0x0000007fffffffff
        .quad 0x000000ffffffffff, 0x000001ffffffffff
        .quad 0x000003ffffffffff, 0x000007ffffffffff
        .quad 0x00000fffffffffff, 0x00001fffffffffff
        .quad 0x00003fffffffffff, 0x00007fffffffffff
        .quad 0x0000ffffffffffff, 0x0001ffffffffffff
        .quad 0x0003ffffffffffff, 0x0007ffffffffffff
        .quad 0x000fffffffffffff, 0x001fffffffffffff
        .quad 0x003fffffffffffff, 0x007fffffffffffff
        .quad 0x00ffffffffffffff, 0x01ffffffffffffff
        .quad 0x03ffffffffffffff, 0x07ffffffffffffff
        .quad 0x0fffffffffffffff, 0x1fffffffffffffff
        .quad 0x3fffffffffffffff, 0x7fffffffffffffff
        .quad 0xffffffffffffffff
___

} else {
# Fallback for old assembler
$code .= <<___;
.text
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
ossl_vaes_vpclmulqdq_capable:
        xor %eax,%eax
        ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable

.globl ossl_aes_gcm_init_avx512
.globl ossl_aes_gcm_setiv_avx512
.globl ossl_aes_gcm_update_aad_avx512
.globl ossl_aes_gcm_encrypt_avx512
.globl ossl_aes_gcm_decrypt_avx512
.globl ossl_aes_gcm_finalize_avx512
.globl ossl_gcm_gmult_avx512

.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
ossl_aes_gcm_init_avx512:
ossl_aes_gcm_setiv_avx512:
ossl_aes_gcm_update_aad_avx512:
ossl_aes_gcm_encrypt_avx512:
ossl_aes_gcm_decrypt_avx512:
ossl_aes_gcm_finalize_avx512:
ossl_gcm_gmult_avx512:
        .byte 0x0f,0x0b # ud2
        ret
.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";