]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/modes/asm/aes-gcm-armv8_64.pl
Update copyright year
[thirdparty/openssl.git] / crypto / modes / asm / aes-gcm-armv8_64.pl
CommitLineData
31b59078 1#! /usr/bin/env perl
33388b44 2# Copyright 2019-2020 The OpenSSL Project Authors. All Rights Reserved.
31b59078
FF
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16#========================================================================
17#
18# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
19#
20# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
21#
22# ____________________________________________________
23# | |
24# | PRE |
25# |____________________________________________________|
26# | | | |
27# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28# |________________|________________|__________________|
29# | | | |
30# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31# |________________|________________|__________________|
32# | | | |
33# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34# |________________|________________|__________________|
35# | | | |
36# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37# |________________|____(mostly)____|__________________|
38# | |
39# | MODULO |
40# |____________________________________________________|
41#
42# PRE:
43# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
44# EXT low_acc, low_acc, low_acc, #8
45# EOR res_curr (4k+0), res_curr (4k+0), low_acc
46#
47# CTR block:
48# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49# REV ctr32, rev_ctr32
50# ORR ctr64, constctr96_top32, ctr32, LSL #32
51# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
52# INS ctr_next.d[1], ctr64X
53# ADD rev_ctr32, #1
54#
55# AES block:
56# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
57# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
58# Given we are very constrained in our ASIMD registers this is quite important
59#
60# Encrypt:
61# LDR input_low, [ input_ptr ], #8
62# LDR input_high, [ input_ptr ], #8
63# EOR input_low, k14_low
64# EOR input_high, k14_high
65# INS res_curr.d[0], input_low
66# INS res_curr.d[1], input_high
67# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
68# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
69# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
70# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
71# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
72# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
73# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
74# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
75# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
76# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
77# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
78# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
79# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
80# AESE ctr_curr, k13
81# EOR res_curr, res_curr, ctr_curr
82# ST1 { res_curr.16b }, [ output_ptr ], #16
83#
84# Decrypt:
85# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
86# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
87# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
88# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
89# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
90# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
91# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
92# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
93# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
94# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
95# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
96# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
97# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
98# AESE ctr_curr, k13
99# LDR res_curr, [ input_ptr ], #16
100# EOR res_curr, res_curr, ctr_curr
101# MOV output_low, res_curr.d[0]
102# MOV output_high, res_curr.d[1]
103# EOR output_low, k14_low
104# EOR output_high, k14_high
105# STP output_low, output_high, [ output_ptr ], #16
106#
107# GHASH block X:
108# do 128b karatsuba polynomial multiplication on block
109# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
110#
111# multiplication:
112# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
113#
114# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
116#
117# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118# multiplying with "twisted" powers of H
119#
120# Note: We can PMULL directly into the acc_x in first GHASH of the loop
121# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
122# path latency dominates the performance
123#
124# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
125# than indicated here
126# REV64 res_curr, res_curr
127# INS t_m.d[0], res_curr.d[1]
128# EOR t_m.8B, t_m.8B, res_curr.8B
129# PMULL2 t_h, res_curr, HX
130# PMULL t_l, res_curr, HX
131# PMULL t_m, t_m, HX_k
132# EOR acc_h, acc_h, t_h
133# EOR acc_l, acc_l, t_l
134# EOR acc_m, acc_m, t_m
135#
136# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
137# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
138# with a reversed constant
139# EOR acc_m, acc_m, acc_h
140# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
141# PMULL t_mod, acc_h, mod_constant
142# EXT acc_h, acc_h, acc_h, #8
143# EOR acc_m, acc_m, acc_h
144# EOR acc_m, acc_m, t_mod
145# PMULL acc_h, acc_m, mod_constant
146# EXT acc_m, acc_m, acc_m, #8
147# EOR acc_l, acc_l, acc_h
148# EOR acc_l, acc_l, acc_m
149
# Command-line handling: the last argument is taken as the output file when it
# ends in a ".ext"-style suffix, and the first argument is taken as the flavour
# when it contains no dot. Either variable is undef when no argument qualifies.
150$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
151$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
152
# $dir receives the directory part of this script's own path, including the
# trailing slash or backslash. arm-xlate.pl is then looked for beside this
# script first and in ../../perlasm/ second; the script dies if neither exists.
153$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
154( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
155( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
156die "can't locate arm-xlate.pl";
157
# Everything subsequently printed to STDOUT is piped through arm-xlate.pl,
# which is run with the same perl interpreter that runs this script.
# NOTE(review): the return value of open is not checked, and the output path is
# interpolated unquoted into the command string - confirm this is intended.
158open OUT,"| \"$^X\" $xlate $flavour $output";
159*STDOUT=*OUT;
160
# Symbolic names for the AArch64 registers used by the generated kernel.
# x0-x3 are named directly; $counter (x16) and $cc (x8) receive copies of x4
# and x5 at kernel entry (see the "mov x16, x4" / "mov x8, x5" lines in the
# kernel text), which frees x4/x5 for the pointer names declared further down.
161$input_ptr="x0"; #argument block
162$bit_length="x1";
163$output_ptr="x2";
164$current_tag="x3";
165$counter="x16";
166$cc="x8";
167
# The scope opened here is not closed within this part of the file.
168{
# General-purpose registers. The input_* and output_* names deliberately map
# to the same registers (x6-x7 for block 0, x19-x24 for blocks 1-3).
169my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
170my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
171my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
172my ($output_l0,$output_h0)=map("x$_",(6..7));
173
# Counter scratch and last-round-key halves: x9-x15, with w9/w11/w12 being the
# 32-bit views of $ctr32x, $ctr96_t32x and $rctr32x respectively.
174my $ctr32w="w9";
175my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
176my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
177
# Counter blocks (v0-v3) and result blocks (v4-v7), each given in .16b, bare
# vector, d-register and (results only) q-register spellings.
178my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
179my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
180my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
181my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
182
# GHASH accumulators: high, middle and low parts in v9-v11.
183my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
184my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
185my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
186
# Hash-key registers: h1-h4 in v12-v15, h12k and h34k in v16-v17.
187my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
188my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
189my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
190
# Temporaries. Several names share one physical register:
#   v8  : $t0, $t4, $mod_constant
#   v28 : $t1, $t5
#   v29 : $t2, $t8
#   v30 : $t3, $t9
#   v31 : $t6, $mod_t
#   v4  : $t7 (also $res0 and $ctr_t0)
191my $t0="v8";
192my $t0d="d8";
193
194my ($t1,$t2,$t3)=map("v$_",(28..30));
195my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
196
197my $t4="v8";
198my $t4d="d8";
199my $t5="v28";
200my $t5d="d28";
201my $t6="v31";
202my $t6d="d31";
203
204my $t7="v4";
205my $t7d="d4";
206my $t8="v29";
207my $t8d="d29";
208my $t9="v30";
209my $t9d="d30";
210
# $ctr_t0..$ctr_t3 name the same registers as $res0..$res3 (v4-v7).
211my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
212my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
213my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
214
# Reduction constant and its scratch register (shared with $t0/$t4 and $t6).
215my $mod_constantd="d8";
216my $mod_constant="v8";
217my $mod_t="v31";
218
# Round keys rk0-rk9 live in v18-v27; the remaining names are alternative
# spellings of v20, v21 and v22.
219my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
220my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
221my $rk2q1="v20.1q";
222my $rk3q1="v21.1q";
223my $rk4v="v22";
224my $rk4d="d22";
225
# Start of the generated assembly text accumulated in $code: it includes
# arm_arch.h and opens an "#if __ARM_MAX_ARCH__>=8" guard. For flavours
# matching /64/ an ".arch armv8-a+crypto" directive followed by ".text" is
# appended; for every other flavour a ".fpu neon" preamble with Thumb-2 and
# ARM variants of the INST() macro is appended instead.
# NOTE(review): $_byte is interpolated into the non-64 preamble but is not
# assigned anywhere in the visible part of this file - confirm where it is set.
226$code=<<___;
227#include "arm_arch.h"
228
229#if __ARM_MAX_ARCH__>=8
230___
231$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
232$code.=<<___ if ($flavour !~ /64/);
233.fpu neon
234#ifdef __thumb2__
235.syntax unified
236.thumb
237# define INST(a,b,c,d) $_byte c,0xef,a,b
238#else
239.code 32
240# define INST(a,b,c,d) $_byte a,b,c,0xf2
241#endif
242
243.text
244___
245
246#########################################################################################
247# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
248# size_t bit_length,
249# unsigned char *out,
250# u64 *Xi,
251# unsigned char ivec[16],
252# const void *key);
253#
254$code.=<<___;
255.global aes_gcm_enc_128_kernel
256.type aes_gcm_enc_128_kernel,%function
257.align 4
258aes_gcm_enc_128_kernel:
259 cbz x1, .L128_enc_ret
260 stp x19, x20, [sp, #-112]!
261 mov x16, x4
262 mov x8, x5
263 stp x21, x22, [sp, #16]
264 stp x23, x24, [sp, #32]
265 stp d8, d9, [sp, #48]
266 stp d10, d11, [sp, #64]
267 stp d12, d13, [sp, #80]
268 stp d14, d15, [sp, #96]
269
270 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
271 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
272
273 ld1 {$acc_lb}, [$current_tag]
274 ext $acc_lb, $acc_lb, $acc_lb, #8
275 rev64 $acc_lb, $acc_lb
276 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
277 mov $len, $main_end_input_ptr
278
279 ldr $rk9q, [$cc, #144] @ load rk9
280 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
281 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
282
283 lsr $rctr32x, $ctr96_t32x, #32
284 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
285 ext $h4b, $h4b, $h4b, #8
286
287 fmov $ctr1d, $ctr96_b64x @ CTR block 1
288 rev $rctr32w, $rctr32w @ rev_ctr32
289
290 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
291 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
292 ldr $rk0q, [$cc, #0] @ load rk0
293
294 rev $ctr32w, $rctr32w @ CTR block 1
295 add $rctr32w, $rctr32w, #1 @ CTR block 1
296 fmov $ctr3d, $ctr96_b64x @ CTR block 3
297
298 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
299 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
300
301 fmov $ctr1.d[1], $ctr32x @ CTR block 1
302 rev $ctr32w, $rctr32w @ CTR block 2
303
304 fmov $ctr2d, $ctr96_b64x @ CTR block 2
305 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
306 add $rctr32w, $rctr32w, #1 @ CTR block 2
307
308 fmov $ctr2.d[1], $ctr32x @ CTR block 2
309 rev $ctr32w, $rctr32w @ CTR block 3
310
311 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
312 ldr $rk1q, [$cc, #16] @ load rk1
313
314 add $rctr32w, $rctr32w, #1 @ CTR block 3
315 fmov $ctr3.d[1], $ctr32x @ CTR block 3
316
317 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
318 ext $h3b, $h3b, $h3b, #8
319
320 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
321 ldr $rk2q, [$cc, #32] @ load rk2
322
323 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
324 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
325 ext $h1b, $h1b, $h1b, #8
326
327 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
328 ldr $rk8q, [$cc, #128] @ load rk8
329
330 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
331 ldr $rk3q, [$cc, #48] @ load rk3
332
333 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
334 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
335
336 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
337 ldr $rk6q, [$cc, #96] @ load rk6
338
339 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
340 ldr $rk7q, [$cc, #112] @ load rk7
341
342 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
343 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
344
345 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
346 ldr $rk5q, [$cc, #80] @ load rk5
347
348 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
349 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
350 ext $h2b, $h2b, $h2b, #8
351
352 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
353
354 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
355 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
356
357 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
358
359 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
360
361 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
362 ldr $rk4q, [$cc, #64] @ load rk4
363
364 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
365
366 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
367 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
368
369 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
370 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
371
372 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
373 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
374
375 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
376
377 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
378
379 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
380
381 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
382
383 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
384
385 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
386
387 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
388 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
389
390 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
391
392 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
393
394 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
395
396 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
397
398 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
399
400 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
401
402 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
403
404 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
405
406 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
407
408 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
409
410 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
411
412 aese $ctr2b, $rk9 @ AES block 2 - round 9
413
414 aese $ctr0b, $rk9 @ AES block 0 - round 9
415
416 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
417
418 aese $ctr1b, $rk9 @ AES block 1 - round 9
419
420 aese $ctr3b, $rk9 @ AES block 3 - round 9
421 b.ge .L128_enc_tail @ handle tail
422
423 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
424
425 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
426
427 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
428
429 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
430
431 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
432 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
433
434 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
435 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
436
437 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
438 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
439 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
440
441 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
442 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
443
444 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
445 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
446
447 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
448 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
449 rev $ctr32w, $rctr32w @ CTR block 4
450
451 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
452 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
453
454 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
455 fmov $ctr0d, $ctr96_b64x @ CTR block 4
456 add $rctr32w, $rctr32w, #1 @ CTR block 4
457
458 fmov $ctr0.d[1], $ctr32x @ CTR block 4
459 rev $ctr32w, $rctr32w @ CTR block 5
460
461 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
462 fmov $ctr1d, $ctr96_b64x @ CTR block 5
463 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
464
465 add $rctr32w, $rctr32w, #1 @ CTR block 5
466 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
467 fmov $ctr1.d[1], $ctr32x @ CTR block 5
468
469 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
470 rev $ctr32w, $rctr32w @ CTR block 6
471 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
472
473 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
474 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
475
476 add $rctr32w, $rctr32w, #1 @ CTR block 6
477 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
478 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
479
480 fmov $ctr2d, $ctr96_b64x @ CTR block 6
481 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
482
483 fmov $ctr2.d[1], $ctr32x @ CTR block 6
484 rev $ctr32w, $rctr32w @ CTR block 7
485 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
486
487 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
488
489 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
490 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
491 b.ge .L128_enc_prepretail @ do prepretail
492
493 .L128_enc_main_loop: @ main loop start
494 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
495 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
496 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
497
498 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
499 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
500
501 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
502 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
503
504 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
505 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
506 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
507
508 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
509 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
510
511 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
512 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
513
514 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
515 eor $res0b, $res0b, $acc_lb @ PRE 1
516
517 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
518 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
519
520 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
521 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
522 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
523
524 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
525 rev $ctr32w, $rctr32w @ CTR block 4k+8
526
527 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
528 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
529 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
530
531 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
532 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
533 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
534
535 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
536
537 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
538 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
539
540 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
541
542 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
543 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
544
545 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
546
547 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
548 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
549
550 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
551
552 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
553 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
554
555 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
556 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
557
558 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
559 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
560
561 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
562 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
563
564 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
565 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
566
567 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
568 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
569
570 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
571
572 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
573 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
574
575 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
576
577 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
578 movi $mod_constant.8b, #0xc2
579
580 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
581 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
582
583 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
584
585 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
586 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
587
588 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
589 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
590
591 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
592 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
593
594 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
595 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
596
597 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
598 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
599
600 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
601 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
602
603 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
604 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
605
606 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
607 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
608
609 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
610 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
611
612 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
613 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
614
615 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
616 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
617 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
618
619 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
620 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
621 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
622
623 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
624 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
625
626 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
627 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
628
629 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
630 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
631
632 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
633 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
634
635 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
636 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
637
638 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
639 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
640
641 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
642 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
643
644 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
645 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
646 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
647
648 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
649 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
650
651 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
652 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
653
654 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
655 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
656
657 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
658 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
659
660 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
661 rev $ctr32w, $rctr32w @ CTR block 4k+9
662 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
663
664 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
665 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
666
667 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
668 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
669 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
670
671 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
672 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
673 rev $ctr32w, $rctr32w @ CTR block 4k+10
674
675 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
676 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
677 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
678 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
679
680 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
681 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
682 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
683 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
684
685 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
686 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
687
688 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
689 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
690 rev $ctr32w, $rctr32w @ CTR block 4k+11
691
692 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
693 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
694
695 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
696 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
697 b.lt .L128_enc_main_loop
698
699 .L128_enc_prepretail: @ PREPRETAIL
700 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
701 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
702 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
703
704 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
705 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
706 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
707
708 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
709 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
710
711 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
712
713 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
714 eor $res0b, $res0b, $acc_lb @ PRE 1
715
716 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
717
718 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
719 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
720
721 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
722 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
723
724 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
725 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
726
727 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
728 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
729
730 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
731
732 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
733 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
734
735 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
736
737 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
738 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
739
740 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
741
742 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
743 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
744
745 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
746
747 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
748 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
749
750 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
751 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
752
753 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
754
755 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
756 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
757
758 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
759
760 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
761
762 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
763 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
764
765 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
766
767 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
768 movi $mod_constant.8b, #0xc2
769
770 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
771 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
772
773 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
774
775 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
776 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
777
778 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
779
780 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
781 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
782
783 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
784
785 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
786 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
787
788 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
789 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
790
791 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
792
793 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
794 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
795
796 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
797
798 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
799 ext $acc_hb, $acc_hb, $acc_hb, #8
800
801 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
802
803 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
804 eor $acc_mb, $acc_mb, $acc_lb
805
806 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
807
808 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
809
810 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
811
812 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
813 eor $acc_mb, $acc_mb, $t1.16b
814
815 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
816
817 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
818
819 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
820
821 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
822 eor $acc_mb, $acc_mb, $acc_hb
823
824 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
825
826 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
827
828 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
829
830 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
831
832 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
833 ext $acc_mb, $acc_mb, $acc_mb, #8
834
835 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
836
837 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
838 eor $acc_lb, $acc_lb, $t1.16b
839
840 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
841
842 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
843
844 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
845
846 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
847
848 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
849 eor $acc_lb, $acc_lb, $acc_mb
850
851 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
852 .L128_enc_tail: @ TAIL
853
854 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
855 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
856
857 cmp $main_end_input_ptr, #48
858
859 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
860 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
861 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
862
863 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
864
865 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
866
867 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
868
869 b.gt .L128_enc_blocks_more_than_3
870
871 sub $rctr32w, $rctr32w, #1
872 movi $acc_l.8b, #0
873 mov $ctr3b, $ctr2b
874
875 cmp $main_end_input_ptr, #32
876 mov $ctr2b, $ctr1b
877 movi $acc_h.8b, #0
878
879 movi $acc_m.8b, #0
880 b.gt .L128_enc_blocks_more_than_2
881
882 mov $ctr3b, $ctr1b
883 cmp $main_end_input_ptr, #16
884
885 sub $rctr32w, $rctr32w, #1
886 b.gt .L128_enc_blocks_more_than_1
887
888 sub $rctr32w, $rctr32w, #1
889 b .L128_enc_blocks_less_than_1
890 .L128_enc_blocks_more_than_3: @ blocks left > 3
891 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
892
893 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
894
895 rev64 $res0b, $res1b @ GHASH final-3 block
896
897 eor $res0b, $res0b, $t0.16b @ feed in partial tag
898 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
899 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
900
901 fmov $res1d, $input_l0 @ AES final-2 block - mov low
902
903 movi $t0.8b, #0 @ suppress further partial tag feed in
904 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
905
906 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
907 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
908
909 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
910
911 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
912
913 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
914 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
915
916 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
917 .L128_enc_blocks_more_than_2: @ blocks left > 2
918
919 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
920
921 rev64 $res0b, $res1b @ GHASH final-2 block
922 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
923
924 eor $res0b, $res0b, $t0.16b @ feed in partial tag
925
926 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
927
928 fmov $res1d, $input_l0 @ AES final-1 block - mov low
929 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
930
931 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
932 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
933
934 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
935
936 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
937
938 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
939
940 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
941
942 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
943
944 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
945
946 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
947
948 movi $t0.8b, #0 @ suppress further partial tag feed in
949
950 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
951 .L128_enc_blocks_more_than_1: @ blocks left > 1
952
953 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
954
955 rev64 $res0b, $res1b @ GHASH final-1 block
956 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
957
958 eor $res0b, $res0b, $t0.16b @ feed in partial tag
959
960 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
961 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
962
963 fmov $res1d, $input_l0 @ AES final block - mov low
964
965 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
966 fmov $res1.d[1], $input_h0 @ AES final block - mov high
967
968 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
969
970 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
971
972 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
973
974 eor $res1b, $res1b, $ctr3b @ AES final block - result
975
976 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
977
978 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
979
980 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
981
982 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
983
984 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
985 movi $t0.8b, #0 @ suppress further partial tag feed in
986 .L128_enc_blocks_less_than_1: @ blocks left <= 1
987
988 and $bit_length, $bit_length, #127 @ bit_length %= 128
989 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
990
991 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
992 sub $bit_length, $bit_length, #128 @ bit_length -= 128
993
994 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
995
996 and $bit_length, $bit_length, #127 @ bit_length %= 128
997
998 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
999 cmp $bit_length, #64
1000
1001 csel $input_l0, $rk10_l, $rk10_h, lt
1002 csel $input_h0, $rk10_h, xzr, lt
1003
1004 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
1005
1006 fmov $ctr0.d[1], $input_h0
1007
1008 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1009
1010 rev64 $res0b, $res1b @ GHASH final block
1011
1012 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1013
1014 mov $t0d, $res0.d[1] @ GHASH final block - mid
1015
1016 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1017 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1018
1019 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1020
1021 rev $ctr32w, $rctr32w
1022
1023 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1024
1025 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1026
1027 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1028
1029 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1030
1031 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1032 movi $mod_constant.8b, #0xc2
1033
1034 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1035
1036 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1037
1038 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1039
1040 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1041
1042 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1043
1044 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1045
1046 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1047
1048 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1049
1050 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1051
1052 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1053
1054 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
1055 st1 { $res1b}, [$output_ptr] @ store all 16B
1056
1057 str $ctr32w, [$counter, #12] @ store the updated counter
1058
1059 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1060 ext $acc_lb, $acc_lb, $acc_lb, #8
1061 rev64 $acc_lb, $acc_lb
1062 mov x0, $len
1063 st1 { $acc_l.16b }, [$current_tag]
1064 ldp x21, x22, [sp, #16]
1065 ldp x23, x24, [sp, #32]
1066 ldp d8, d9, [sp, #48]
1067 ldp d10, d11, [sp, #64]
1068 ldp d12, d13, [sp, #80]
1069 ldp d14, d15, [sp, #96]
1070 ldp x19, x20, [sp], #112
1071 ret
1072
1073.L128_enc_ret:
1074 mov w0, #0x0
1075 ret
1076.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1077___
1078
1079#########################################################################################
1080# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
1081# size_t len,
1082# unsigned char *out,
1083# const void *key,
1084# unsigned char ivec[16],
1085# u64 *Xi);
1086#
1087$code.=<<___;
1088.global aes_gcm_dec_128_kernel
1089.type aes_gcm_dec_128_kernel,%function
1090.align 4
1091aes_gcm_dec_128_kernel:
1092 cbz x1, .L128_dec_ret
1093 stp x19, x20, [sp, #-112]!
1094 mov x16, x4
1095 mov x8, x5
1096 stp x21, x22, [sp, #16]
1097 stp x23, x24, [sp, #32]
1098 stp d8, d9, [sp, #48]
1099 stp d10, d11, [sp, #64]
1100 stp d12, d13, [sp, #80]
1101 stp d14, d15, [sp, #96]
1102
1103 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
1104 mov $len, $main_end_input_ptr
1105 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1106
1107 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1108 ldr $rk0q, [$cc, #0] @ load rk0
1109
1110 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1111 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
1112
1113 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1114 ext $h2b, $h2b, $h2b, #8
1115
1116 lsr $rctr32x, $ctr96_t32x, #32
1117 fmov $ctr2d, $ctr96_b64x @ CTR block 2
1118
1119 ldr $rk1q, [$cc, #16] @ load rk1
1120 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1121 rev $rctr32w, $rctr32w @ rev_ctr32
1122
1123 fmov $ctr1d, $ctr96_b64x @ CTR block 1
1124 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
1125
1126 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1127 rev $ctr32w, $rctr32w @ CTR block 1
1128
1129 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
1130 ldr $rk2q, [$cc, #32] @ load rk2
1131 add $rctr32w, $rctr32w, #1 @ CTR block 1
1132
1133 fmov $ctr1.d[1], $ctr32x @ CTR block 1
1134 rev $ctr32w, $rctr32w @ CTR block 2
1135 add $rctr32w, $rctr32w, #1 @ CTR block 2
1136
1137 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1138 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
1139
1140 fmov $ctr2.d[1], $ctr32x @ CTR block 2
1141 rev $ctr32w, $rctr32w @ CTR block 3
1142
1143 fmov $ctr3d, $ctr96_b64x @ CTR block 3
1144 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
1145 add $rctr32w, $rctr32w, #1 @ CTR block 3
1146
1147 fmov $ctr3.d[1], $ctr32x @ CTR block 3
1148 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1149
1150 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1151 ldr $rk3q, [$cc, #48] @ load rk3
1152
1153 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1154 ldr $rk6q, [$cc, #96] @ load rk6
1155
1156 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1157 ldr $rk7q, [$cc, #112] @ load rk7
1158
1159 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1160 ldr $rk4q, [$cc, #64] @ load rk4
1161
1162 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1163
1164 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1165
1166 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
1167 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
1168
1169 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
1170 ld1 { $acc_lb}, [$current_tag]
1171 ext $acc_lb, $acc_lb, $acc_lb, #8
1172 rev64 $acc_lb, $acc_lb
1173
1174 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1175 ldr $rk5q, [$cc, #80] @ load rk5
1176
1177 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1178
1179 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1180
1181 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1182 ldr $rk9q, [$cc, #144] @ load rk9
1183
1184 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1185
1186 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1187
1188 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1189 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1190 ext $h3b, $h3b, $h3b, #8
1191
1192 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1193 ldr $rk8q, [$cc, #128] @ load rk8
1194
1195 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1196
1197 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1198
1199 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1200
1201 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1202
1203 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1204 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1205 ext $h1b, $h1b, $h1b, #8
1206
1207 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1208
1209 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1210
1211 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1212
1213 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1214
1215 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
1216 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
1217
1218 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1219 ext $h4b, $h4b, $h4b, #8
1220 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
1221 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1222
1223 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1224
1225 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1226
1227 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1228 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
1229
1230 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1231
1232 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1233 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
1234
1235 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1236
1237 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1238
1239 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
1240 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
1241
1242 aese $ctr2b, $rk9 @ AES block 2 - round 9
1243
1244 aese $ctr3b, $rk9 @ AES block 3 - round 9
1245
1246 aese $ctr0b, $rk9 @ AES block 0 - round 9
1247 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
1248
1249 aese $ctr1b, $rk9 @ AES block 1 - round 9
1250 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
1251 b.ge .L128_dec_tail @ handle tail
1252
1253 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
1254
1255 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
1256
1257 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
1258 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
1259
1260 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
1261 rev64 $res0b, $res0b @ GHASH block 0
1262 rev $ctr32w, $rctr32w @ CTR block 4
1263
1264 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
1265 add $rctr32w, $rctr32w, #1 @ CTR block 4
1266 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
1267
1268 rev64 $res1b, $res1b @ GHASH block 1
1269 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1270 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
1271
1272 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
1273
1274 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
1275 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1276
1277 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
1278
1279 fmov $ctr0d, $ctr96_b64x @ CTR block 4
1280
1281 fmov $ctr0.d[1], $ctr32x @ CTR block 4
1282 rev $ctr32w, $rctr32w @ CTR block 5
1283 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
1284
1285 fmov $ctr1d, $ctr96_b64x @ CTR block 5
1286 add $rctr32w, $rctr32w, #1 @ CTR block 5
1287 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
1288
1289 fmov $ctr1.d[1], $ctr32x @ CTR block 5
1290 rev $ctr32w, $rctr32w @ CTR block 6
1291 add $rctr32w, $rctr32w, #1 @ CTR block 6
1292
1293 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
1294
1295 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
1296 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
1297 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
1298
1299 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
1300 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
1301
1302 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
1303 b.ge .L128_dec_prepretail @ do prepretail
1304
1305 .L128_dec_main_loop: @ main loop start
1306 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1307 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1308 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1309
1310 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1311 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1312
1313 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1314 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1315
1316 rev64 $res2b, $res2b @ GHASH block 4k+2
1317 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1318 rev $ctr32w, $rctr32w @ CTR block 4k+7
1319
1320 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1321 eor $res0b, $res0b, $acc_lb @ PRE 1
1322 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1323
1324 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1325 rev64 $res3b, $res3b @ GHASH block 4k+3
1326
1327 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1328 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1329 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1330
1331 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1332 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1333 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1334
1335 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1336 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1337
1338 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1339 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1340
1341 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1342 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1343
1344 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1345
1346 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1347 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1348
1349 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1350 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1351
1352 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1353
1354 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1355 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1356
1357 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1358 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1359
1360 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1361 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1362 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1363
1364 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1365 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1366
1367 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1368
1369 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1370 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1371
1372 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1373
1374 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1375 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1376
1377 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1378
1379 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1380 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1381
1382 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1383
1384 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1385 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1386
1387 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1388 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1389
1390 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1391 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1392
1393 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1394 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1395
1396 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1397 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1398
1399 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1400 movi $mod_constant.8b, #0xc2
1401
1402 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1403 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1404
1405 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1406
1407 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1408 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1409
1410 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1411 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1412
1413 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1414 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1415 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
1416
1417 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1418 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1419
1420 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1421 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1422
1423 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1424 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1425
1426 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1427 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1428
1429 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1430 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1431
1432 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1433 rev $ctr32w, $rctr32w @ CTR block 4k+8
1434
1435 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1436 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
1437 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1438
1439 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1440 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
1441
1442 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1443 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1444
1445 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1446
1447 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1448 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
1449
1450 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1451 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
1452
1453 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
1454 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1455 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
1456
1457 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1458 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
1459
1460 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1461 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1462
1463 rev64 $res1b, $res1b @ GHASH block 4k+5
1464 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1465 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1466
1467 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1468 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1469
1470 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1471 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
1472
1473 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1474 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
1475 rev $ctr32w, $rctr32w @ CTR block 4k+9
1476
1477 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1478 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
1479 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1480
1481 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1482 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1483
1484 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1485 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
1486 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1487
1488 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
1489 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
1490 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
1491
1492 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1493 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
1494 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
1495
1496 rev64 $res0b, $res0b @ GHASH block 4k+4
1497 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1498 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
1499
1500 rev $ctr32w, $rctr32w @ CTR block 4k+10
1501 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
1502
1503 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
1504 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
1505
1506 eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
1507 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
1508
1509 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
1510 b.lt .L128_dec_main_loop @ loop back while input_ptr < main_end_input_ptr (flags from LOOP CONTROL cmp)
1511
1512 .L128_dec_prepretail: @ PREPRETAIL
1513 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1514 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1515 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1516
1517 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1518 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1519
1520 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1521 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1522
1523 eor $res0b, $res0b, $acc_lb @ PRE 1
1524 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1525 rev64 $res2b, $res2b @ GHASH block 4k+2
1526
1527 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1528 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1529
1530 rev $ctr32w, $rctr32w @ CTR block 4k+7
1531 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1532 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1533
1534 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1535 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1536 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1537
1538 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1539 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1540
1541 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1542 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1543
1544 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1545 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1546 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1547
1548 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1549 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1550
1551 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1552 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1553
1554 rev64 $res3b, $res3b @ GHASH block 4k+3
1555
1556 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1557 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1558
1559 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1560
1561 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1562 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1563
1564 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1565
1566 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1567 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1568
1569 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1570
1571 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1572 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1573
1574 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1575
1576 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1577
1578 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1579 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1580
1581 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1582 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1583
1584 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1585
1586 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1587 movi $mod_constant.8b, #0xc2
1588
1589 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1590 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1591
1592 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1593
1594 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1595 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1596
1597 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1598 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1599
1600 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1601 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1602 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1603
1604 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1605
1606 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1607 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1608
1609 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1610
1611 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1612 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1613
1614 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1615
1616 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1617 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1618
1619 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1620
1621 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1622
1623 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1624
1625 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1626 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1627
1628 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1629
1630 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1631 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1632
1633 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1634
1635 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1636 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1637
1638 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1639
1640 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1641
1642 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1643
1644 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1645 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1646
1647 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1648
1649 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1650
1651 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1652
1653 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1654 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1655
1656 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1657 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1658
1659 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1660
1661 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1662 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1663
1664 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1665
1666 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1667 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1668
1669 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1670 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1671
1672 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1673 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1674 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1675
1676 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1677 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1678 .L128_dec_tail: @ TAIL
1679
1680 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1681 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
1682
1683 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
1684
1685 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1686
1687 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1688
1689 cmp $main_end_input_ptr, #48
1690
1691 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1692
1693 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
1694 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1695 b.gt .L128_dec_blocks_more_than_3
1696
1697 mov $ctr3b, $ctr2b
1698 sub $rctr32w, $rctr32w, #1
1699 movi $acc_l.8b, #0
1700
1701 movi $acc_h.8b, #0
1702 mov $ctr2b, $ctr1b
1703
1704 movi $acc_m.8b, #0
1705 cmp $main_end_input_ptr, #32
1706 b.gt .L128_dec_blocks_more_than_2
1707
1708 cmp $main_end_input_ptr, #16
1709
1710 mov $ctr3b, $ctr1b
1711 sub $rctr32w, $rctr32w, #1
1712 b.gt .L128_dec_blocks_more_than_1
1713
1714 sub $rctr32w, $rctr32w, #1
1715 b .L128_dec_blocks_less_than_1
1716 .L128_dec_blocks_more_than_3: @ blocks left > 3
1717 rev64 $res0b, $res1b @ GHASH final-3 block
1718 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
1719
1720 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1721
1722 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
1723 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
1724 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
1725
1726 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
1727 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
1728
1729 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
1730 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
1731
1732 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
1733
1734 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1735
1736 movi $t0.8b, #0 @ suppress further partial tag feed in
1737 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
1738
1739 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
1740 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
1741 .L128_dec_blocks_more_than_2: @ blocks left > 2
1742
1743 rev64 $res0b, $res1b @ GHASH final-2 block
1744 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
1745
1746 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1747
1748 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
1749 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
1750
1751 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
1752
1753 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1754
1755 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1756 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
1757
1758 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
1759 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1760
1761 movi $t0.8b, #0 @ suppress further partial tag feed in
1762
1763 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1764
1765 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
1766 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1767
1768 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1769
1770 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1771 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
1772 .L128_dec_blocks_more_than_1: @ blocks left > 1
1773
1774 rev64 $res0b, $res1b @ GHASH final-1 block
1775
1776 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
1777 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1778
1779 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
1780
1781 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
1782
1783 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1784
1785 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
1786 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
1787
1788 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
1789 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1790
1791 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1792
1793 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1794
1795 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1796 movi $t0.8b, #0 @ suppress further partial tag feed in
1797
1798 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1799
1800 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1801 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
1802
1803 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
1804 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1805 .L128_dec_blocks_less_than_1: @ blocks left <= 1
1806
1807 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
1808 and $bit_length, $bit_length, #127 @ bit_length %= 128
1809
1810 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
1811 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1812
1813 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1814
1815 and $bit_length, $bit_length, #127 @ bit_length %= 128
1816
1817 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1818 cmp $bit_length, #64
1819
1820 csel $ctr96_b64x, $rk10_h, xzr, lt
1821 csel $ctr32x, $rk10_l, $rk10_h, lt
1822
1823 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
1824
1825 mov $ctr0.d[1], $ctr96_b64x
1826
1827 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1828
1829 rev64 $res0b, $res1b @ GHASH final block
1830
1831 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1832
1833 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1834
1835 and $output_h0, $output_h0, $ctr96_b64x
1836
1837 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1838 mov $t0d, $res0.d[1] @ GHASH final block - mid
1839
1840 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1841 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1842
1843 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1844
1845 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1846 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
1847 and $output_l0, $output_l0, $ctr32x
1848
1849 rev $ctr32w, $rctr32w
1850
1851 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1852 movi $mod_constant.8b, #0xc2
1853
1854 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1855
1856 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
1857 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1858
1859 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1860
1861 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1862
1863 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1864
1865 orr $output_l0, $output_l0, $end_input_ptr
1866 str $ctr32w, [$counter, #12] @ store the updated counter
1867
1868 orr $output_h0, $output_h0, $main_end_input_ptr
1869 stp $output_l0, $output_h0, [$output_ptr]
1870 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1871
1872 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1873
1874 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1875
1876 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1877 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1878
1879 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1880
1881 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1882 ext $acc_lb, $acc_lb, $acc_lb, #8
1883 rev64 $acc_lb, $acc_lb
1884 mov x0, $len
1885 st1 { $acc_l.16b }, [$current_tag]
1886
1887 ldp x21, x22, [sp, #16]
1888 ldp x23, x24, [sp, #32]
1889 ldp d8, d9, [sp, #48]
1890 ldp d10, d11, [sp, #64]
1891 ldp d12, d13, [sp, #80]
1892 ldp d14, d15, [sp, #96]
1893 ldp x19, x20, [sp], #112
1894 ret
1895
1896 .L128_dec_ret:
1897 mov w0, #0x0
1898 ret
1899.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
1900___
1901}
1902
1903{
# Register aliases interpolated into the assembly heredoc(s) that follow in this scope.
# Each Perl variable only holds a register *name*; several variables name the same
# physical register, so such aliases cannot hold different live values at the same time.
#
# General-purpose registers:
#   x4..x7   - end_input_ptr, main_end_input_ptr, input_l0, input_h0
#   x19..x24 - input_l1..input_h3; the output_l1..output_h3 names map to the same registers
#   x6, x7   - output_l0/output_h0 map to the same registers as input_l0/input_h0
1904my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
1905my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
1906my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
1907my ($output_l0,$output_h0)=map("x$_",(6..7));
1908
# Counter bookkeeping. The w-names are the 32-bit views of the x-registers with the same
# number: w9 <-> $ctr32x (x9), w11 <-> $ctr96_t32x (x11), w12 <-> $rctr32x (x12).
# x13/x14 receive the two 64-bit halves of the last round key (the kernel below loads
# them with "ldp ... @ load rk12"); x15 ($len) receives the byte length.
1909my $ctr32w="w9";
1910my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
1911my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
1912
# v0..v3 are the four counter blocks and v4..v7 the four result blocks. The b/d/q
# suffixed names are different views (.16b vector, d scalar, q scalar) of those same
# SIMD registers, not additional registers.
1913my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
1914my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
1915my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
1916my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
1917
# GHASH accumulators: high, mid and low parts live in v9, v10 and v11.
1918my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
1919my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
1920my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
1921
# Hash key powers h1..h4 in v12..v15; h12k (v16) and h34k (v17) are the combined
# values the kernel builds with trn1/trn2/eor ("h2k | h1k", "h4k | h3k").
1922my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
1923my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
1924my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
1925
# GHASH temporaries t0..t9. These overlap heavily with each other and with other names:
#   v8  : t0, t5, mod_constant
#   v30 : t1, t4, t9
#   v31 : t2, t6, mod_t
#   v4  : t3 (same register as res0 / ctr_t0)
#   v5  : t7 (same register as res1 / ctr_t1)
#   v6  : t8 (same register as res2 / ctr_t2)
1926my $t0="v8";
1927my $t0d="d8";
1928my $t3="v4";
1929my $t3d="d4";
1930
1931my ($t1,$t2)=map("v$_",(30..31));
1932my ($t1d,$t2d)=map("d$_",(30..31));
1933
1934my $t4="v30";
1935my $t4d="d30";
1936my $t5="v8";
1937my $t5d="d8";
1938my $t6="v31";
1939my $t6d="d31";
1940
1941my $t7="v5";
1942my $t7d="d5";
1943my $t8="v6";
1944my $t8d="d6";
1945my $t9="v30";
1946my $t9d="d30";
1947
# ctr_t0..ctr_t3 name v4..v7, the same registers as res0..res3; the kernel below moves
# the key-xored input words into them (fmov) before the final eor with the counter block.
1948my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
1949my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
1950my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
1951
# Reduction helpers: mod_constant is v8 (shared with t0/t5), mod_t is v31 (shared with t2/t6).
1952my $mod_constantd="d8";
1953my $mod_constant="v8";
1954my $mod_t="v31";
1955
# Round keys rk0..rk11 occupy v18..v29 (rk12 is kept in x13/x14, see above).
# rk2q1, rk3q1 and rk4v/rk4d are extra views of v20, v21 and v22, i.e. the very same
# registers as rk2, rk3 and rk4; writing through them overwrites those round keys.
1956my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
1957my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
1958my $rk2q1="v20.1q";
1959my $rk3q1="v21.1q";
1960my $rk4v="v22";
1961my $rk4d="d22";
1962
1963#########################################################################################
1964# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
1965# size_t bit_len,
1966# unsigned char *out,
1967# u64 *Xi,
1968# unsigned char ivec[16],
1969# const void *key);
1970#
1971$code.=<<___;
1972.global aes_gcm_enc_192_kernel
1973.type aes_gcm_enc_192_kernel,%function
1974.align 4
1975aes_gcm_enc_192_kernel:
1976 cbz x1, .L192_enc_ret
1977 stp x19, x20, [sp, #-112]!
1978 mov x16, x4
1979 mov x8, x5
1980 stp x21, x22, [sp, #16]
1981 stp x23, x24, [sp, #32]
1982 stp d8, d9, [sp, #48]
1983 stp d10, d11, [sp, #64]
1984 stp d12, d13, [sp, #80]
1985 stp d14, d15, [sp, #96]
1986
1987 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1988
1989 ldr $rk5q, [$cc, #80] @ load rk5
1990
1991 ldr $rk4q, [$cc, #64] @ load rk4
1992
1993 ldr $rk8q, [$cc, #128] @ load rk8
1994
1995 lsr $rctr32x, $ctr96_t32x, #32
1996 ldr $rk6q, [$cc, #96] @ load rk6
1997 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1998
1999 ldr $rk7q, [$cc, #112] @ load rk7
2000 rev $rctr32w, $rctr32w @ rev_ctr32
2001
2002 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2003 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2004
2005 rev $ctr32w, $rctr32w @ CTR block 1
2006 add $rctr32w, $rctr32w, #1 @ CTR block 1
2007 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2008
2009 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2010 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2011
2012 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2013 rev $ctr32w, $rctr32w @ CTR block 2
2014 add $rctr32w, $rctr32w, #1 @ CTR block 2
2015
2016 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2017 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2018
2019 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2020 rev $ctr32w, $rctr32w @ CTR block 3
2021
2022 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2023 ldr $rk0q, [$cc, #0] @ load rk0
2024
2025 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2026
2027 ldr $rk3q, [$cc, #48] @ load rk3
2028
2029 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2030
2031 ldr $rk1q, [$cc, #16] @ load rk1
2032
2033 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2034 ld1 { $acc_lb}, [$current_tag]
2035 ext $acc_lb, $acc_lb, $acc_lb, #8
2036 rev64 $acc_lb, $acc_lb
2037
2038 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2039 ldr $rk11q, [$cc, #176] @ load rk11
2040
2041 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2042 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2043 ext $h4b, $h4b, $h4b, #8
2044
2045 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2046 ldr $rk2q, [$cc, #32] @ load rk2
2047
2048 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2049 ldr $rk10q, [$cc, #160] @ load rk10
2050
2051 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2052 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2053 ext $h1b, $h1b, $h1b, #8
2054
2055 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2056 ldr $rk9q, [$cc, #144] @ load rk9
2057
2058 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2059 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2060 ext $h3b, $h3b, $h3b, #8
2061
2062 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2063
2064 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2065
2066 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2067
2068 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2069 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2070
2071 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2072
2073 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2074 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2075
2076 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2077
2078 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2079
2080 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2081
2082 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2083
2084 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2085
2086 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2087
2088 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2089
2090 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2091
2092 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2093
2094 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2095
2096 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2097
2098 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2099 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2100 ext $h2b, $h2b, $h2b, #8
2101
2102 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2103
2104 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2105
2106 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2107
2108 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2109 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2110
2111 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2112
2113 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2114
2115 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2116 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
2117
2118 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2119
2120 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2121
2122 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2123
2124 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2125
2126 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2127
2128 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2129
2130 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2131
2132 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
2133
2134 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
2135
2136 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
2137 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2138 mov $len, $main_end_input_ptr
2139
2140 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
2141 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2142
2143 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
2144 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2145
2146 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
2147
2148 aese $ctr2b, $rk11 @ AES block 2 - round 11
2149 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2150 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2151
2152 aese $ctr1b, $rk11 @ AES block 1 - round 11
2153 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
2154
2155 aese $ctr0b, $rk11 @ AES block 0 - round 11
2156 add $rctr32w, $rctr32w, #1 @ CTR block 3
2157
2158 aese $ctr3b, $rk11 @ AES block 3 - round 11
2159 b.ge .L192_enc_tail @ handle tail
2160
2161 rev $ctr32w, $rctr32w @ CTR block 4
2162 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
2163
2164 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
2165 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
2166
2167 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
2168
2169 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
2170 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2171 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2172
2173 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
2174
2175 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
2176 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
2177 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
2178
2179 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
2180 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
2181
2182 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
2183 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
2184
2185 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
2186 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
2187
2188 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
2189
2190 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
2191 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
2192
2193 add $rctr32w, $rctr32w, #1 @ CTR block 4
2194 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
2195 fmov $ctr0d, $ctr96_b64x @ CTR block 4
2196
2197 fmov $ctr0.d[1], $ctr32x @ CTR block 4
2198 rev $ctr32w, $rctr32w @ CTR block 5
2199
2200 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
2201 add $rctr32w, $rctr32w, #1 @ CTR block 5
2202
2203 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
2204 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
2205
2206 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
2207
2208 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
2209 fmov $ctr1d, $ctr96_b64x @ CTR block 5
2210 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
2211
2212 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
2213
2214 fmov $ctr1.d[1], $ctr32x @ CTR block 5
2215 rev $ctr32w, $rctr32w @ CTR block 6
2216
2217 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
2218
2219 add $rctr32w, $rctr32w, #1 @ CTR block 6
2220 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
2221 fmov $ctr2d, $ctr96_b64x @ CTR block 6
2222
2223 fmov $ctr2.d[1], $ctr32x @ CTR block 6
2224 rev $ctr32w, $rctr32w @ CTR block 7
2225
2226 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
2227 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
2228
2229 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
2230 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
2231 b.ge .L192_enc_prepretail @ do prepretail
2232
2233 .L192_enc_main_loop: @ main loop start
2234 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2235 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2236
2237 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2238 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
2239
2240 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2241 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2242 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2243
2244 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2245 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2246
2247 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2248 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2249 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
2250
2251 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2252 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
2253
2254 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2255 eor $res0b, $res0b, $acc_lb @ PRE 1
2256
2257 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2258
2259 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2260 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2261
2262 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2263 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
2264
2265 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2266 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2267
2268 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2269
2270 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2271 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
2272
2273 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2274 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2275
2276 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2277 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
2278
2279 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2280 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2281
2282 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2283 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2284
2285 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2286
2287 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2288
2289 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2290 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2291
2292 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2293 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2294
2295 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2296
2297 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2298 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2299
2300 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2301
2302 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2303 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
2304 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2305
2306 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2307 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2308
2309 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2310 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2311
2312 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2313 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
2314
2315 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2316 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
2317 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2318
2319 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2320 rev $ctr32w, $rctr32w @ CTR block 4k+8
2321
2322 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2323 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
2324
2325 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2326 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2327
2328 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2329 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
2330
2331 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2332 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2333
2334 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2335 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2336
2337 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2338 movi $mod_constant.8b, #0xc2
2339
2340 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2341 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2342 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2343
2344 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2345 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2346
2347 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2348 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2349
2350 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2351 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2352
2353 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2354 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
2355
2356 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2357 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2358
2359 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2360 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
2361
2362 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2363 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2364
2365 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2366 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
2367 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2368
2369 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2370 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2371
2372 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2373 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
2374
2375 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2376 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2377 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
2378
2379 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2380 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
2381
2382 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2383 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2384 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
2385
2386 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2387
2388 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2389 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2390
2391 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2392
2393 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2394
2395 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2396
2397 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2398 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2399
2400 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2401
2402 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2403
2404 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2405
2406 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2407 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2408
2409 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2410
2411 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2412 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
2413
2414 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2415 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
2416 rev $ctr32w, $rctr32w @ CTR block 4k+9
2417
2418 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2419 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
2420 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
2421
2422 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2423 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
2424
2425 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
2426 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
2427 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
2428
2429 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2430 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
2431 rev $ctr32w, $rctr32w @ CTR block 4k+10
2432
2433 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
2434 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2435 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
2436
2437 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
2438 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2439
2440 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2441 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
2442 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
2443
2444 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
2445 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
2446 rev $ctr32w, $rctr32w @ CTR block 4k+11
2447
2448 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2449 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
2450
2451 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
2452 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
2453 b.lt .L192_enc_main_loop
2454
2455 .L192_enc_prepretail: @ PREPRETAIL
2456 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2457 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2458
2459 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2460 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2461 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2462
2463 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2464 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2465
2466 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2467
2468 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2469 eor $res0b, $res0b, $acc_lb @ PRE 1
2470 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2471
2472 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2473 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2474
2475 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2476
2477 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2478 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2479
2480 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2481 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2482
2483 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2484
2485 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2486 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2487
2488 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2489 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2490
2491 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2492 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2493
2494 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2495
2496 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2497 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2498
2499 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2500
2501 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2502 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2503
2504 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2505
2506 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2507 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2508
2509 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2510 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2511
2512 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2513
2514 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2515 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2516
2517 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2518
2519 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2520
2521 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2522
2523 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2524 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2525
2526 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2527
2528 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2529 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2530
2531 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2532
2533 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2534 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2535
2536 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2537
2538 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2539 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2540
2541 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2542
2543 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2544 movi $mod_constant.8b, #0xc2
2545
2546 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2547
2548 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2549
2550 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2551 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2552
2553 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2554
2555 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2556
2557 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2558 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2559
2560 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2561
2562 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2563 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
2564
2565 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2566
2567 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2568 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2569
2570 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2571
2572 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2573 eor $acc_mb, $acc_mb, $acc_lb
2574
2575 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2576
2577 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
2578
2579 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2580 ext $acc_hb, $acc_hb, $acc_hb, #8
2581
2582 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2583
2584 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2585 eor $acc_mb, $acc_mb, $t1.16b
2586
2587 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2588
2589 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2590
2591 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2592
2593 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2594 eor $acc_mb, $acc_mb, $acc_hb
2595
2596 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2597
2598 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2599
2600 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2601
2602 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
2603
2604 ext $acc_mb, $acc_mb, $acc_mb, #8
2605
2606 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2607
2608 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2609
2610 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2611
2612 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2613 eor $acc_lb, $acc_lb, $t1.16b
2614
2615 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2616
2617 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2618
2619 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2620
2621 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2622 eor $acc_lb, $acc_lb, $acc_mb
2623 .L192_enc_tail: @ TAIL
2624
2625 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2626 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
2627
2628 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2629 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2630
2631 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2632
2633 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2634 cmp $main_end_input_ptr, #48
2635
2636 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2637
2638 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2639 b.gt .L192_enc_blocks_more_than_3
2640
2641 sub $rctr32w, $rctr32w, #1
2642 movi $acc_m.8b, #0
2643
2644 mov $ctr3b, $ctr2b
2645 movi $acc_h.8b, #0
2646 cmp $main_end_input_ptr, #32
2647
2648 mov $ctr2b, $ctr1b
2649 movi $acc_l.8b, #0
2650 b.gt .L192_enc_blocks_more_than_2
2651
2652 sub $rctr32w, $rctr32w, #1
2653
2654 mov $ctr3b, $ctr1b
2655 cmp $main_end_input_ptr, #16
2656 b.gt .L192_enc_blocks_more_than_1
2657
2658 sub $rctr32w, $rctr32w, #1
2659 b .L192_enc_blocks_less_than_1
2660 .L192_enc_blocks_more_than_3: @ blocks left > 3
2661 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
2662
2663 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
2664
2665 rev64 $res0b, $res1b @ GHASH final-3 block
2666
2667 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
2668 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2669
2670 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
2671 fmov $res1d, $input_l0 @ AES final-2 block - mov low
2672
2673 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
2674
2675 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
2676
2677 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
2678
2679 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
2680
2681 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2682
2683 movi $t0.8b, #0 @ suppress further partial tag feed in
2684
2685 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
2686
2687 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
2688 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
2689 .L192_enc_blocks_more_than_2: @ blocks left > 2
2690
2691 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
2692
2693 rev64 $res0b, $res1b @ GHASH final-2 block
2694 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
2695
2696 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2697
2698 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
2699
2700 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2701 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
2702
2703 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2704 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
2705
2706 fmov $res1d, $input_l0 @ AES final-1 block - mov low
2707
2708 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
2709 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2710 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2711
2712 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2713
2714 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2715
2716 movi $t0.8b, #0 @ suppress further partial tag feed in
2717
2718 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
2719
2720 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2721 .L192_enc_blocks_more_than_1: @ blocks left > 1
2722
2723 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
2724
2725 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
2726
2727 rev64 $res0b, $res1b @ GHASH final-1 block
2728
2729 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
2730 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2731 movi $t0.8b, #0 @ suppress further partial tag feed in
2732
2733 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
2734
2735 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2736 eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
2737 fmov $res1d, $input_l0 @ AES final block - mov low
2738
2739 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2740 fmov $res1.d[1], $input_h0 @ AES final block - mov high
2741
2742 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2743
2744 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2745
2746 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2747
2748 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2749
2750 eor $res1b, $res1b, $ctr3b @ AES final block - result
2751
2752 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2753
2754 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2755 .L192_enc_blocks_less_than_1: @ blocks left <= 1
2756
2757 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2758 rev $ctr32w, $rctr32w
2759 and $bit_length, $bit_length, #127 @ bit_length %= 128
2760
2761 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2762 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
2763
2764 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2765 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
2766
2767 and $bit_length, $bit_length, #127 @ bit_length %= 128
2768
2769 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
2770 cmp $bit_length, #64
2771
2772 csel $input_l0, $rk12_l, $rk12_h, lt
2773 csel $input_h0, $rk12_h, xzr, lt
2774
2775 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
2776
2777 fmov $ctr0.d[1], $input_h0
2778
2779 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2780
2781 rev64 $res0b, $res1b @ GHASH final block
2782
2783 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2784
2785 mov $t0d, $res0.d[1] @ GHASH final block - mid
2786
2787 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2788
2789 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2790
2791 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2792
2793 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2794
2795 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2796
2797 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2798
2799 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2800 movi $mod_constant.8b, #0xc2
2801
2802 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2803
2804 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2805
2806 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2807
2808 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2809
2810 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2811
2812 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2813
2814 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2815
2816 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2817
2818 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2819
2820 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2821
2822 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2823 str $ctr32w, [$counter, #12] @ store the updated counter
2824
2825 st1 { $res1b}, [$output_ptr] @ store all 16B
2826
2827 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2828 ext $acc_lb, $acc_lb, $acc_lb, #8
2829 rev64 $acc_lb, $acc_lb
2830 mov x0, $len
2831 st1 { $acc_l.16b }, [$current_tag]
2832
2833 ldp x21, x22, [sp, #16]
2834 ldp x23, x24, [sp, #32]
2835 ldp d8, d9, [sp, #48]
2836 ldp d10, d11, [sp, #64]
2837 ldp d12, d13, [sp, #80]
2838 ldp d14, d15, [sp, #96]
2839 ldp x19, x20, [sp], #112
2840 ret
2841
2842.L192_enc_ret:
2843 mov w0, #0x0
2844 ret
2845.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
2846___
2847
2848#########################################################################################
2849# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
2850# size_t len,
2851# unsigned char *out,
2852# const void *key,
2853# unsigned char ivec[16],
2854# u64 *Xi);
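2855# NOTE(review): the body works on a bit count ($bit_length, shifted right by 3 to obtain byte_len) and copies x4/x5 into x16/x8 before use - presumably `len` above is in bits; confirm the unit and the argument order against the register assignments for $bit_length, $current_tag, $counter and $cc.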
2856$code.=<<___;
2857.global aes_gcm_dec_192_kernel
2858.type aes_gcm_dec_192_kernel,%function
2859.align 4
2860aes_gcm_dec_192_kernel:
2861 cbz x1, .L192_dec_ret
2862 stp x19, x20, [sp, #-112]!
2863 mov x16, x4
2864 mov x8, x5
2865 stp x21, x22, [sp, #16]
2866 stp x23, x24, [sp, #32]
2867 stp d8, d9, [sp, #48]
2868 stp d10, d11, [sp, #64]
2869 stp d12, d13, [sp, #80]
2870 stp d14, d15, [sp, #96]
2871
2872 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2873 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
2874
2875 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2876
2877 ldr $rk0q, [$cc, #0] @ load rk0
2878
2879 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2880 mov $len, $main_end_input_ptr
2881 ldr $rk2q, [$cc, #32] @ load rk2
2882
2883 lsr $rctr32x, $ctr96_t32x, #32
2884 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2885 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2886
2887 rev $rctr32w, $rctr32w @ rev_ctr32
2888 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2889
2890 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2891 ldr $rk1q, [$cc, #16] @ load rk1
2892
2893 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2894 rev $ctr32w, $rctr32w @ CTR block 1
2895
2896 add $rctr32w, $rctr32w, #1 @ CTR block 1
2897 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2898 ldr $rk3q, [$cc, #48] @ load rk3
2899
2900 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2901 rev $ctr32w, $rctr32w @ CTR block 2
2902 add $rctr32w, $rctr32w, #1 @ CTR block 2
2903
2904 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2905 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2906
2907 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2908 rev $ctr32w, $rctr32w @ CTR block 3
2909
2910 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2911 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2912
2913 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2914
2915 ldr $rk8q, [$cc, #128] @ load rk8
2916
2917 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2918
2919 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2920 ldr $rk11q, [$cc, #176] @ load rk11
2921
2922 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2923 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2924 ext $h4b, $h4b, $h4b, #8
2925
2926 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2927 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2928 ext $h2b, $h2b, $h2b, #8
2929
2930 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2931 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2932 ext $h3b, $h3b, $h3b, #8
2933
2934 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2935 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2936
2937 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2938 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2939 ext $h1b, $h1b, $h1b, #8
2940
2941 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2942 ldr $rk10q, [$cc, #160] @ load rk10
2943
2944 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2945 ldr $rk9q, [$cc, #144] @ load rk9
2946
2947 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2948 ldr $rk7q, [$cc, #112] @ load rk7
2949
2950 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2951 ldr $rk4q, [$cc, #64] @ load rk4
2952
2953 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2954 ld1 { $acc_lb}, [$current_tag]
2955 ext $acc_lb, $acc_lb, $acc_lb, #8
2956 rev64 $acc_lb, $acc_lb
2957
2958 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2959 add $rctr32w, $rctr32w, #1 @ CTR block 3
2960
2961 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2962 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2963
2964 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2965 ldr $rk5q, [$cc, #80] @ load rk5
2966
2967 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2968 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2969
2970 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2971
2972 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2973 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2974
2975 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2976 ldr $rk6q, [$cc, #96] @ load rk6
2977
2978 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2979
2980 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2981
2982 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2983
2984 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2985
2986 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2987
2988 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2989
2990 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2991
2992 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2993
2994 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2995
2996 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2997
2998 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2999
3000 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3001
3002 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3003
3004 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3005
3006 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3007
3008 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3009 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3010
3011 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3012 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3013
3014 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3015 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3016
3017 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3018 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3019
3020 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3021 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3022
3023 aese $ctr3b, $rk11 @ AES block 3 - round 11
3024
3025 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3026
3027 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3028
3029 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3030 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3031
3032 aese $ctr2b, $rk11 @ AES block 2 - round 11
3033
3034 aese $ctr1b, $rk11 @ AES block 1 - round 11
3035 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3036
3037 aese $ctr0b, $rk11 @ AES block 0 - round 11
3038 b.ge .L192_dec_tail @ handle tail
3039
3040 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
3041
3042 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
3043
3044 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
3045
3046 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
3047 rev $ctr32w, $rctr32w @ CTR block 4
3048 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
3049
3050 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
3051
3052 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
3053
3054 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
3055
3056 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
3057 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
3058 add $rctr32w, $rctr32w, #1 @ CTR block 4
3059
3060 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
3061 rev64 $res0b, $res0b @ GHASH block 0
3062 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3063
3064 fmov $ctr0d, $ctr96_b64x @ CTR block 4
3065 rev64 $res1b, $res1b @ GHASH block 1
3066 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3067
3068 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
3069 fmov $ctr0.d[1], $ctr32x @ CTR block 4
3070 rev $ctr32w, $rctr32w @ CTR block 5
3071
3072 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
3073 fmov $ctr1d, $ctr96_b64x @ CTR block 5
3074 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
3075
3076 add $rctr32w, $rctr32w, #1 @ CTR block 5
3077 fmov $ctr1.d[1], $ctr32x @ CTR block 5
3078 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
3079
3080 rev $ctr32w, $rctr32w @ CTR block 6
3081 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
3082
3083 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
3084 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
3085
3086 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
3087
3088 add $rctr32w, $rctr32w, #1 @ CTR block 6
3089 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
3090 b.ge .L192_dec_prepretail @ do prepretail
3091
3092 .L192_dec_main_loop: @ main loop start
3093 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3094 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3095
3096 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3097 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3098
3099 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3100 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3101 rev64 $res3b, $res3b @ GHASH block 4k+3
3102
3103 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3104 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3105
3106 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3107 eor $res0b, $res0b, $acc_lb @ PRE 1
3108
3109 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3110 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3111
3112 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3113 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3114
3115 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3116 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3117
3118 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3119 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3120 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3121
3122 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3123 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3124 rev $ctr32w, $rctr32w @ CTR block 4k+7
3125
3126 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3127 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3128
3129 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3130 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3131 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3132
3133 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3134
3135 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3136 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3137
3138 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3139 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3140
3141 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3142
3143 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3144 rev64 $res2b, $res2b @ GHASH block 4k+2
3145
3146 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3147
3148 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3149 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3150 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3151
3152 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3153
3154 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3155
3156 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3157 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3158
3159 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3160 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3161
3162 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3163
3164 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3165 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3166
3167 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3168
3169 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3170
3171 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3172 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3173
3174 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3175
3176 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3177
3178 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3179 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3180
3181 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3182
3183 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3184 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3185
3186 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3187
3188 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3189 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3190
3191 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3192
3193 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3194 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3195
3196 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3197
3198 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3199 movi $mod_constant.8b, #0xc2
3200
3201 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3202
3203 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3204 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3205
3206 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3207
3208 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3209 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3210
3211 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3212
3213 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3214 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3215
3216 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3217
3218 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3219 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3220
3221 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3222
3223 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3224 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3225
3226 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3227
3228 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3229 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
3230
3231 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3232 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3233
3234 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3235 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
3236 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3237
3238 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3239 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3240
3241 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
3242 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3243
3244 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3245 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3246
3247 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3248 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
3249
3250 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
3251 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
3252 rev $ctr32w, $rctr32w @ CTR block 4k+8
3253
3254 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3255 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3256
3257 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3258 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3259
3260 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3261 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3262
3263 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
3264 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3265 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
3266
3267 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3268 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
3269
3270 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3271
3272 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3273 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
3274
3275 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3276 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3277 rev64 $res1b, $res1b @ GHASH block 4k+5
3278
3279 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
3280 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3281
3282 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3283 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
3284
3285 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
3286 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
3287 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3288
3289 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
3290 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
3291 rev $ctr32w, $rctr32w @ CTR block 4k+9
3292
3293 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3294 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
3295 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3296
3297 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
3298 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
3299 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
3300
3301 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
3302 rev $ctr32w, $rctr32w @ CTR block 4k+10
3303 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
3304
3305 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3306 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
3307 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3308
3309 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
3310 rev64 $res0b, $res0b @ GHASH block 4k+4
3311 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
3312
3313 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
3314 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
3315 b.lt .L192_dec_main_loop
3316
3317 .L192_dec_prepretail: @ PREPRETAIL
3318 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3319 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3320 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3321
3322 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3323 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3324
3325 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3326 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3327
3328 eor $res0b, $res0b, $acc_lb @ PRE 1
3329 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3330
3331 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3332 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3333
3334 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3335 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3336
3337 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3338 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3339 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3340
3341 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3342 rev64 $res2b, $res2b @ GHASH block 4k+2
3343
3344 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3345 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3346 rev $ctr32w, $rctr32w @ CTR block 4k+7
3347
3348 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3349 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3350 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3351
3352 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3353 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3354 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3355
3356 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3357 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3358
3359 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3360 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3361 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3362
3363 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3364 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3365 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3366
3367 rev64 $res3b, $res3b @ GHASH block 4k+3
3368 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3369
3370 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3371 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3372
3373 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3374 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3375
3376 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3377 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3378
3379 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3380
3381 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3382 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3383
3384 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3385
3386 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3387 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3388
3389 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3390
3391 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3392
3393 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3394 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3395
3396 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3397 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3398
3399 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3400
3401 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3402 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3403
3404 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3405
3406 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3407 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3408
3409 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3410
3411 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3412 movi $mod_constant.8b, #0xc2
3413
3414 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3415
3416 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3417
3418 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3419 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3420
3421 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3422 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3423
3424 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3425
3426 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3427 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3428
3429 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3430
3431 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3432 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3433
3434 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3435
3436 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3437 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3438
3439 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3440
3441 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3442 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3443
3444 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3445
3446 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3447 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3448
3449 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3450
3451 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3452
3453 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3454
3455 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3456
3457 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3458 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3459
3460 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3461
3462 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3463
3464 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3465
3466 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3467 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3468
3469 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3470
3471 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3472
3473 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3474
3475 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3476
3477 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3478
3479 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3480
3481 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3482
3483 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3484
3485 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3486 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3487
3488 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3489
3490 aese $ctr0b, $rk11
3491 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3492
3493 aese $ctr2b, $rk11
3494
3495 aese $ctr1b, $rk11
3496
3497 aese $ctr3b, $rk11
3498
3499 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3500 .L192_dec_tail: @ TAIL
3501
3502 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3503 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
3504
3505 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
3506
3507 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3508
3509 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3510
3511 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3512
3513 cmp $main_end_input_ptr, #48
3514
3515 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3516
3517 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3518 b.gt .L192_dec_blocks_more_than_3
3519
3520 movi $acc_l.8b, #0
3521 movi $acc_h.8b, #0
3522
3523 mov $ctr3b, $ctr2b
3524 mov $ctr2b, $ctr1b
3525 sub $rctr32w, $rctr32w, #1
3526
3527 movi $acc_m.8b, #0
3528 cmp $main_end_input_ptr, #32
3529 b.gt .L192_dec_blocks_more_than_2
3530
3531 mov $ctr3b, $ctr1b
3532 cmp $main_end_input_ptr, #16
3533 sub $rctr32w, $rctr32w, #1
3534
3535 b.gt .L192_dec_blocks_more_than_1
3536
3537 sub $rctr32w, $rctr32w, #1
3538 b .L192_dec_blocks_less_than_1
3539 .L192_dec_blocks_more_than_3: @ blocks left > 3
3540 rev64 $res0b, $res1b @ GHASH final-3 block
3541 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
3542
3543 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
3544
3545 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3546
3547 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
3548
3549 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
3550 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
3551 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
3552
3553 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
3554
3555 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
3556 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3557
3558 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
3559
3560 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
3561 movi $t0.8b, #0 @ suppress further partial tag feed in
3562
3563 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
3564 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
3565 .L192_dec_blocks_more_than_2: @ blocks left > 2
3566
3567 rev64 $res0b, $res1b @ GHASH final-2 block
3568 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
3569
3570 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3571
3572 movi $t0.8b, #0 @ suppress further partial tag feed in
3573
3574 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
3575
3576 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
3577
3578 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3579
3580 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
3581
3582 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3583 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
3584
3585 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3586 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
3587
3588 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
3589
3590 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3591
3592 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3593 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
3594
3595 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
3596 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
3597 .L192_dec_blocks_more_than_1: @ blocks left > 1
3598
3599 rev64 $res0b, $res1b @ GHASH final-1 block
3600
3601 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3602 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
3603
3604 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
3605
3606 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3607
3608 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
3609 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
3610
3611 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3612
3613 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3614
3615 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3616 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
3617
3618 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3619 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
3620
3621 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3622
3623 movi $t0.8b, #0 @ suppress further partial tag feed in
3624 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3625 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
3626
3627 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
3628
3629 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
3630 .L192_dec_blocks_less_than_1: @ blocks left <= 1
3631
3632 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
3633 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
3634 and $bit_length, $bit_length, #127 @ bit_length %= 128
3635
3636 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3637
3638 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3639
3640 and $bit_length, $bit_length, #127 @ bit_length %= 128
3641 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
3642
3643 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
3644 cmp $bit_length, #64
3645
3646 csel $ctr32x, $rk12_l, $rk12_h, lt
3647 csel $ctr96_b64x, $rk12_h, xzr, lt
3648
3649 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
3650 and $output_l0, $output_l0, $ctr32x
3651 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
3652
3653 orr $output_l0, $output_l0, $end_input_ptr
3654 mov $ctr0.d[1], $ctr96_b64x
3655
3656 rev $ctr32w, $rctr32w
3657
3658 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3659 str $ctr32w, [$counter, #12] @ store the updated counter
3660
3661 rev64 $res0b, $res1b @ GHASH final block
3662
3663 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3664 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3665
3666 and $output_h0, $output_h0, $ctr96_b64x
3667
3668 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
3669 mov $t0d, $res0.d[1] @ GHASH final block - mid
3670
3671 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3672
3673 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3674
3675 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3676
3677 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3678
3679 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3680
3681 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
3682 movi $mod_constant.8b, #0xc2
3683
3684 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3685
3686 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3687
3688 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3689
3690 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3691 orr $output_h0, $output_h0, $main_end_input_ptr
3692 stp $output_l0, $output_h0, [$output_ptr]
3693
3694 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3695
3696 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3697
3698 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3699
3700 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3701
3702 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3703
3704 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3705
3706 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3707 ext $acc_lb, $acc_lb, $acc_lb, #8
3708 rev64 $acc_lb, $acc_lb
3709 mov x0, $len
3710 st1 { $acc_l.16b }, [$current_tag]
3711
3712 ldp x21, x22, [sp, #16]
3713 ldp x23, x24, [sp, #32]
3714 ldp d8, d9, [sp, #48]
3715 ldp d10, d11, [sp, #64]
3716 ldp d12, d13, [sp, #80]
3717 ldp d14, d15, [sp, #96]
3718 ldp x19, x20, [sp], #112
3719 ret
3720
3721.L192_dec_ret:
3722 mov w0, #0x0
3723 ret
3724.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
3725___
3726}
3727
3728{
# Register aliases interpolated into the assembly heredoc that follows.
# Each Perl variable holds the literal register name that is pasted into the
# generated code.  Several names intentionally map to the SAME physical
# register; two aliases of one register must not be live at the same time.
#
# General-purpose registers: input bounds and first plaintext pair (x4..x7).
3729my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# Plaintext low/high halves for blocks 1..3 (x19..x24).
3730my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# Output halves for blocks 1..3 - the same registers x19..x24 as $input_l1..$input_h3.
3731my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# Output halves for block 0 - the same registers x6/x7 as $input_l0/$input_h0.
3732my ($output_l0,$output_h0)=map("x$_",(6..7));
3733
# w9 is the 32-bit view of $ctr32x (x9).
3734my $ctr32w="w9";
# x9..x15: counter scratch, low 64 bits of the counter block, top 32 bits,
# byte-reversed 32-bit counter, the two halves of round key 14 (kept in
# general registers), and the byte length.
3735my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
# 32-bit views of $ctr96_t32x (x11) and $rctr32x (x12).
3736my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
3737
# Vector registers v0..v3 hold the four counter/keystream blocks and v4..v7
# the four result blocks; the next four lines name the .16b, bare, d and q
# views of those same registers.
3738my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
3739my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
3740my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
3741my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
3742
# GHASH accumulators (high / mid / low) in v9..v11, in three views.
3743my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
3744my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
3745my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
3746
# Hash key powers h1..h4 in v12..v15; $h12k (v16) and $h34k (v17) hold the
# combined halves built with trn1/trn2/eor in the kernel prologue.
3747my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
3748my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
3749my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
3750
# GHASH temporaries.  Only four physical registers back t0..t9:
#   v8: t0, t2, t6 (also $mod_constant)
#   v4: t1, t3, t4, t9 (also $res0 / $ctr_t0)
#   v5: t5, t7 (also $res1 / $ctr_t1)
#   v6: t8 (also $res2 / $ctr_t2)
3751my $t0="v8";
3752my $t0d="d8";
3753my $t1="v4";
3754my $t1d="d4";
3755my $t2="v8";
3756my $t2d="d8";
3757my $t3="v4";
3758my $t3d="d4";
3759my $t4="v4";
3760my $t4d="d4";
3761my $t5="v5";
3762my $t5d="d5";
3763my $t6="v8";
3764my $t6d="d8";
3765my $t7="v5";
3766my $t7d="d5";
3767my $t8="v6";
3768my $t8d="d6";
3769my $t9="v4";
3770my $t9d="d4";
3771
# Staging registers used to move plaintext from general registers into the
# vector unit; v4..v7, i.e. the same registers as $res0..$res3.
3772my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
3773my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
3774my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
3775
# Reduction ("MODULO") constant lives in v8 (shared with t0/t2/t6);
# $mod_t is v7 (shared with $res3 / $ctr_t3).
3776my $mod_constantd="d8";
3777my $mod_constant="v8";
3778my $mod_t="v7";
3779
# Round keys 0..13 occupy v18..v31 (.16b and q views).
3780my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
3781my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
# Alternate views of v20/v21/v22, the registers that also back $rk2/$rk3/$rk4.
# NOTE(review): presumably reused as GHASH scratch once those round keys are
# no longer needed (tail code) - confirm against the 256-bit tail.
3782my $rk2q1="v20.1q";
3783my $rk3q1="v21.1q";
3784my $rk4v="v22";
3785my $rk4d="d22";
3786
3787#########################################################################################
3788# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
3789# size_t len,
3790# unsigned char *out,
3791# const void *key,
3792# unsigned char ivec[16],
3793# u64 *Xi);
3794#
3795$code.=<<___;
3796.global aes_gcm_enc_256_kernel
3797.type aes_gcm_enc_256_kernel,%function
3798.align 4
3799aes_gcm_enc_256_kernel:
3800 cbz x1, .L256_enc_ret
3801 stp x19, x20, [sp, #-112]!
3802 mov x16, x4
3803 mov x8, x5
3804 stp x21, x22, [sp, #16]
3805 stp x23, x24, [sp, #32]
3806 stp d8, d9, [sp, #48]
3807 stp d10, d11, [sp, #64]
3808 stp d12, d13, [sp, #80]
3809 stp d14, d15, [sp, #96]
3810
3811 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3812 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
3813 mov $len, $main_end_input_ptr
3814 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
3815
3816 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
3817 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3818
3819 ldr $rk0q, [$cc, #0] @ load rk0
3820 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3821
3822 ldr $rk7q, [$cc, #112] @ load rk7
3823 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3824
3825 lsr $rctr32x, $ctr96_t32x, #32
3826 fmov $ctr2d, $ctr96_b64x @ CTR block 2
3827 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3828
3829 rev $rctr32w, $rctr32w @ rev_ctr32
3830 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3831 fmov $ctr1d, $ctr96_b64x @ CTR block 1
3832
3833 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3834 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
3835
3836 rev $ctr32w, $rctr32w @ CTR block 1
3837 fmov $ctr3d, $ctr96_b64x @ CTR block 3
3838
3839 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
3840 add $rctr32w, $rctr32w, #1 @ CTR block 1
3841 ldr $rk1q, [$cc, #16] @ load rk1
3842
3843 fmov $ctr1.d[1], $ctr32x @ CTR block 1
3844 rev $ctr32w, $rctr32w @ CTR block 2
3845 add $rctr32w, $rctr32w, #1 @ CTR block 2
3846
3847 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
3848 ldr $rk2q, [$cc, #32] @ load rk2
3849
3850 fmov $ctr2.d[1], $ctr32x @ CTR block 2
3851 rev $ctr32w, $rctr32w @ CTR block 3
3852
3853 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3854 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
3855
3856 fmov $ctr3.d[1], $ctr32x @ CTR block 3
3857
3858 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3859 ldr $rk3q, [$cc, #48] @ load rk3
3860
3861 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3862 ldr $rk6q, [$cc, #96] @ load rk6
3863
3864 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3865 ldr $rk5q, [$cc, #80] @ load rk5
3866
3867 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
3868 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3869 ext $h3b, $h3b, $h3b, #8
3870
3871 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3872 ldr $rk13q, [$cc, #208] @ load rk13
3873
3874 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3875 ldr $rk4q, [$cc, #64] @ load rk4
3876
3877 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3878 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3879 ext $h2b, $h2b, $h2b, #8
3880
3881 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3882 ldr $rk12q, [$cc, #192] @ load rk12
3883
3884 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3885 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3886 ext $h4b, $h4b, $h4b, #8
3887
3888 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3889 ldr $rk11q, [$cc, #176] @ load rk11
3890
3891 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3892 ldr $rk8q, [$cc, #128] @ load rk8
3893
3894 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3895 add $rctr32w, $rctr32w, #1 @ CTR block 3
3896
3897 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
3898 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
3899
3900 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
3901 ld1 { $acc_lb}, [$current_tag]
3902 ext $acc_lb, $acc_lb, $acc_lb, #8
3903 rev64 $acc_lb, $acc_lb
3904
3905 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3906
3907 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3908
3909 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3910
3911 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3912
3913 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3914
3915 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3916
3917 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3918
3919 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3920
3921 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3922 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
3923
3924 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3925 ldr $rk9q, [$cc, #144] @ load rk9
3926
3927 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3928 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3929 ext $h1b, $h1b, $h1b, #8
3930
3931 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3932 ldr $rk10q, [$cc, #160] @ load rk10
3933
3934 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3935 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
3936
3937 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3938
3939 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3940
3941 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3942 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
3943
3944 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3945
3946 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3947
3948 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3949
3950 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3951
3952 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3953
3954 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3955
3956 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3957
3958 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3959
3960 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3961
3962 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3963
3964 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3965
3966 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
3967
3968 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
3969
3970 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3971
3972 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
3973
3974 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
3975
3976 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
3977 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3978
3979 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
3980
3981 aese $ctr2b, $rk13 @ AES block 2 - round 13
3982 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3983
3984 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
3985
3986 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
3987
3988 aese $ctr1b, $rk13 @ AES block 1 - round 13
3989
3990 aese $ctr0b, $rk13 @ AES block 0 - round 13
3991
3992 aese $ctr3b, $rk13 @ AES block 3 - round 13
3993 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3994 b.ge .L256_enc_tail @ handle tail
3995
3996 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
3997
3998 rev $ctr32w, $rctr32w @ CTR block 4
3999 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
4000
4001 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
4002
4003 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
4004 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4005
4006 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
4007 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
4008
4009 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
4010 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
4011
4012 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
4013 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
4014 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
4015
4016 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4017 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
4018 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
4019
4020 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
4021 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
4022
4023 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
4024 add $rctr32w, $rctr32w, #1 @ CTR block 4
4025
4026 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4027 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
4028 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
4029
4030 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
4031
4032 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
4033 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4034
4035 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4036 rev $ctr32w, $rctr32w @ CTR block 5
4037 add $rctr32w, $rctr32w, #1 @ CTR block 5
4038
4039 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
4040 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4041 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4042
4043 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4044 rev $ctr32w, $rctr32w @ CTR block 6
4045 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
4046
4047 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
4048 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4049 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
4050
4051 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
4052
4053 add $rctr32w, $rctr32w, #1 @ CTR block 6
4054 fmov $ctr2d, $ctr96_b64x @ CTR block 6
4055
4056 fmov $ctr2.d[1], $ctr32x @ CTR block 6
4057 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
4058 rev $ctr32w, $rctr32w @ CTR block 7
4059
4060 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
4061
4062 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result (keystream xor round-14-adjusted plaintext)
4063 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
4064 b.ge .L256_enc_prepretail @ input_ptr >= main_end_input_ptr (cmp above) - skip main loop, do prepretail; target must carry the .L local-label prefix like every other label in this kernel
4065
4066 .L256_enc_main_loop: @ main loop start
4067 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4068 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4069
4070 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4071 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4072
4073 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4074 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4075
4076 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4077 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4078
4079 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4080 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
4081
4082 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4083 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
4084
4085 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4086 eor $res0b, $res0b, $acc_lb @ PRE 1
4087
4088 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4089
4090 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4091 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
4092
4093 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4094 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4095
4096 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4097 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
4098 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4099
4100 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4101 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4102
4103 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4104
4105 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4106 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4107
4108 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4109
4110 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4111 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4112
4113 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4114
4115 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4116 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4117
4118 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4119
4120 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4121 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4122
4123 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4124
4125 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4126 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4127
4128 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4129
4130 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4131 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4132
4133 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4134 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4135
4136 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4137
4138 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4139 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4140
4141 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4142
4143 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4144
4145 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4146
4147 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4148 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4149
4150 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4151
4152 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4153
4154 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4155
4156 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4157 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4158
4159 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4160
4161 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4162
4163 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4164
4165 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4166 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4167
4168 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4169 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
4170
4171 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4172 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4173
4174 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4175 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4176
4177 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4178
4179 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4180 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4181
4182 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4183 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
4184
4185 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4186 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4187
4188 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4189 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
4190
4191 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4192 movi $mod_constant.8b, #0xc2
4193
4194 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4195 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4196 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
4197
4198 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4199 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
4200
4201 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4202 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4203
4204 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4205 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4206
4207 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4208
4209 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4210 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4211
4212 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4213 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4214
4215 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4216 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4217
4218 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4219 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4220
4221 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4222 rev $ctr32w, $rctr32w @ CTR block 4k+8
4223 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4224
4225 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4226 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4227
4228 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4229 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4230
4231 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4232 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4233
4234 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4235 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
4236 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
4237
4238 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4239 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
4240
4241 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4242 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
4243
4244 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4245 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
4246
4247 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4248 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4249 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4250
4251 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4252 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
4253
4254 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4255 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
4256
4257 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
4258 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4259
4260 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
4261
4262 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4263 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4264 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
4265
4266 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
4267 rev $ctr32w, $rctr32w @ CTR block 4k+9
4268 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
4269
4270 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
4271 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
4272 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
4273
4274 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4275 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
4276
4277 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4278 rev $ctr32w, $rctr32w @ CTR block 4k+10
4279 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
4280
4281 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
4282 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4283 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
4284
4285 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4286 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
4287 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
4288
4289 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4290 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
4291 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
4292
4293 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
4294 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
4295 rev $ctr32w, $rctr32w @ CTR block 4k+11
4296
4297 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4298 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
4299
4300 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
4301 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
4302 b.lt .L256_enc_main_loop @ loop while input_ptr < main_end_input_ptr (flags from LOOP CONTROL cmp)
4303
4304 .L256_enc_prepretail: @ PREPRETAIL
4305 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4306 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4307
4308 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4309 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4310
4311 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4312 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4313
4314 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4315 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4316
4317 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4318
4319 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4320
4321 eor $res0b, $res0b, $acc_lb @ PRE 1
4322 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4323
4324 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4325
4326 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4327 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4328
4329 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4330
4331 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4332 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4333
4334 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4335
4336 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4337
4338 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4339 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4340
4341 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4342
4343 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4344
4345 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4346
4347 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4348
4349 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4350
4351 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4352
4353 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4354
4355 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4356 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4357
4358 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4359 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4360
4361 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4362
4363 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4364 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4365
4366 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4367 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4368
4369 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4370
4371 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4372 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4373 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4374
4375 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4376
4377 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4378
4379 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4380 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4381
4382 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4383
4384 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4385 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4386
4387 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4388
4389 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4390 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4391
4392 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4393
4394 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4395
4396 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4397
4398 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4399
4400 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4401
4402 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4403 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4404
4405 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4406
4407 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4408
4409 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4410
4411 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4412 movi $mod_constant.8b, #0xc2
4413
4414 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4415
4416 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4417 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4418
4419 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4420
4421 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4422 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4423
4424 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4425 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4426
4427 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4428
4429 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4430
4431 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4432
4433 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4434 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4435
4436 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4437
4438 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
4439
4440 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
4441 ext $acc_hb, $acc_hb, $acc_hb, #8
4442
4443 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4444
4445 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4446 eor $acc_mb, $acc_mb, $acc_lb
4447
4448 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4449
4450 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4451
4452 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4453
4454 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4455 eor $acc_mb, $acc_mb, $t1.16b
4456
4457 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4458
4459 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4460
4461 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4462
4463 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4464 eor $acc_mb, $acc_mb, $acc_hb
4465
4466 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4467
4468 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4469
4470 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4471
4472 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
4473
4474 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4475 ext $acc_mb, $acc_mb, $acc_mb, #8
4476
4477 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4478
4479 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4480 eor $acc_lb, $acc_lb, $t1.16b
4481
4482 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4483
4484 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4485
4486 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4487
4488 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4489 eor $acc_lb, $acc_lb, $acc_mb
4490 .L256_enc_tail: @ TAIL
4491
4492 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4493 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4494 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
4495
4496 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4497 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4498
4499 cmp $main_end_input_ptr, #48
4500 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4501
4502 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4503
4504 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4505 b.gt .L256_enc_blocks_more_than_3
4506
4507 cmp $main_end_input_ptr, #32
4508 mov $ctr3b, $ctr2b
4509 movi $acc_l.8b, #0
4510
4511 movi $acc_h.8b, #0
4512 sub $rctr32w, $rctr32w, #1
4513
4514 mov $ctr2b, $ctr1b
4515 movi $acc_m.8b, #0
4516 b.gt .L256_enc_blocks_more_than_2
4517
4518 mov $ctr3b, $ctr1b
4519 sub $rctr32w, $rctr32w, #1
4520 cmp $main_end_input_ptr, #16
4521
4522 b.gt .L256_enc_blocks_more_than_1
4523
4524 sub $rctr32w, $rctr32w, #1
4525 b .L256_enc_blocks_less_than_1
4526 .L256_enc_blocks_more_than_3: @ blocks left > 3
4527 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
4528
4529 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
4530
4531 rev64 $res0b, $res1b @ GHASH final-3 block
4532
4533 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
4534 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4535
4536 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
4537
4538 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
4539 fmov $res1d, $input_l0 @ AES final-2 block - mov low
4540
4541 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
4542
4543 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4544 movi $t0.8b, #0 @ suppress further partial tag feed in
4545
4546 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
4547
4548 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
4549
4550 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
4551
4552 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
4553 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
4554 .L256_enc_blocks_more_than_2: @ blocks left > 2
4555
4556 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
4557
4558 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
4559
4560 rev64 $res0b, $res1b @ GHASH final-2 block
4561
4562 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
4563 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4564
4565 fmov $res1d, $input_l0 @ AES final-1 block - mov low
4566 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
4567
4568 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
4569
4570 movi $t0.8b, #0 @ suppress further partial tag feed in
4571
4572 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4573 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
4574
4575 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4576
4577 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4578
4579 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
4580
4581 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4582
4583 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
4584
4585 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4586
4587 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4588 .L256_enc_blocks_more_than_1: @ blocks left > 1
4589
4590 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
4591
4592 rev64 $res0b, $res1b @ GHASH final-1 block
4593
4594 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
4595
4596 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4597
4598 movi $t0.8b, #0 @ suppress further partial tag feed in
4599
4600 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
4601 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
4602
4603 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4604 eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
4605
4606 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4607
4608 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4609
4610 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4611 fmov $res1d, $input_l0 @ AES final block - mov low
4612
4613 fmov $res1.d[1], $input_h0 @ AES final block - mov high
4614
4615 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4616
4617 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4618
4619 eor $res1b, $res1b, $ctr3b @ AES final block - result
4620 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4621
4622 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4623 .L256_enc_blocks_less_than_1: @ blocks left <= 1
4624
4625 and $bit_length, $bit_length, #127 @ bit_length %= 128
4626
4627 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
4628 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4629
4630 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4631 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4632
4633 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
4634 and $bit_length, $bit_length, #127 @ bit_length %= 128
4635
4636 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
4637 cmp $bit_length, #64
4638
4639 csel $input_l0, $rk14_l, $rk14_h, lt
4640 csel $input_h0, $rk14_h, xzr, lt
4641
4642 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
4643
4644 fmov $ctr0.d[1], $input_h0
4645
4646 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4647
4648 rev64 $res0b, $res1b @ GHASH final block
4649
4650 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4651
4652 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4653
4654 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4655 mov $t0d, $res0.d[1] @ GHASH final block - mid
4656 rev $ctr32w, $rctr32w
4657
4658 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4659
4660 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4661 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4662
4663 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4664
4665 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4666
4667 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4668 movi $mod_constant.8b, #0xc2
4669
4670 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4671
4672 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4673
4674 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4675
4676 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4677
4678 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4679
4680 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4681
4682 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4683
4684 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4685
4686 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4687
4688 str $ctr32w, [$counter, #12] @ store the updated counter
4689
4690 st1 { $res1b}, [$output_ptr] @ store all 16B
4691 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4692
4693 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4694 ext $acc_lb, $acc_lb, $acc_lb, #8
4695 rev64 $acc_lb, $acc_lb
4696 mov x0, $len
4697 st1 { $acc_l.16b }, [$current_tag]
4698
4699 ldp x21, x22, [sp, #16]
4700 ldp x23, x24, [sp, #32]
4701 ldp d8, d9, [sp, #48]
4702 ldp d10, d11, [sp, #64]
4703 ldp d12, d13, [sp, #80]
4704 ldp d14, d15, [sp, #96]
4705 ldp x19, x20, [sp], #112
4706 ret
4707
4708.L256_enc_ret:
4709 mov w0, #0x0
4710 ret
4711.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
4712___
4713
4714{
4715my $t8="v4";
4716my $t8d="d4";
4717my $t9="v6";
4718my $t9d="d6";
4719#########################################################################################
4720# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
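4721# size_t len, /* NOTE(review): appears to be a length in bits - the prologue derives byte_len from \$bit_length via lsr #3; confirm against callers */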
4722# unsigned char *out,
4723# const void *key,
4724# unsigned char ivec[16],
4725# u64 *Xi);
4726#
4727$code.=<<___;
4728.global aes_gcm_dec_256_kernel
4729.type aes_gcm_dec_256_kernel,%function
4730.align 4
4731aes_gcm_dec_256_kernel:
4732 cbz x1, .L256_dec_ret
4733 stp x19, x20, [sp, #-112]!
4734 mov x16, x4
4735 mov x8, x5
4736 stp x21, x22, [sp, #16]
4737 stp x23, x24, [sp, #32]
4738 stp d8, d9, [sp, #48]
4739 stp d10, d11, [sp, #64]
4740 stp d12, d13, [sp, #80]
4741 stp d14, d15, [sp, #96]
4742
4743 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
4744 mov $len, $main_end_input_ptr
4745 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
4746
4747 ldr $rk8q, [$cc, #128] @ load rk8
4748 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4749
4750 ldr $rk7q, [$cc, #112] @ load rk7
4751 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4752
4753 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
4754 ldr $rk6q, [$cc, #96] @ load rk6
4755
4756 lsr $rctr32x, $ctr96_t32x, #32
4757 ldr $rk5q, [$cc, #80] @ load rk5
4758 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4759
4760 ldr $rk3q, [$cc, #48] @ load rk3
4761 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4762 rev $rctr32w, $rctr32w @ rev_ctr32
4763
4764 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
4765 fmov $ctr3d, $ctr96_b64x @ CTR block 3
4766
4767 rev $ctr32w, $rctr32w @ CTR block 1
4768 add $rctr32w, $rctr32w, #1 @ CTR block 1
4769 fmov $ctr1d, $ctr96_b64x @ CTR block 1
4770
4771 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
4772 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
4773
4774 fmov $ctr1.d[1], $ctr32x @ CTR block 1
4775 rev $ctr32w, $rctr32w @ CTR block 2
4776 add $rctr32w, $rctr32w, #1 @ CTR block 2
4777
4778 fmov $ctr2d, $ctr96_b64x @ CTR block 2
4779 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
4780
4781 fmov $ctr2.d[1], $ctr32x @ CTR block 2
4782 rev $ctr32w, $rctr32w @ CTR block 3
4783
4784 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
4785 ldr $rk0q, [$cc, #0] @ load rk0
4786
4787 fmov $ctr3.d[1], $ctr32x @ CTR block 3
4788 add $rctr32w, $rctr32w, #1 @ CTR block 3
4789
4790 ldr $rk4q, [$cc, #64] @ load rk4
4791
4792 ldr $rk13q, [$cc, #208] @ load rk13
4793
4794 ldr $rk1q, [$cc, #16] @ load rk1
4795
4796 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4797 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4798 ext $h3b, $h3b, $h3b, #8
4799
4800 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4801 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4802 ext $h4b, $h4b, $h4b, #8
4803
4804 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4805 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4806 ext $h2b, $h2b, $h2b, #8
4807
4808 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4809 ldr $rk2q, [$cc, #32] @ load rk2
4810
4811 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4812 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
4813
4814 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4815 ld1 { $acc_lb}, [$current_tag]
4816 ext $acc_lb, $acc_lb, $acc_lb, #8
4817 rev64 $acc_lb, $acc_lb
4818
4819 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4820 ldr $rk9q, [$cc, #144] @ load rk9
4821
4822 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4823 ldr $rk12q, [$cc, #192] @ load rk12
4824
4825 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4826 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4827 ext $h1b, $h1b, $h1b, #8
4828
4829 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4830 ldr $rk10q, [$cc, #160] @ load rk10
4831
4832 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4833
4834 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4835
4836 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4837
4838 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4839
4840 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4841 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
4842
4843 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4844
4845 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4846
4847 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
4848
4849 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4850
4851 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4852
4853 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
4854
4855 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
4856
4857 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
4858
4859 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
4860
4861 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
4862
4863 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
4864
4865 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
4866
4867 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
4868
4869 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
4870
4871 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
4872
4873 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
4874
4875 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
4876
4877 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
4878
4879 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
4880
4881 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
4882
4883 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
4884
4885 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
4886 ldr $rk11q, [$cc, #176] @ load rk11
4887
4888 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
4889
4890 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
4891
4892 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
4893
4894 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
4895
4896 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
4897
4898 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
4899
4900 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
4901
4902 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
4903
4904 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
4905
4906 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
4907
4908 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
4909
4910 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
4911
4912 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
4913
4914 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
4915 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
4916
4917 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
4918
4919 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
4920
4921 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
4922
4923 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
4924 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
4925
4926 aese $ctr1b, $rk13 @ AES block 1 - round 13
4927
4928 aese $ctr2b, $rk13 @ AES block 2 - round 13
4929 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
4930
4931 aese $ctr3b, $rk13 @ AES block 3 - round 13
4932
4933 aese $ctr0b, $rk13 @ AES block 0 - round 13
4934 b.ge .L256_dec_tail @ handle tail
4935
4936 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
4937
4938 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
4939
4940 rev $ctr32w, $rctr32w @ CTR block 4
4941
4942 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
4943
4944 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
4945 rev64 $res1b, $res1b @ GHASH block 1
4946 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
4947
4948 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
4949
4950 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
4951 rev64 $res0b, $res0b @ GHASH block 0
4952 add $rctr32w, $rctr32w, #1 @ CTR block 4
4953
4954 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4955 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4956
4957 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4958 rev $ctr32w, $rctr32w @ CTR block 5
4959 add $rctr32w, $rctr32w, #1 @ CTR block 5
4960
4961 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
4962
4963 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4964 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
4965 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
4966
4967 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
4968 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
4969 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4970
4971 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
4972 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4973
4974 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4975 rev $ctr32w, $rctr32w @ CTR block 6
4976 add $rctr32w, $rctr32w, #1 @ CTR block 6
4977
4978 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
4979 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4980
4981 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
4982 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
4983
4984 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
4985 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4986 b.ge .L256_dec_prepretail @ do prepretail
4987
4988 .L256_dec_main_loop: @ main loop start
4989 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
4990 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4991 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
4992
4993 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4994 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
4995
4996 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4997 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
4998
4999 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5000 eor $res0b, $res0b, $acc_lb @ PRE 1
5001 rev $ctr32w, $rctr32w @ CTR block 4k+7
5002
5003 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5004 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5005
5006 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5007 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5008
5009 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5010 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5011 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5012
5013 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5014 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5015
5016 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5017 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5018
5019 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5020 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5021
5022 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5023 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5024
5025 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5026 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5027
5028 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5029 rev64 $res2b, $res2b @ GHASH block 4k+2
5030
5031 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5032 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5033
5034 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5035 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5036
5037 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5038
5039 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5040
5041 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5042 rev64 $res3b, $res3b @ GHASH block 4k+3
5043
5044 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5045 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5046
5047 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5048 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5049 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5050
5051 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5052
5053 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5054 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5055
5056 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5057 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5058
5059 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5060 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5061
5062 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5063 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5064
5065 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5066 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5067
5068 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5069
5070 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5071 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5072
5073 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5074
5075 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5076 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5077
5078 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5079 rev $ctr32w, $rctr32w @ CTR block 4k+8
5080
5081 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5082 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5083
5084 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5085 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
5086
5087 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5088
5089 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5090 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5091
5092 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5093
5094 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5095 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5096
5097 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5098
5099 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5100
5101 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5102 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5103
5104 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5105
5106 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5107 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
5108 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5109
5110 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5111
5112 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5113 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5114
5115 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5116
5117 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5118 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5119
5120 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5121
5122 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5123 movi $mod_constant.8b, #0xc2
5124
5125 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5126 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5127
5128 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5129
5130 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5131 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5132
5133 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5134 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5135
5136 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5137
5138 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5139 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5140
5141 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5142 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
5143
5144 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5145 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5146
5147 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5148 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5149
5150 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5151 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
5152
5153 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5154 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
5155
5156 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5157 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5158
5159 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5160 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5161
5162 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5163 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
5164
5165 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5166 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
5167
5168 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5169 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5170
5171 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5172 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5173
5174 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5175 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
5176 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5177
5178 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5179 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
5180
5181 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5182 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
5183
5184 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5185 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
5186 rev $ctr32w, $rctr32w @ CTR block 4k+9
5187
5188 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5189 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
5190 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5191
5192 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
5193
5194 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5195 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5196
5197 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
5198 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
5199 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5200
5201 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5202 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
5203
5204 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
5205 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5206
5207 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
5208 rev $ctr32w, $rctr32w @ CTR block 4k+10
5209 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
5210
5211 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5212 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
5213
5214 rev64 $res1b, $res1b @ GHASH block 4k+5
5215 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
5216 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
5217
5218 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
5219 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
5220
5221 rev64 $res0b, $res0b @ GHASH block 4k+4
5222 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5223 b.lt .L256_dec_main_loop
5224
5225
5226 .L256_dec_prepretail: @ PREPRETAIL
5227 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5228 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
5229 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
5230
5231 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5232 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5233
5234 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5235 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5236
5237 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5238 rev $ctr32w, $rctr32w @ CTR block 4k+7
5239 eor $res0b, $res0b, $acc_lb @ PRE 1
5240
5241 rev64 $res2b, $res2b @ GHASH block 4k+2
5242 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5243 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5244
5245 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5246 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5247
5248 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5249 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5250 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5251
5252 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5253 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5254
5255 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5256 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5257
5258 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5259 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5260
5261 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5262
5263 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5264 rev64 $res3b, $res3b @ GHASH block 4k+3
5265
5266 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5267
5268 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5269 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5270
5271 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5272
5273 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5274 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5275
5276 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5277
5278 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5279 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5280
5281 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5282
5283 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5284 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5285
5286 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5287 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5288
5289 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5290
5291 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5292
5293 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5294 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5295
5296 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5297
5298 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5299 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5300
5301 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5302
5303 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5304 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5305
5306 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5307
5308 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5309 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5310
5311 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5312
5313 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5314 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5315
5316 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5317
5318 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5319 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5320
5321 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5322
5323 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5324
5325 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5326 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5327
5328 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5329
5330 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5331 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5332
5333 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5334
5335 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5336 movi $mod_constant.8b, #0xc2
5337
5338 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5339 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5340
5341 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5342
5343 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5344 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5345
5346 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5347
5348 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5349 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5350
5351 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5352
5353 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5354 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5355
5356 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5357
5358 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5359 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5360
5361 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5362
5363 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5364 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5365
5366 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5367
5368 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5369 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5370
5371 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5372
5373 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5374 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5375
5376 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5377
5378 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5379
5380 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5381 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5382
5383 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5384 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5385
5386 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5387 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5388
5389 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5390 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5391
5392 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5393 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5394
5395 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5396
5397 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5398 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5399
5400 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5401 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5402
5403 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5404 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5405
5406 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5407 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5408
5409 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5410 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5411
5412 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5413
5414 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5415
5416 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5417
5418 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5419 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5420 .L256_dec_tail: @ TAIL
5421
5422 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5423 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
5424
5425 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
5426
5427 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5428
5429 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5430 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5431
5432 cmp $main_end_input_ptr, #48
5433
5434 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5435
5436 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5437 b.gt .L256_dec_blocks_more_than_3
5438
5439 sub $rctr32w, $rctr32w, #1
5440 mov $ctr3b, $ctr2b
5441 movi $acc_m.8b, #0
5442
5443 movi $acc_l.8b, #0
5444 cmp $main_end_input_ptr, #32
5445
5446 movi $acc_h.8b, #0
5447 mov $ctr2b, $ctr1b
5448 b.gt .L256_dec_blocks_more_than_2
5449
5450 sub $rctr32w, $rctr32w, #1
5451
5452 mov $ctr3b, $ctr1b
5453 cmp $main_end_input_ptr, #16
5454 b.gt .L256_dec_blocks_more_than_1
5455
5456 sub $rctr32w, $rctr32w, #1
5457 b .L256_dec_blocks_less_than_1
5458 .L256_dec_blocks_more_than_3: @ blocks left > 3
5459 rev64 $res0b, $res1b @ GHASH final-3 block
5460 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
5461
5462 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
5463
5464 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
5465
5466 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5467
5468 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
5469
5470 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
5471
5472 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
5473
5474 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
5475
5476 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5477
5478 movi $t0.8b, #0 @ suppress further partial tag feed in
5479
5480 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
5481
5482 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
5483 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
5484
5485 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
5486 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
5487 .L256_dec_blocks_more_than_2: @ blocks left > 2
5488
5489 rev64 $res0b, $res1b @ GHASH final-2 block
5490 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
5491
5492 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5493 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
5494
5495 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
5496
5497 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
5498
5499 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
5500
5501 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5502
5503 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5504 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
5505
5506 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
5507 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
5508 movi $t0.8b, #0 @ suppress further partial tag feed in
5509
5510 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
5511
5512 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5513 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
5514
5515 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
5516 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
5517 .L256_dec_blocks_more_than_1: @ blocks left > 1
5518
5519 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
5520 rev64 $res0b, $res1b @ GHASH final-1 block
5521
5522 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
5523
5524 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5525 movi $t0.8b, #0 @ suppress further partial tag feed in
5526
5527 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
5528
5529 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
5530
5531 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
5532
5533 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
5534
5535 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
5536 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
5537
5538 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
5539
5540 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
5541
5542 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
5543 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
5544
5545 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
5546
5547 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
5548
5549 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
5550 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
5551 .L256_dec_blocks_less_than_1: @ blocks left <= 1
5552
5553 and $bit_length, $bit_length, #127 @ bit_length %= 128
5554 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
5555
5556 sub $bit_length, $bit_length, #128 @ bit_length -= 128
5557 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
5558
5559 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5560 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
5561
5562 and $bit_length, $bit_length, #127 @ bit_length %= 128
5563
5564 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
5565 cmp $bit_length, #64
5566
5567 csel $ctr32x, $rk14_l, $rk14_h, lt
5568 csel $ctr96_b64x, $rk14_h, xzr, lt
5569
5570 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
5571 and $output_l0, $output_l0, $ctr32x
5572
5573 mov $ctr0.d[1], $ctr96_b64x
5574 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
5575
5576 rev $ctr32w, $rctr32w
5577
5578 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
5579
5580 orr $output_l0, $output_l0, $end_input_ptr
5581
5582 and $output_h0, $output_h0, $ctr96_b64x
5583
5584 orr $output_h0, $output_h0, $main_end_input_ptr
5585
5586 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
5587
5588 rev64 $res0b, $res1b @ GHASH final block
5589
5590 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5591
5592 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
5593
5594 mov $t0d, $res0.d[1] @ GHASH final block - mid
5595
5596 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
5597
5598 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
5599
5600 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
5601
5602 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
5603
5604 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
5605
5606 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
5607 movi $mod_constant.8b, #0xc2
5608
5609 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5610
5611 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5612
5613 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5614
5615 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5616
5617 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5618
5619 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5620
5621 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5622
5623 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5624
5625 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5626
5627 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5628
5629 stp $output_l0, $output_h0, [$output_ptr]
5630
5631 str $ctr32w, [$counter, #12] @ store the updated counter
5632
5633 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5634 ext $acc_lb, $acc_lb, $acc_lb, #8
5635 rev64 $acc_lb, $acc_lb
5636 mov x0, $len
5637 st1 { $acc_l.16b }, [$current_tag]
5638
5639 ldp x21, x22, [sp, #16]
5640 ldp x23, x24, [sp, #32]
5641 ldp d8, d9, [sp, #48]
5642 ldp d10, d11, [sp, #64]
5643 ldp d12, d13, [sp, #80]
5644 ldp d14, d15, [sp, #96]
5645 ldp x19, x20, [sp], #112
5646 ret
5647
5648.L256_dec_ret:
5649 mov w0, #0x0
5650 ret
5651.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
5652___
5653}
5654}
5655
# Append the trailer of the generated assembly to $code: an identification
# string, an alignment directive and an #endif line.  The conditional that
# this #endif closes is emitted elsewhere in the file (not visible here).
$code .= <<"___";
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
5661
5662if ($flavour =~ /64/) { ######## 64-bit code
# Translate an operand pair written as "qD#lo|hi, qS#lo|hi" into an
# "ins vX.d[i],vY.d[j]" instruction string.
#
#   - q register numbers below 8 are kept, numbers 8 and up are shifted by 8
#     (q8 becomes v16, q9 becomes v17, ...);
#   - "lo" selects lane 0, anything else ("hi") selects lane 1.
#
# Returns the instruction text, or a false value when the operand string
# does not have the expected shape.
# NOTE(review): no caller of this helper is visible in this part of the file.
sub unvmov {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o;
    return $matched unless $matched;

    my ($dst_q, $dst_half, $src_q, $src_half) = ($1, $2, $3, $4);

    my $dst_v    = $dst_q < 8 ? $dst_q : $dst_q + 8;
    my $src_v    = $src_q < 8 ? $src_q : $src_q + 8;
    my $dst_lane = $dst_half eq "lo" ? 0 : 1;
    my $src_lane = $src_half eq "lo" ? 0 : 1;

    return sprintf "ins v%d.d[%d],v%d.d[%d]",
                   $dst_v, $dst_lane, $src_v, $src_lane;
}
# Output pass for the branch taken when $flavour matches /64/: print the
# accumulated $code line by line, rewriting the first "@<whitespace>"
# comment marker on each line to "//" (old->new style commentary).
for my $asm_line (split("\n", $code)) {
    $asm_line =~ s{@\s}{//}o;
    print $asm_line, "\n";
}
5674} else { ######## 32-bit code
# Rewrite the operands of a "vdup.32 qD,qS[lane]" (lane 0..3) so that the
# source is addressed as a d register with a lane of 0 or 1:
#   d index = 2*S + (lane >> 1),  d lane = lane & 1.
#
# Returns the complete "vdup.32 ..." instruction text, or a false value when
# the operand string does not have the expected shape.
sub unvdup32 {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
    return $matched unless $matched;

    my ($dst_q, $src_q, $q_lane) = ($1, $2, $3);

    my $src_d  = 2 * $src_q + ($q_lane >> 1);
    my $d_lane = $q_lane & 1;

    return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $d_lane;
}
# Hand-assemble a pmull/pmull2 instruction with three q-register operands
# ("qD,qN,qM") into an INST(b0,b1,b2,b3) byte list, followed by the original
# mnemonic and operands as an "@" comment.
#
#   $mnemonic - instruction name; if it contains a "2", the bits 0x00010001
#               are additionally set in the instruction word
#   $arg      - operand string
#
# When $arg does not contain three q registers the body is skipped and no
# instruction text is produced.
sub unvpmullp64 {
    my ($mnemonic, $arg) = @_;

    if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
        my ($qd, $qn, $qm) = ($1, $2, $3);

        # Start from the fixed bits, then merge in each register number:
        # its low three bits and its bit 3 go to separate positions.
        my $word = 0xf2a00e00;
        $word |= (($qd & 7) << 13) | (($qd & 8) << 19);
        $word |= (($qn & 7) << 17) | (($qn & 8) << 4);
        $word |= (($qm & 7) << 1)  | (($qm & 8) << 2);
        $word |= 0x00010001 if ($mnemonic =~ "2");

        # ARMv7 instructions are always encoded little-endian, so the
        # bytes are listed least significant first.  The correct solution
        # is to use the .inst directive, but older assemblers don't
        # implement it :-(
        my @bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);

        return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                       @bytes, $mnemonic, $arg;
    }
}
5698
# Output pass for the branch taken when $flavour does not match /64/:
# rewrite every line of the accumulated $code from the new notation to the
# old one and print it.
for my $line (split("\n", $code)) {
    $line =~ s{\b[wx]([0-9]+)\b}{r$1}go;              # new->old registers
    $line =~ s{\bv([0-9])\.[12468]+[bsd]\b}{q$1}go;   # new->old registers
    $line =~ s{//\s?}{@ }o;                           # new->old style commentary

    # fix up remaining new-style suffixes
    $line =~ s{\],#[0-9]+}{]!}o;

    # Instruction-level rewrites.  The chain stops at the first
    # substitution that succeeds, so at most one of them is applied to a
    # given line.
    $line =~ s{cclr\s+([^,]+),\s*([a-z]+)}{mov.$2 $1,#0}o
        or $line =~ s{vdup\.32\s+(.*)}{unvdup32($1)}geo
        or $line =~ s{v?(pmull2?)\.p64\s+(.*)}{unvpmullp64($1,$2)}geo
        or $line =~ s{\bq([0-9]+)#(lo|hi)}{sprintf "d%d",2*$1+($2 eq "hi")}geo
        or $line =~ s/^(\s+)b\./${1}b/o
        or $line =~ s/^(\s+)ret/${1}bx\tlr/o;

    # A "mov.<cond>" (as produced by the cclr rewrite above) becomes
    # "mov<cond>", preceded by an "it <cond>" line.
    if ($line =~ s/^(\s+)mov\.([a-z]+)/${1}mov$2/) {
        print " it $2\n";
    }

    print $line, "\n";
}
5720}
5721
# enforce flush: closing STDOUT reports output that could not be written,
# so treat a failed close as a fatal error.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}