]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
Fix typos found by codespell
[thirdparty/openssl.git] / crypto / modes / asm / aes-gcm-armv8-unroll8_64.pl
CommitLineData
954f45ba 1#! /usr/bin/env perl
fecb3aae 2# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
954f45ba
X
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
15# obtain it.
16#========================================================================
17#
18# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
19# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
20# intermediate hashes from the 8 blocks.
21#
22# ____________________________________________________
23# | |
24# | PRE |
25# |____________________________________________________|
26# | | | |
27# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
28# |________________|________________|__________________|
29# | | | |
30# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
31# |________________|________________|__________________|
32# | | | |
33# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
34# |________________|________________|__________________|
35# | | | |
36# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
37# |________________|________________|__________________|
38# | | | |
39# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
40# |________________|________________|__________________|
41# | | | |
42# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
43# |________________|________________|__________________|
44# | | | |
45# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
46# |________________|________________|__________________|
47# | | | |
48# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
49# |________________|____(mostly)____|__________________|
50# | |
51# | MODULO |
52# |____________________________________________________|
53#
54# PRE:
55# Ensure the previously generated intermediate hash is aligned and merged with result for GHASH 8k+0
56# EXT low_acc, low_acc, low_acc, #8
57# EOR res_curr (8k+0), res_curr (8k+0), low_acc
58#
59# CTR block:
60# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
61# REV ctr32, rev_ctr32
62# ORR ctr64, constctr96_top32, ctr32, LSL #32
63# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
64# INS ctr_next.d[1], ctr64X
65# ADD rev_ctr32, #1
66#
67# AES block:
68# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
69# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
70# Given we are very constrained in our ASIMD registers this is quite important
71#
72# Encrypt:
73# LDR input_low, [ input_ptr ], #8
74# LDR input_high, [ input_ptr ], #8
75# EOR input_low, k14_low
76# EOR input_high, k14_high
77# INS res_curr.d[0], input_low
78# INS res_curr.d[1], input_high
79# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
80# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
81# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
82# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
83# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
84# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
85# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
86# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
87# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
88# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
89# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
90# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
91# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
92# AESE ctr_curr, k13
93# EOR res_curr, res_curr, ctr_curr
94# ST1 { res_curr.16b }, [ output_ptr ], #16
95#
96# Decrypt:
97# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
98# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
99# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
100# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
101# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
102# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
103# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
104# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
105# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
106# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
107# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
108# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
109# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
110# AESE ctr_curr, k13
111# LDR res_curr, [ input_ptr ], #16
112# EOR res_curr, res_curr, ctr_curr
113# MOV output_low, res_curr.d[0]
114# MOV output_high, res_curr.d[1]
115# EOR output_low, k14_low
116# EOR output_high, k14_high
117# STP output_low, output_high, [ output_ptr ], #16
118
119# GHASH block X:
120# Do 128b karatsuba polynomial multiplication on block
121# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
122#
123# multiplication:
124# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
125#
126# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
127# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
128#
129# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
130# multiplying with "twisted" powers of H
131#
132# Note: We can PMULL directly into the acc_x in first GHASH of the loop
133# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
134# path latency dominates the performance
135#
136# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
137# than indicated here
138# REV64 res_curr, res_curr
139# INS t_m.d[0], res_curr.d[1]
140# EOR t_m.8B, t_m.8B, res_curr.8B
141# PMULL2 t_h, res_curr, HX
142# PMULL t_l, res_curr, HX
143# PMULL t_m, t_m, HX_k
144# EOR acc_h, acc_h, t_h
145# EOR acc_l, acc_l, t_l
146# EOR acc_m, acc_m, t_m
147#
148# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
149# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
150# with a reversed constant
151# EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing
152# PMULL t_mod, acc_h, mod_constant
153# EXT acc_h, acc_h, acc_h, #8
154# EOR3 acc_m, acc_m, t_mod, acc_h
155# PMULL acc_h, acc_m, mod_constant
156# EXT acc_m, acc_m, acc_m, #8
157# EOR3 acc_l, acc_l, acc_m, acc_h
158
# Command line: [flavour] [output-file].
# The last argument is taken as the output file when it ends in ".<ext>";
# the first remaining argument is taken as the flavour when it contains no
# dot. Either variable may end up undef.
159$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
160$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
161
# Locate the arm-xlate.pl translator: first in the directory this script was
# invoked from (taken from $0), then in ../../perlasm/ relative to it.
162$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
163( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
164( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
165die "can't locate arm-xlate.pl";
166
# This generator only supports flavours whose name contains "64".
167die "only for 64 bit" if $flavour !~ /64/;
168
# Everything written to STDOUT from here on is piped through arm-xlate.pl,
# run with the same perl interpreter ($^X). The return value of open is
# checked so that a failure to set up the pipe aborts the run instead of
# continuing with STDOUT aliased to an unopened handle.
169open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
170*STDOUT=*OUT;
171
# Start of the generated assembly text: include arm_arch.h, open an
# "#if __ARM_MAX_ARCH__>=8" guard (its closing #endif is not part of this
# block), then select the armv8-a+crypto architecture and the .text section.
172$code=<<___;
173#include "arm_arch.h"
174
175#if __ARM_MAX_ARCH__>=8
176___
09bd0d05 177$code.=".arch armv8-a+crypto\n.text\n";
954f45ba
X
178
# Names of the general-purpose (x) registers substituted into the assembly
# template that follows. These are package globals (no "my").
#
# x0: base pointer the kernel loads input blocks from.
179$input_ptr="x0"; #argument block
# x1: length in bits; the kernel converts it with "lsr ..., #3".
180$bit_length="x1";
# x9: length in bytes, computed in the kernel as $bit_length >> 3.
4596c20b 181$byte_length="x9";
954f45ba
X
# x2: base pointer the kernel stores result blocks to.
182$output_ptr="x2";
# x3: base pointer for the current tag load and the H-power loads
# ("load h5l | h5h" etc.) in the kernel.
183$current_tag="x3";
# x16: the kernel copies x4 here and loads CTR block 0 through it.
184$counter="x16";
# x15: scratch used to build the 0x100000000 counter-increment constant.
185$constant_temp="x15";
# x10: set to sp+64, the stack slot where the kernel stores the
# 0xc200000000000000 modulo constant.
186$modulo_constant="x10";
# x8: the kernel copies x5 here and loads the round keys relative to it.
187$cc="x8";
188{
# Lexical register-name table for the kernels generated below. Many names
# deliberately alias the same physical register; the notes say which.
#
# x4..x7: end-of-input pointers and scalar temporaries. x4 and x5 are first
# read by the kernel prologue ("mov $counter, x4", "mov $cc, x5") and only
# afterwards reused under these names.
189my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
# x13, x14: two more scalar temporaries.
190my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0..v7 are the eight counter blocks run through AES (ctr0..ctr7) and
# v8..v15 the eight result blocks (res0..res7). The next lines give the same
# registers in ".16b", bare "vN", "dN" and "qN" spellings.
191my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
192my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
193my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
194my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
195my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
196
# ctr_t0..ctr_t7 are v8..v15 again, i.e. the same registers as res0..res7:
# the kernel loads plaintext into ctr_tN and writes the result over it.
197my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
198my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
199my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
200
# v17..v19: GHASH accumulators (high, mid, low).
201my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
202my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
203
# v20..v25 hold the H-power operands. The h1..h4 set and the h5..h8 set map
# onto the SAME six registers, so only one set can be live at a time; the
# main loop loads h5..h8 first and reloads the registers with h1..h4 later.
204my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
205my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
206my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
207my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
208
# GHASH temporaries. Only t0 (v16), t1 (v29) and t11 (v21) name registers of
# their own; the others alias result registers or t0/t1.
209my $t0="v16";
210my $t0d="d16";
211
212my $t1="v29";
213my $t2=$res1;
214my $t3=$t1;
215
216my $t4=$res0;
217my $t5=$res2;
218my $t6=$t0;
219
220my $t7=$res3;
221my $t8=$res4;
222my $t9=$res5;
223
224my $t10=$res6;
# v21 is also $h12k / $h56k.
225my $t11="v21";
226my $t12=$t1;
227
# v30: byte-reversed running counter; v31: counter increment vector.
228my $rtmp_ctr="v30";
229my $rtmp_ctrq="q30";
230my $rctr_inc="v31";
231my $rctr_incd="d31";
232
# The modulo constant is loaded into t0 (v16).
233my $mod_constantd=$t0d;
234my $mod_constant=$t0;
235
# Round keys rk0..rk14 rotate through just three registers, v26..v28
# (rkN lives in v(26 + N % 3)), so the kernel reloads them from memory as it
# goes rather than keeping them all resident.
236my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
237my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
238my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
239my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
240my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
241my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
242my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
243my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
244my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
245my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Alternative spellings of the rk2 / rk3 / rk4 registers (".1q" arrangement
# and bare "vN"); their uses are outside this excerpt.
246my $rk2q1="v28.1q";
247my $rk3q1="v26.1q";
248my $rk4v="v27";
249
250
251#########################################################################################
252# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
253# size_t len,
254# unsigned char *out,
255# const void *key,
256# unsigned char ivec[16],
257# u64 *Xi);
258#
259$code.=<<___;
260.global unroll8_eor3_aes_gcm_enc_128_kernel
261.type unroll8_eor3_aes_gcm_enc_128_kernel,%function
262.align 4
263unroll8_eor3_aes_gcm_enc_128_kernel:
264 AARCH64_VALID_CALL_TARGET
265 cbz x1, .L128_enc_ret
266 stp d8, d9, [sp, #-80]!
4596c20b 267 lsr $byte_length, $bit_length, #3
954f45ba
X
268 mov $counter, x4
269 mov $cc, x5
270 stp d10, d11, [sp, #16]
271 stp d12, d13, [sp, #32]
272 stp d14, d15, [sp, #48]
273 mov x5, #0xc200000000000000
274 stp x5, xzr, [sp, #64]
275 add $modulo_constant, sp, #64
276
277 mov $constant_temp, #0x100000000 @ set up counter increment
278 movi $rctr_inc.16b, #0x0
279 mov $rctr_inc.d[1], $constant_temp
4596c20b 280 mov $main_end_input_ptr, $byte_length
954f45ba
X
281 ld1 { $ctr0b}, [$counter] @ CTR block 0
282
283 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
284
285 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
286
287 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
288
289 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
290
291 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
292 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
293
294 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
295 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
296
297 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
298 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
299
300 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
301 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
302
303 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
304 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
305 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
306
307 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
308 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
309
310 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
311 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
312
313 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
314 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
315 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
316
317 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
318 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
319 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
320
321 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
322 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
323 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
324
325 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
326
327 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
328 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
329 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
330
331 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
332 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
333 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
334
335 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
336 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
337 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
338
339 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
340 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
341 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
342
343 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
344 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
345 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
346
347 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
348
349 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
350 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
351 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
352
353 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
354 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
355 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
356
357 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
358
359 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
360 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
361 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
362
363 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
364 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
365 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
366
367 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
368 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
369 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
370
371 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
372 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
373 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
374
375 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
376 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
377 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
378
379 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
380 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
381 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
382
383 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
384 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
385 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
386
387 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
388 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
389 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
390
391 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
392 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
393 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
394
395 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
396
397 ld1 { $acc_lb}, [$current_tag]
398 ext $acc_lb, $acc_lb, $acc_lb, #8
399 rev64 $acc_lb, $acc_lb
400
401 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
402
403 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
404 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
405 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
406
407 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
408 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
409 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
410
411 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
412 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
413 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
414
415 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
416 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
417 ldr $rk10q, [$cc, #160] @ load rk10
418
419 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
420 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
421 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
422
423 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
424 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
425 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
426
427 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
428 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
429 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
430
431 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
432 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
433 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
434
435 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
436 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
437 b.ge .L128_enc_tail @ handle tail
438
439 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
440
441 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
442
443 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
444
445 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
446 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
447
448 eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result
449 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
450 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
451
452 eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result
453 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
454
455 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
456 eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
457 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
458
459 eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result
460 eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
461 eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
462
463 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
464 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
465
466 eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result
467 eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result
468 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
469
470 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
471 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
472 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
473
474 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
475
476 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
477 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
478 b.ge .L128_enc_prepretail @ do prepretail
479
480.L128_enc_main_loop: @ main loop start
481 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
482 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
483 ext $h5.16b, $h5.16b, $h5.16b, #8
484 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
485 ext $h6.16b, $h6.16b, $h6.16b, #8
486 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
487
488 rev64 $res1b, $res1b @ GHASH block 8k+1
489 rev64 $res0b, $res0b @ GHASH block 8k
490 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
491 ext $h7.16b, $h7.16b, $h7.16b, #8
492 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
493 ext $h8.16b, $h8.16b, $h8.16b, #8
494
495 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
496 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
497 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
498
499 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
500 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
501 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
502 rev64 $res3b, $res3b @ GHASH block 8k+3
503
504 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
505 eor $res0b, $res0b, $acc_lb @ PRE 1
506 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
507
508 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
509
510 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
511 rev64 $res2b, $res2b @ GHASH block 8k+2
512 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
513
514 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
515 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
516 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
517
518 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
519 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
520 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
521
522 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
523 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
524 ext $h3.16b, $h3.16b, $h3.16b, #8
525 ldr $h4q, [$current_tag, #112] @ load h3l | h3h
526 ext $h4.16b, $h4.16b, $h4.16b, #8
527 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
528
529 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
530 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
531 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
532
533 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
534 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
535 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
536
537 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
538 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
539 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
540
541 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
542 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
543 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
544
545 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
546 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
547 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
548
549 eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high
550 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
551 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
552
553 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
554 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
555 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
556
557 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
558 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
559 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
560
561 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
562 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
563 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
564
565 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
566 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
567
568 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
569 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
570 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
571
572 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
573 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
574 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
575
576 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
577 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
578 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
579
580 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
581 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
582 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
583
584 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
585 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
586 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
587 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
588
589 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
590 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
591 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
592
593 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
594 ext $h1.16b, $h1.16b, $h1.16b, #8
595 ldr $h2q, [$current_tag, #64] @ load h1l | h1h
596 ext $h2.16b, $h2.16b, $h2.16b, #8
597 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
598 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
599
600 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
601 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
602
603 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
604 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
605
606 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
607 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
608
609 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
610 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
611
612 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
613 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
614 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
615
616 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
617 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
618 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
619
620 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
621 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
622 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
623
624 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
625 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
626 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
627
628 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
629 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
630 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
631
632 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
633 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
634 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
635
636 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
637 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
638 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
639
640 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
641 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
642
643 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
644 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
645 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
646
647 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
648 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
649 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
650
651 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
652 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
653 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
654
655 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
656 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
657
658 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
659 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
660 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
661
662 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
663 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
664 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
665
666 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
667 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
668 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
669
670 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
671 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
672 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
673
674 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
675 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
676 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
677
678 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
679 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
680 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
681
682 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
683 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
684 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
685
686 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
687 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
688 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
689
690 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
691 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
692
693 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
694 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
695 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
696
697 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
698 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
699 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
700
701 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
702 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
703 ldr $rk10q, [$cc, #160] @ load rk10
704
705 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
706 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
707 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
708 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
709
710 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
711 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
712 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
713
714 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
715 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
716 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
717
718 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
719 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
720 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
721
722 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
723 eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 8k+12 - result
724 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
725
726 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
727 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
728
729 eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result
730
731 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
732 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
733
734 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
735 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
736
737 eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 8k+15 - result
738 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
739 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
740
741 eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result
742 eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result
743 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
744
745 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
746 eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 8k+13 - result
747 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
748
749 eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result
750 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
751 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
752
753 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
754 eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 8k+14 - result
755
756 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
757 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
758
759 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
760 b.lt .L128_enc_main_loop
761
762.L128_enc_prepretail: @ PREPRETAIL
763 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
764 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
765 ext $h7.16b, $h7.16b, $h7.16b, #8
766 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
767 ext $h8.16b, $h8.16b, $h8.16b, #8
768 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
769
770 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
771 ext $h5.16b, $h5.16b, $h5.16b, #8
772 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
773 ext $h6.16b, $h6.16b, $h6.16b, #8
774 rev64 $res0b, $res0b @ GHASH block 8k
775 rev64 $res1b, $res1b @ GHASH block 8k+1
776
777 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
778 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
779 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
780 rev64 $res3b, $res3b @ GHASH block 8k+3
781
782 rev64 $res2b, $res2b @ GHASH block 8k+2
783 eor $res0b, $res0b, $acc_lb @ PRE 1
784
785 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
786
787 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
788 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
789 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
790
791 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
792 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
793
794 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
795 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
796 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
797
798 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
799 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
800
801 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
802 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
803
804 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
805 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
806
807 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
808 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
809
810 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
811
812 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
813
814 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
815
816 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
817
818 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
819 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
820
821 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
822 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
823
824 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
825 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
826
827 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
828 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
829 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
830
831 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
832 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
833
834 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
835 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
836 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
837
838 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
839 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
840
841 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
842 ext $h3.16b, $h3.16b, $h3.16b, #8
843 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
844 ext $h4.16b, $h4.16b, $h4.16b, #8
845
846 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
847 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
848 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
849
850 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
851 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
852
853 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
854 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
855
856 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
857 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
858 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
859 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
860
861 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
862 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
863
864 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
865 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
866 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
867
868 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
869 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
870 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
871
872 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
873 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
874
875 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
876 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
877 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
878
879 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
880 ext $h1.16b, $h1.16b, $h1.16b, #8
881 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
882 ext $h2.16b, $h2.16b, $h2.16b, #8
883 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
884 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
885
886 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
887 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
888 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
889
890 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
891 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
892 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
893
894 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
895 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
896
897 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
898 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
899 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
900
901 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
902 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
903 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
904
905 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
906 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
907 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
908
909 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
910 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
911 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
912
913 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
914 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
915 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
916
917 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
918 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
919 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
920
921 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
922 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
923
924 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
925 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
926 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
927
928 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
929 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
930 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
931
932 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
933 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
934 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
935
936 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
937 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
938 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
939
940 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
941 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
942
943 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
944 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
945 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
946
947 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
948 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
949
950 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
951 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
952 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
953
954 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
955 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
956 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
957
958 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
959 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
960 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
961
962 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
963 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
964 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
965
966 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
967 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
968 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
969 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
970
971 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
972 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
973 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
974
975 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
976 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
977
978 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
979 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
980
981 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
982 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
983 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
984 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
985
986 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
987 eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low
988 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
989
990 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
991 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
992 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
993
994 ldr $rk10q, [$cc, #160] @ load rk10
995 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
996 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
997
998 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
999 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
1000
1001 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
1002 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
1003
1004 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
1005 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
1006.L128_enc_tail: @ TAIL
1007
1008 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1009 ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
1010
1011 mov $t1.16b, $rk10
1012 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
1013 ext $h5.16b, $h5.16b, $h5.16b, #8
1014
1015 eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
1016 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3b5b9199 1017 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
954f45ba
X
1018 ext $h6.16b, $h6.16b, $h6.16b, #8
1019 ext $h7.16b, $h7.16b, $h7.16b, #8
1020
3b5b9199 1021 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
954f45ba
X
1022 ext $h8.16b, $h8.16b, $h8.16b, #8
1023 cmp $main_end_input_ptr, #112
1024 b.gt .L128_enc_blocks_more_than_7
1025
1026 mov $ctr7b, $ctr6b
1027 mov $ctr6b, $ctr5b
1028 movi $acc_h.8b, #0
1029
1030 cmp $main_end_input_ptr, #96
1031 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1032 mov $ctr5b, $ctr4b
1033
1034 mov $ctr4b, $ctr3b
1035 mov $ctr3b, $ctr2b
1036 mov $ctr2b, $ctr1b
1037
1038 movi $acc_l.8b, #0
1039 movi $acc_m.8b, #0
1040 b.gt .L128_enc_blocks_more_than_6
1041
1042 mov $ctr7b, $ctr6b
1043 cmp $main_end_input_ptr, #80
1044
1045 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1046 mov $ctr6b, $ctr5b
1047 mov $ctr5b, $ctr4b
1048
1049 mov $ctr4b, $ctr3b
1050 mov $ctr3b, $ctr1b
1051 b.gt .L128_enc_blocks_more_than_5
1052
1053 cmp $main_end_input_ptr, #64
1054 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1055
1056 mov $ctr7b, $ctr6b
1057 mov $ctr6b, $ctr5b
1058
1059 mov $ctr5b, $ctr4b
1060 mov $ctr4b, $ctr1b
1061 b.gt .L128_enc_blocks_more_than_4
1062
1063 mov $ctr7b, $ctr6b
1064 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1065 mov $ctr6b, $ctr5b
1066
1067 mov $ctr5b, $ctr1b
1068 cmp $main_end_input_ptr, #48
1069 b.gt .L128_enc_blocks_more_than_3
1070
1071 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1072 mov $ctr7b, $ctr6b
1073 mov $ctr6b, $ctr1b
1074
1075 cmp $main_end_input_ptr, #32
1076 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1077 b.gt .L128_enc_blocks_more_than_2
1078
1079 cmp $main_end_input_ptr, #16
1080
1081 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1082 mov $ctr7b, $ctr1b
1083 b.gt .L128_enc_blocks_more_than_1
1084
1085 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1086 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1087 b .L128_enc_blocks_less_than_1
1088.L128_enc_blocks_more_than_7: @ blocks left > 7
1089 st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
1090
1091 rev64 $res0b, $res1b @ GHASH final-7 block
1092 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
1093
1094 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1095
1096 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
1097
1098 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
1099
1100 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
1101
1102 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
eb4129e1 1103 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1104
1105 eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
1106
1107 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
1108 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
1109.L128_enc_blocks_more_than_6: @ blocks left > 6
1110
1111 st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
1112
1113 rev64 $res0b, $res1b @ GHASH final-6 block
1114 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
1115
1116 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1117
1118 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
1119
1120 eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
1121 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
1122
1123 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
eb4129e1 1124 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1125
1126 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
1127 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
1128
1129 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
1130
1131 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
1132 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
1133.L128_enc_blocks_more_than_5: @ blocks left > 5
1134
1135 st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
1136
1137 rev64 $res0b, $res1b @ GHASH final-5 block
1138
1139 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1140
1141 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
1142 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
1143 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
1144
1145 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
1146
1147 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
1148
1149 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
1150
1151 eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
1152 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
eb4129e1 1153 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1154
1155 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
1156 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
1157
1158 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
1159.L128_enc_blocks_more_than_4: @ blocks left > 4
1160
1161 st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
1162
1163 rev64 $res0b, $res1b @ GHASH final-4 block
1164
1165 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
1166
1167 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1168
1169 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
eb4129e1 1170 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1171 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
1172
1173 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
1174
1175 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
1176
1177 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
1178 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
1179
1180 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
1181
1182 eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
1183 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
1184.L128_enc_blocks_more_than_3: @ blocks left > 3
1185
1186 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
1187
3b5b9199 1188 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
954f45ba
X
1189 ext $h4.16b, $h4.16b, $h4.16b, #8
1190
1191 rev64 $res0b, $res1b @ GHASH final-3 block
1192
1193 eor $res0b, $res0b, $t0.16b @ feed in partial tag
eb4129e1 1194 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1195
1196 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
3b5b9199 1197 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
954f45ba
X
1198 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
1199
1200 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
1201
1202 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1203
1204 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
1205 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
1206
1207 eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
1208
1209 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
1210 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
1211
1212 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
1213 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
1214.L128_enc_blocks_more_than_2: @ blocks left > 2
1215
1216 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
1217
1218 rev64 $res0b, $res1b @ GHASH final-2 block
1219
1220 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1221
1222 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
1223
1224 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
3b5b9199 1225 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
954f45ba 1226 ext $h3.16b, $h3.16b, $h3.16b, #8
eb4129e1 1227 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1228
1229 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1230 eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
1231
1232 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1233
1234 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1235 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1236
1237 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1238
1239 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1240 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1241.L128_enc_blocks_more_than_1: @ blocks left > 1
1242
1243 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
1244
3b5b9199 1245 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
1246 ext $h2.16b, $h2.16b, $h2.16b, #8
1247 rev64 $res0b, $res1b @ GHASH final-1 block
1248 ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
1249
1250 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1251
eb4129e1 1252 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
1253 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
1254 eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
1255
1256 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1257
1258 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1259
3b5b9199 1260 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
954f45ba
X
1261
1262 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1263
1264 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1265 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1266
1267 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1268
1269 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1270 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1271.L128_enc_blocks_less_than_1: @ blocks left <= 1
1272
1273 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
1274 str $rtmp_ctrq, [$counter] @ store the updated counter
1275 and $bit_length, $bit_length, #127 @ bit_length %= 128
1276
1277 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1278
1279 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1280
1281 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
1282 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1283 and $bit_length, $bit_length, #127 @ bit_length %= 128
1284
1285 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
1286 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
1287 cmp $bit_length, #64
1288
1289 csel $temp2_x, $temp1_x, $temp0_x, lt
1290 csel $temp3_x, $temp0_x, xzr, lt
1291
1292 mov $ctr0.d[1], $temp3_x
1293 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
1294
1295 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1296
1297 rev64 $res0b, $res1b @ GHASH final block
1298
1299 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1300 st1 { $res1b}, [$output_ptr] @ store all 16B
1301
1302 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1303
1304 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
1305
1306 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3b5b9199 1307 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
954f45ba
X
1308 ext $h1.16b, $h1.16b, $h1.16b, #8
1309
1310 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1311
1312 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1313 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1314 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
1315
1316 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1317
1318 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1319
1320 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1321
1322 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1323 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1324
1325 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
1326
1327 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
1328
1329 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1330 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1331
1332 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
1333 ext $acc_lb, $acc_lb, $acc_lb, #8
1334 rev64 $acc_lb, $acc_lb
1335 st1 { $acc_l.16b }, [$current_tag]
4596c20b 1336 mov x0, $byte_length
954f45ba
X
1337
1338 ldp d10, d11, [sp, #16]
1339 ldp d12, d13, [sp, #32]
1340 ldp d14, d15, [sp, #48]
1341 ldp d8, d9, [sp], #80
1342 ret
1343
1344.L128_enc_ret:
1345 mov w0, #0x0
1346 ret
1347.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1348___
1349
1350#########################################################################################
1351# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
1352# size_t len,
1353# unsigned char *out,
1354# u64 *Xi,
1355# unsigned char ivec[16],
1356# const void *key);
1357#
1358$code.=<<___;
1359.global unroll8_eor3_aes_gcm_dec_128_kernel
1360.type unroll8_eor3_aes_gcm_dec_128_kernel,%function
1361.align 4
1362unroll8_eor3_aes_gcm_dec_128_kernel:
1363 AARCH64_VALID_CALL_TARGET
1364 cbz x1, .L128_dec_ret
1365 stp d8, d9, [sp, #-80]!
4596c20b 1366 lsr $byte_length, $bit_length, #3
954f45ba
X
1367 mov $counter, x4
1368 mov $cc, x5
1369 stp d10, d11, [sp, #16]
1370 stp d12, d13, [sp, #32]
1371 stp d14, d15, [sp, #48]
1372 mov x5, #0xc200000000000000
1373 stp x5, xzr, [sp, #64]
1374 add $modulo_constant, sp, #64
1375
4596c20b 1376 mov $main_end_input_ptr, $byte_length
954f45ba
X
1377 ld1 { $ctr0b}, [$counter] @ CTR block 0
1378
1379 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
1380 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1381
3b5b9199 1382 mov $constant_temp, #0x100000000 @ set up counter increment
954f45ba
X
1383 movi $rctr_inc.16b, #0x0
1384 mov $rctr_inc.d[1], $constant_temp
1385 ld1 { $acc_lb}, [$current_tag]
1386 ext $acc_lb, $acc_lb, $acc_lb, #8
1387 rev64 $acc_lb, $acc_lb
1388
1389 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
1390
1391 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1392
1393 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
1394
1395 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
1396 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
1397
1398 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1399
1400 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
1401 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
1402 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1403
1404 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
1405 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
1406
1407 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1408 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1409
1410 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
1411 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
1412
1413 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
1414 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
1415
1416 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1417
1418 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
1419 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
1420 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
1421
1422 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1423 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
1424
1425 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
1426
1427 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
1428 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1429
1430 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
1431
1432 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
1433
1434 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
1435 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
1436
1437 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
1438 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
1439
1440 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
1441 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1442 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
1443
1444 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
1445 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1446 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
1447
1448 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
1449 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1450 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
1451
1452 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
1453 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1454
1455 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
1456 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
1457
1458 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1459 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
1460
1461 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1462 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1463
1464 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1465 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
1466 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
1467
1468 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
1469 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1470 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1471
1472 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
1473 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
1474 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1475
1476 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
1477 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1478 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1479
1480 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
1481 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1482
1483 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
1484 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
1485
1486 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
1487
1488 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1489 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
1490 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1491
1492 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
1493 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
1494 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1495
1496 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1497 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
1498 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
1499
1500 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1501 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
1502 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1503
1504 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
1505 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
1506 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
1507
1508 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
1509 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1510 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1511
1512 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1513 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
1514
1515 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
1516 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
1517
1518 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1519 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
1520 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1521
1522 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
1523 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1524 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
1525
1526 aese $ctr0b, $rk9 @ AES block 0 - round 9
1527 aese $ctr1b, $rk9 @ AES block 1 - round 9
1528 aese $ctr6b, $rk9 @ AES block 6 - round 9
1529
1530 ldr $rk10q, [$cc, #160] @ load rk10
1531 aese $ctr4b, $rk9 @ AES block 4 - round 9
1532 aese $ctr3b, $rk9 @ AES block 3 - round 9
1533
1534 aese $ctr2b, $rk9 @ AES block 2 - round 9
1535 aese $ctr5b, $rk9 @ AES block 5 - round 9
1536 aese $ctr7b, $rk9 @ AES block 7 - round 9
1537
1538 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1539 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1540 b.ge .L128_dec_tail @ handle tail
1541
1542 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
1543
1544 eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result
1545 eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result
1546 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
1547
1548 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
1549 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
1550 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
1551
1552 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
1553
1554 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
1555 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
1556 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
1557
1558 eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result
1559 eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result
1560 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
1561
1562 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
1563 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
1564
1565 eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result
1566
1567 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
1568 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
1569
1570 eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result
1571 eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result
1572 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
1573
1574 eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result
1575 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
1576 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
1577
1578 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1579 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
1580 b.ge .L128_dec_prepretail @ do prepretail
1581
1582.L128_dec_main_loop: @ main loop start
@ Steady-state loop of the decrypt path. Three jobs are interleaved in one pass:
@   GHASH - the eight ciphertext blocks currently in res0-res7 are byte-reversed (rev64),
@           multiplied by h8..h1 (high/low) and h78k..h12k (mid), accumulated in
@           acc_h/acc_m/acc_l and reduced (MODULO steps) so the running hash ends up in acc_l.
@   AES   - rounds 0-9 (rk0-rk9) are applied to counter blocks ctr0-ctr7; the final
@           eor3 with rk10 and the ciphertext produces the eight output blocks.
@   CTR   - counter blocks for the current pass (ctr5-ctr7) and for the next pass
@           (ctr0-ctr4) are derived from rtmp_ctr with rev32 + add rctr_inc.
@ The hash key values are reloaded from current_tag on every pass: the low and mid
@ pmull instructions below write their products back into h1-h7 and h12k/h34k/h56k/h78k,
@ so those registers no longer hold key material at the end of a pass.
@ The loop repeats while input_ptr < main_end_input_ptr (cmp at LOOP CONTROL, b.lt at the end).
3b5b9199 1583 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
954f45ba 1584 ext $h7.16b, $h7.16b, $h7.16b, #8
3b5b9199 1585 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
954f45ba
X
1586 ext $h8.16b, $h8.16b, $h8.16b, #8
1587
1588 rev64 $res1b, $res1b @ GHASH block 8k+1
1589 rev64 $res0b, $res0b @ GHASH block 8k
1590 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1591
1592 rev64 $res6b, $res6b @ GHASH block 8k+6
1593 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
1594 ext $h5.16b, $h5.16b, $h5.16b, #8
1595 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
1596 ext $h6.16b, $h6.16b, $h6.16b, #8
1597
1598 eor $res0b, $res0b, $acc_lb @ PRE 1
1599 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
1600 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
1601
1602 rev64 $res2b, $res2b @ GHASH block 8k+2
1603 rev64 $res4b, $res4b @ GHASH block 8k+4
1604 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
1605
1606 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
1607 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
1608 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
3b5b9199 1609 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
954f45ba
X
1610
1611 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
1612 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
1613 rev64 $res3b, $res3b @ GHASH block 8k+3
1614
1615 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
1616 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1617 rev64 $res5b, $res5b @ GHASH block 8k+5
1618
1619 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
1620 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
1621 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1622
1623 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
1624 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
1625 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
1626
1627 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
1628 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
1629 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
1630
1631 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
1632 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
1633 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
1634
1635 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
1636 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
1637 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
1638
1639 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
1640 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
1641 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
1642
1643 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
1644 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1645 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
1646
3b5b9199 1647 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
954f45ba
X
1648 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1649 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
1650
1651 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1652 ext $h3.16b, $h3.16b, $h3.16b, #8
1653 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1654 ext $h4.16b, $h4.16b, $h4.16b, #8
1655 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
1656 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
1657
1658 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
1659 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
1660 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
1661
1662 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
1663 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
1664 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
1665
1666 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
1667 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
1668 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
1669
1670 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
1671 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1672 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1673 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 1674 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
1675 ext $h2.16b, $h2.16b, $h2.16b, #8
1676
1677 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
1678 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
1679 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
1680
1681 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1682 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
1683 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
1684
1685 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
1686 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
1687 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
1688
1689 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
1690 rev64 $res7b, $res7b @ GHASH block 8k+7
1691 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
1692
1693 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
1694 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
1695 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1696
1697 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1698 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1699 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
1700 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1701
1702 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
1703 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
1704 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
1705
1706 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
1707 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
1708 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
1709
1710 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
1711 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
1712 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
1713
1714 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
1715 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
1716 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
1717
1718 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
1719 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1720 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
1721
1722 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
1723 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
1724 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
1725
1726 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
1727 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
1728 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1729
1730 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
1731 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
1732 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
1733
1734 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
1735 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
1736 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
1737
1738 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
1739 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
1740 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
1741
1742 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
1743 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
1744 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
1745
1746 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
1747 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
1748 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
1749
1750 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
1751 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
1752 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
1753
1754 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
1755 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
1756 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
1757
1758 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
1759 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
1760 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
1761
1762 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
1763 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
1764 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
1765
1766 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
1767 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
1768 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
1769
1770 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
1771 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
1772 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
1773
@ From here on h1-h4 are reused as staging registers for the next pass's counter blocks
@ (8k+16..8k+19): their GHASH low products have already been folded into acc_l above, and
@ ctr0-ctr3 cannot be overwritten yet because they still carry the AES state of this pass.
@ The staged values are moved into ctr0-ctr3 after the corresponding results are computed.
1774 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
1775 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
1776 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
1777
1778 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
1779 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
1780 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
1781
1782 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
1783 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
1784 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
1785
1786 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
1787 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1788 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1789
1790 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
1791 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
1792 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
1793
1794 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
1795 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
@ All GHASH multiplications that read res0-res7 have been issued above, so the registers
@ are free to receive the next eight ciphertext blocks (post-indexed loads advance input_ptr).
1796 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
1797
1798 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
1799 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
1800 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
1801
1802 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
1803 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
1804 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
1805
1806 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
1807 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
1808 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
1809
1810 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
1811 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
1812 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
1813
1814 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
1815 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
1816 ldr $rk10q, [$cc, #160] @ load rk10
1817
1818 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
1819 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1820 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
1821
1822 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
1823 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
1824 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1825
1826 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
1827 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
1828
1829 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
1830 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
1831 eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result
1832
1833 eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result
1834 eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result
1835 eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result
1836
1837 eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result
1838 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
1839 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
1840
1841 eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result
1842 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
1843 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
1844
1845 eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result
1846 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
1847 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
1848
1849 eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result
1850 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
1851
1852 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
1853 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
1854 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
1855
1856 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
1857 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
@ Branches on the flags set by the cmp at LOOP CONTROL; only vector and store
@ instructions sit in between, so the comparison result is still intact here.
1858 b.lt .L128_dec_main_loop
1859
1860.L128_dec_prepretail: @ PREPRETAIL
@ Entered either by falling out of the main loop or directly from the b.ge that follows
@ the first eight blocks. On entry res0-res7 hold eight ciphertext blocks that have not
@ been hashed yet, and ctr0-ctr4 already hold counter blocks.
@ This section
@   - folds res0-res7 into the GHASH accumulators with the same high/low/mid multiplies
@     and MODULO reduction as the main loop, leaving the running hash in acc_l;
@   - creates counter blocks ctr5-ctr7 from rtmp_ctr and runs AES rounds 0-9 (rk0-rk9)
@     over ctr0-ctr7.
@ Unlike the main loop it reads no ciphertext and stores no output; rk10 is loaded here
@ but not applied in this section. Execution falls through into .L128_dec_tail.
1861 rev64 $res3b, $res3b @ GHASH block 8k+3
1862 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1863 rev64 $res0b, $res0b @ GHASH block 8k
1864
1865 rev64 $res2b, $res2b @ GHASH block 8k+2
3b5b9199 1866 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
954f45ba
X
1867 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
1868
1869 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
1870 ext $h7.16b, $h7.16b, $h7.16b, #8
1871 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
1872 ext $h8.16b, $h8.16b, $h8.16b, #8
1873 eor $res0b, $res0b, $acc_lb @ PRE 1
1874 rev64 $res1b, $res1b @ GHASH block 8k+1
1875
1876 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
1877 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
1878 ext $h5.16b, $h5.16b, $h5.16b, #8
1879 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
1880 ext $h6.16b, $h6.16b, $h6.16b, #8
1881 rev64 $res5b, $res5b @ GHASH block 8k+5
1882
1883 rev64 $res4b, $res4b @ GHASH block 8k+4
1884
1885 rev64 $res6b, $res6b @ GHASH block 8k+6
1886
1887 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
3b5b9199 1888 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
954f45ba
X
1889 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
1890 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
1891
1892 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
1893 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
1894 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
1895
1896 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1897 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1898 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
1899
1900 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
1901 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
1902 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
1903
1904 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
1905 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
1906 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
1907
1908 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
1909 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
1910 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
1911
1912 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
1913 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1914 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1915
1916 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
1917 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
1918 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
1919
3b5b9199 1920 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
954f45ba
X
1921 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
1922 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
1923
1924 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
1925 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
1926 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
1927
1928 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
1929 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1930 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
1931
1932 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
1933 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
1934 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
1935
1936 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
1937 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
1938 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
1939
1940 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1941 ext $h3.16b, $h3.16b, $h3.16b, #8
1942 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1943 ext $h4.16b, $h4.16b, $h4.16b, #8
1944 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
1945 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
1946
1947 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
1948 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
1949 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
1950
1951 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1952 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 1953 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
1954 ext $h2.16b, $h2.16b, $h2.16b, #8
1955 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1956
1957 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
1958 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
1959 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
1960
1961 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
1962 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1963 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
1964
1965 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
1966 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
1967 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
1968
1969 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
1970 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
1971 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1972
1973 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
1974 rev64 $res7b, $res7b @ GHASH block 8k+7
1975 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
1976
1977 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1978 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1979 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
1980 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
1981
1982 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
1983 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
1984 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1985
1986 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
1987 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
1988 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1989
1990 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
1991 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
1992 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
1993
1994 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
1995 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
1996 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
1997
1998 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
1999 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
2000 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
2001
2002 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2003 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
2004 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
2005
2006 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
2007 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
2008 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
2009
2010 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
2011 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
2012 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
2013
2014 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
2015 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
2016 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
2017
2018 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
2019 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
2020 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
2021
2022 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
2023 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
2024 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
2025
2026 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
2027 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
2028 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
2029
2030 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
2031 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
2032 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
2033
2034 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
2035 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2036 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
2037
2038 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
2039 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
2040 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
2041
2042 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
2043 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
2044 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
2045
2046 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
2047 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
2048 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
2049
2050 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
2051 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
2052 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
2053
2054 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2055 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
2056 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2057
2058 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
2059 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
2060 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
2061
2062 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
2063 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
2064 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
2065
2066 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
2067 ldr $rk10q, [$cc, #160] @ load rk10
2068
2069 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
2070 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
2071
2072 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2073 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
2074 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2075
2076 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
2077 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
2078 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
2079
2080 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
2081 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
2082 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
2083
2084 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
2085 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
2086 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
2087
2088 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
2089 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
2090 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
2091
2092 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
2093 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
2094 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
2095
2096.L128_dec_tail: @ TAIL
2097
2098 mov $t1.16b, $rk10
2099 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2100
2101 cmp $main_end_input_ptr, #112 @ more than 7 blocks (112 bytes) left? - consumed by the b.gt below
2102
3b5b9199 2103 ldp $h78kq, $h8q, [$current_tag, #192] @ load h78k (offset 192) | h8 (offset 208)
954f45ba
X
2104 ext $h8.16b, $h8.16b, $h8.16b, #8
2105 ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
2106
2107 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5 (offset 128) | h56k (offset 144)
2108 ext $h5.16b, $h5.16b, $h5.16b, #8
2109 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2110
3b5b9199 2111 ldp $h6q, $h7q, [$current_tag, #160] @ load h6 (offset 160) | h7 (offset 176)
954f45ba
X
2112 ext $h6.16b, $h6.16b, $h6.16b, #8
2113 ext $h7.16b, $h7.16b, $h7.16b, #8
2114
2115 eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
2116 b.gt .L128_dec_blocks_more_than_7 @ no flag-setting instruction since the cmp against 112
2117
2118 cmp $main_end_input_ptr, #96 @ more than 6 blocks left?
2119 mov $ctr7b, $ctr6b @ 7 blocks or fewer: move keystream blocks 1..6 up by one register
2120 movi $acc_l.8b, #0
2121
2122 movi $acc_h.8b, #0
2123 mov $ctr6b, $ctr5b
2124 mov $ctr5b, $ctr4b
2125
2126 mov $ctr4b, $ctr3b
2127 mov $ctr3b, $ctr2b
2128 mov $ctr2b, $ctr1b
2129
2130 movi $acc_m.8b, #0
2131 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the reversed counter back by one increment
2132 b.gt .L128_dec_blocks_more_than_6
2133
2134 cmp $main_end_input_ptr, #80 @ more than 5 blocks left?
2135 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2136
2137 mov $ctr7b, $ctr6b
2138 mov $ctr6b, $ctr5b
2139 mov $ctr5b, $ctr4b
2140
2141 mov $ctr4b, $ctr3b
2142 mov $ctr3b, $ctr1b
2143 b.gt .L128_dec_blocks_more_than_5
2144
2145 cmp $main_end_input_ptr, #64 @ more than 4 blocks left?
2146
2147 mov $ctr7b, $ctr6b
2148 mov $ctr6b, $ctr5b
2149 mov $ctr5b, $ctr4b
2150
2151 mov $ctr4b, $ctr1b
2152 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2153 b.gt .L128_dec_blocks_more_than_4
2154
2155 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2156 mov $ctr7b, $ctr6b
2157 mov $ctr6b, $ctr5b
2158
2159 mov $ctr5b, $ctr1b
2160 cmp $main_end_input_ptr, #48 @ more than 3 blocks left?
2161 b.gt .L128_dec_blocks_more_than_3
2162
2163 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2164 mov $ctr7b, $ctr6b
2165 cmp $main_end_input_ptr, #32 @ more than 2 blocks left?
2166
2167 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
2168 mov $ctr6b, $ctr1b
2169 b.gt .L128_dec_blocks_more_than_2
2170
2171 cmp $main_end_input_ptr, #16 @ more than 1 block left?
2172
2173 mov $ctr7b, $ctr1b
2174 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2175 b.gt .L128_dec_blocks_more_than_1 @ target must carry the .L prefix to match the local label definition
2176
2177 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2178 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
2179 b .L128_dec_blocks_less_than_1
2180.L128_dec_blocks_more_than_7: @ blocks left > 7
2181 rev64 $res0b, $res1b @ GHASH final-7 block
2182
2183 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2184
2185 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
2186
2187 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
2188 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
2189
eb4129e1 2190 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2191 ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
2192
2193 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
2194
2195 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
2196 st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
2197 eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
2198
2199 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
2200.L128_dec_blocks_more_than_6: @ blocks left > 6
2201
2202 rev64 $res0b, $res1b @ GHASH final-6 block
2203
2204 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2205
2206 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
2207
2208 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
2209
2210 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
2211 ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
eb4129e1 2212 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2213
2214 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
2215 st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
2216 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
2217
2218 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
2219 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
2220
2221 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
2222 eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
2223.L128_dec_blocks_more_than_5: @ blocks left > 5
2224
2225 rev64 $res0b, $res1b @ GHASH final-5 block
2226
2227 ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
2228 st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
2229
2230 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2231
2232 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
2233
2234 eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
2235
2236 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
2237
2238 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
2239 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
eb4129e1 2240 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2241
2242 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
2243 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
2244 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
2245
2246 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
2247 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
2248.L128_dec_blocks_more_than_4: @ blocks left > 4
2249
2250 rev64 $res0b, $res1b @ GHASH final-4 block
2251
2252 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2253 ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
2254
2255 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
eb4129e1 2256 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2257 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
2258
2259 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
2260
2261 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
2262
2263 st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
2264 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
2265
2266 eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
2267 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
2268
2269 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
2270
2271 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
2272.L128_dec_blocks_more_than_3: @ blocks left > 3
2273
2274 st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
2275 rev64 $res0b, $res1b @ GHASH final-3 block
2276
2277 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2278
2279 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
2280
2281 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2282 ext $h4.16b, $h4.16b, $h4.16b, #8
2283 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
2284
2285 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2286
2287 ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
2288
2289 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
2290 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
2291 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
2292
eb4129e1 2293 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2294 eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
2295 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
2296
2297 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
2298
2299 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
2300 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
2301.L128_dec_blocks_more_than_2: @ blocks left > 2
2302
2303 rev64 $res0b, $res1b @ GHASH final-2 block
2304
2305 st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
2306
2307 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2308 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2309 ext $h3.16b, $h3.16b, $h3.16b, #8
eb4129e1 2310 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2311
2312 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
2313
2314 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2315
2316 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2317
2318 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2319 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2320 ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
2321
2322 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2323
2324 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2325
2326 eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
2327 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2328.L128_dec_blocks_more_than_1: @ blocks left > 1
2329
2330 st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
2331 rev64 $res0b, $res1b @ GHASH final-1 block
2332
3b5b9199 2333 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
2334 ext $h2.16b, $h2.16b, $h2.16b, #8
2335
2336 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2337
eb4129e1 2338 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
2339
2340 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
2341
2342 ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
2343 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2344
2345 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2346 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2347 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
2348
2349 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2350 eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
2351
2352 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2353
2354 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2355
2356 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2357
2358 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2359.L128_dec_blocks_less_than_1: @ blocks left <= 1
2360
2361 and $bit_length, $bit_length, #127 @ bit_length %= 128
2362
2363 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2364
2365 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2366
2367 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
2368 and $bit_length, $bit_length, #127 @ bit_length %= 128
2369
2370 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
2371 cmp $bit_length, #64
2372 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
2373
2374 csel $temp2_x, $temp1_x, $temp0_x, lt
2375 csel $temp3_x, $temp0_x, xzr, lt
2376
2377 mov $ctr0.d[1], $temp3_x
2378 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
2379
2380 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2381 ext $h1.16b, $h1.16b, $h1.16b, #8
2382 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2383
2384 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2385
2386 rev64 $res0b, $res1b @ GHASH final block
2387
2388 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2389
2390 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2391 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
2392
2393 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2394 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2395
2396 bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2397
2398 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2399 st1 { $res4b}, [$output_ptr] @ store all 16B
2400
2401 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2402
2403 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2404 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
2405
2406 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2407
2408 eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
2409
2410 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2411 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2412
2413 eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
2414
2415 eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
2416
2417 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2418 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2419
2420 eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
2421 ext $acc_lb, $acc_lb, $acc_lb, #8
2422 rev64 $acc_lb, $acc_lb
2423 st1 { $acc_l.16b }, [$current_tag]
2424 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
2425
2426 str $rtmp_ctrq, [$counter] @ store the updated counter
2427
4596c20b 2428 mov x0, $byte_length
954f45ba
X
2429
2430 ldp d10, d11, [sp, #16]
2431 ldp d12, d13, [sp, #32]
2432 ldp d14, d15, [sp, #48]
2433 ldp d8, d9, [sp], #80
2434 ret
2435.L128_dec_ret:
2436 mov w0, #0x0
2437 ret
2438.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2439___
2440}
2441
2442{
# Register aliases used by the assembly templates that follow in this scope.
# Several names below map to the same physical register, so the templates
# must never need two such names live at the same time.

# General-purpose scratch registers: x4-x7 and x13-x14.
2443my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
2444my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0-v7 are named ctr0..ctr7 and v8-v15 are named res0..res7. The b/d/q
# suffixed names are the .16b, dN and qN spellings of those same registers.
2445my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
2446my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
2447my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
2448my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
2449my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
2450
# ctr_t0..ctr_t7 are further names for v8-v15, i.e. the same registers as
# res0..res7.
2451my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
2452my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
2453my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
2454
# Accumulators: acc_h = v17, acc_m = v18, acc_l = v19.
2455my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
2456my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
2457
# The h1..h4 group and the h5..h8 group are both mapped onto v20-v25, so each
# register carries one name from each group (v20 is $h1 and $h5, v21 is $h12k
# and $h56k, and so on up to v25 = $h4 / $h8).
2458my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
2459my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
2460my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
2461my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
2462
# Temporaries: $t0/$t6 are v16 and $t1/$t3/$t12 are v29; the remaining ones
# borrow res registers ($t4=res0, $t2=res1, $t5=res2, $t7=res3, $t8=res4,
# $t9=res5, $t10=res6).
2463my $t0="v16";
2464my $t0d="d16";
2465
2466my $t1="v29";
2467my $t2=$res1;
2468my $t3=$t1;
2469
2470my $t4=$res0;
2471my $t5=$res2;
2472my $t6=$t0;
2473
2474my $t7=$res3;
2475my $t8=$res4;
2476my $t9=$res5;
2477
2478my $t10=$res6;
# $t11 is v21, the same register as $h12k and $h56k above.
2479my $t11="v21";
2480my $t12=$t1;
2481
# v30 holds the byte-reversed counter block and v31 the counter increment
# (both are set up at the start of the kernel that follows).
2482my $rtmp_ctr="v30";
2483my $rtmp_ctrq="q30";
2484my $rctr_inc="v31";
2485my $rctr_incd="d31";
2486
# The modulo constant shares v16/d16 with $t0.
2487my $mod_constantd=$t0d;
2488my $mod_constant=$t0;
2489
# All fifteen round-key names rk0..rk14 rotate through just three registers:
# rkN is v(26 + N mod 3). At most three round keys can therefore be held at
# once, and loading rkN overwrites rk(N-3).
2490my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
2491my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
2492my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
2493my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
2494my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
2495my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
2496my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
2497my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
2498my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
2499my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Alternate spellings of the round-key registers: v28 and v26 with the .1q
# arrangement, and v27 with no arrangement suffix.
2500my $rk2q1="v28.1q";
2501my $rk3q1="v26.1q";
2502my $rk4v="v27";
2503
2504#########################################################################################
2505# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
2506# size_t len,
2507# unsigned char *out,
2508# u64 *Xi,
2509# unsigned char ivec[16],
2510# const void *key);
2511#
2512$code.=<<___;
2513.global unroll8_eor3_aes_gcm_enc_192_kernel
2514.type unroll8_eor3_aes_gcm_enc_192_kernel,%function
2515.align 4
2516unroll8_eor3_aes_gcm_enc_192_kernel:
2517 AARCH64_VALID_CALL_TARGET
2518 cbz x1, .L192_enc_ret
2519 stp d8, d9, [sp, #-80]!
4596c20b 2520 lsr $byte_length, $bit_length, #3
954f45ba
X
2521 mov $counter, x4
2522 mov $cc, x5
2523 stp d10, d11, [sp, #16]
2524 stp d12, d13, [sp, #32]
2525 stp d14, d15, [sp, #48]
2526 mov x5, #0xc200000000000000
2527 stp x5, xzr, [sp, #64]
2528 add $modulo_constant, sp, #64
2529
4596c20b 2530 mov $main_end_input_ptr, $byte_length
954f45ba
X
2531 ld1 { $ctr0b}, [$counter] @ CTR block 0
2532
2533 mov $constant_temp, #0x100000000 @ set up counter increment
2534 movi $rctr_inc.16b, #0x0
2535 mov $rctr_inc.d[1], $constant_temp
2536
2537 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
2538
2539 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
2540
2541 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
2542 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
2543
2544 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
2545 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
2546
2547 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
2548 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
2549
2550 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
2551 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
2552 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2553
2554 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2555
2556 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
2557 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
2558 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
2559
2560 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2561
2562 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
2563 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
2564
2565 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
2566
2567 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
2568 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
2569 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2570
2571 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2572 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2573 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
2574
2575 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
2576 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2577 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
2578
2579 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
2580 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
2581
2582 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2583 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2584 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
2585
2586 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
2587 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
2588 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2589
2590 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2591 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
2592 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2593
2594 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2595 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2596
2597 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2598 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
2599 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
2600
2601 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
2602 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
2603
2604 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
2605 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2606 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2607
2608 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2609
2610 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2611
2612 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
2613
2614 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2615 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2616 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
2617
2618 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2619 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2620 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
2621
2622 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
2623 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
2624 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
2625
2626 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3b5b9199 2627 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
954f45ba
X
2628 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2629
2630 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
2631 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
2632 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2633
2634 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
2635 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
2636 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2637
2638 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
2639
2640 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
2641 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
2642 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2643
2644 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2645 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
2646 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2647
2648 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2649 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
2650 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
2651
2652 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
2653 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2654
2655 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
2656 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2657
2658 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
2659 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2660
2661 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2662 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
2663
2664 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
2665 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2666
2667 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
2668 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2669 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
2670
2671 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2672 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2673 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
2674
2675 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2676 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2677 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2678
2679 ld1 { $acc_lb}, [$current_tag]
2680 ext $acc_lb, $acc_lb, $acc_lb, #8
2681 rev64 $acc_lb, $acc_lb
2682 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
2683
2684 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
2685 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2686
2687 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
2688 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2689
2690 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2691 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
2692
2693 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 14 - round 10
2694 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
2695 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 11 - round 10
2696
2697 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 9 - round 10
2698 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 13 - round 10
2699 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 12 - round 10
2700
2701 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8 - round 10
2702 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 10 - round 10
2703 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 15 - round 10
2704
2705 aese $ctr6b, $rk11 @ AES block 14 - round 11
2706 aese $ctr3b, $rk11 @ AES block 11 - round 11
2707
2708 aese $ctr4b, $rk11 @ AES block 12 - round 11
2709 aese $ctr7b, $rk11 @ AES block 15 - round 11
2710 ldr $rk12q, [$cc, #192] @ load rk12
2711
2712 aese $ctr1b, $rk11 @ AES block 9 - round 11
2713 aese $ctr5b, $rk11 @ AES block 13 - round 11
2714
2715 aese $ctr2b, $rk11 @ AES block 10 - round 11
2716 aese $ctr0b, $rk11 @ AES block 8 - round 11
2717 b.ge .L192_enc_tail @ handle tail
2718
2719 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
2720
2721 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
2722
2723 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
2724
2725 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
2726
2727 eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result
2728 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
2729 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
2730
2731 eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 3 - result
2732 eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result
2733
2734 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
2735 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
2736 eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
2737
2738 eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
2739 eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
2740 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
2741
2742 eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result
2743 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
2744 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
2745
2746 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
2747 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2748
2749 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
2750 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
2751 eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
2752
2753 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
2754
2755 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
2756 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
2757 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
2758
2759 b.ge .L192_enc_prepretail @ do prepretail
2760
2761.L192_enc_main_loop: @ main loop start
2762 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
2763 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
2764 rev64 $res2b, $res2b @ GHASH block 8k+2
2765
2766 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
2767 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
2768 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
2769 ext $h7.16b, $h7.16b, $h7.16b, #8
2770 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
2771 ext $h8.16b, $h8.16b, $h8.16b, #8
2772
2773 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2774 rev64 $res0b, $res0b @ GHASH block 8k
2775 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
2776 ext $h5.16b, $h5.16b, $h5.16b, #8
2777 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
2778 ext $h6.16b, $h6.16b, $h6.16b, #8
2779
2780 rev64 $res1b, $res1b @ GHASH block 8k+1
2781 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
2782 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
2783
2784 eor $res0b, $res0b, $acc_lb @ PRE 1
2785 rev64 $res3b, $res3b @ GHASH block 8k+3
2786 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
2787
2788 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
2789 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
2790 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
2791
2792 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
2793 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
2794 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
2795
2796 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
2797 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
2798 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
2799
2800 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
2801 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
2802 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
2803
2804 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
2805 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
2806 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
2807
2808 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
2809 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
2810 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
2811 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
2812
2813 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
2814 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
2815 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
2816
2817 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
2818 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
2819 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
2820
2821 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
2822 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
2823 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
2824
2825 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
2826 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
2827 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
2828
2829 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
2830 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
2831 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
2832
2833 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
2834 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
2835 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
2836
2837 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
2838 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
2839 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
2840
2841 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
2842 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
2843 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
2844
2845 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
2846 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
2847 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
2848
2849 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
2850 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
2851 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2852 ext $h3.16b, $h3.16b, $h3.16b, #8
2853 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2854 ext $h4.16b, $h4.16b, $h4.16b, #8
2855
3b5b9199 2856 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
954f45ba
X
2857 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
2858 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
2859
2860 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
2861 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
2862 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
2863
2864 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
2865 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
2866 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
2867
2868 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
2869 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
2870 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
2871
2872 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
2873 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
2874 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
2875
2876 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
2877 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
2878 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
2879
2880 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
2881 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
2882 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
2883
2884 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
2885 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
2886 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2887 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 2888 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
2889 ext $h2.16b, $h2.16b, $h2.16b, #8
2890
2891 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
2892 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
2893 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
2894
2895 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
2896 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
2897 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
2898
2899 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
2900 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
2901
2902 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
2903 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
2904 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
2905
2906 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
2907 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
2908 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
2909
2910 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
2911 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
2912 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
2913
2914 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
2915 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
2916 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
2917
2918 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
2919 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
2920 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
2921
2922 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
2923 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
2924 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
2925
2926 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
2927 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
2928
2929 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
2930 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
2931 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
2932
2933 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
2934 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2935
2936 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
2937 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
2938 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
2939
2940 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
2941 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
2942 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
2943
2944 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
2945 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
2946 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
2947
2948 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
2949 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
2950 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
2951
2952 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
2953 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
2954 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
2955
2956 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
2957 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
2958 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
2959
2960 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
2961 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
2962 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
2963
2964 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
2965 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
2966 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
2967
2968 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
2969 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
2970 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
2971
2972 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
2973 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
2974 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
2975
2976 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
2977 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2978 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
2979
2980 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
2981 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
2982 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
2983
2984 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2985 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
2986 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
2987
2988 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
2989 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
2990 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
2991
2992 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
2993 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
2994 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
2995
2996 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
2997 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
2998 ldr $rk12q, [$cc, #192] @ load rk12
2999 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3000
3001 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
3002 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
3003 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
3004
3005 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
3006 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
3007 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
3008
3009 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
3010 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
3011 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
3012
3013 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
3014 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
3015
3016 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
3017 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3018
3019 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
3020 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
3021 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
3022
3023 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
3024 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
3025 eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 8k+12 - result
3026
3027 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
3028 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
3029 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
3030
3031 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
3032 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
3033 eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 8k+15 - result
3034
3035 eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result
3036 eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result
3037 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
3038
3039 eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result
3040 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
3041 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
3042 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3043
3044 eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 8k+14 - result
3045 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
3046 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
3047
3048 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
3049 eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 8k+13 - result
3050 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
3051
3052 eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result
3053 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
3054
3055 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
3056
3057 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
3058
3059 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3060 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
3061 b.lt .L192_enc_main_loop
3062
3063.L192_enc_prepretail: @ PREPRETAIL
3064 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
3065 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
3066 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
3067
3068 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
3069 ext $h7.16b, $h7.16b, $h7.16b, #8
3070 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
3071 ext $h8.16b, $h8.16b, $h8.16b, #8
3072 rev64 $res0b, $res0b @ GHASH block 8k
3073 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3074
3075 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
3076 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
3077 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
3078 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
3079
3080 rev64 $res3b, $res3b @ GHASH block 8k+3
3081 rev64 $res2b, $res2b @ GHASH block 8k+2
3082 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
3083 ext $h5.16b, $h5.16b, $h5.16b, #8
3084 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
3085 ext $h6.16b, $h6.16b, $h6.16b, #8
3086
3087 eor $res0b, $res0b, $acc_lb @ PRE 1
3088 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
3089 rev64 $res1b, $res1b @ GHASH block 8k+1
3090
3091 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
3092 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
3093 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
3094
3095 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
3096 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
3097 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
3098
3099 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
3100 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
3101 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
3102
3103 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
3104 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
3105 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3106
3107 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3108 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
3109 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
3110
3111 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
3112 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
3113 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
3114
3115 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
3116 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
3117 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
3118
3119 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
3120 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
3121 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
3122
3123 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
3124 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
3125 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
3126
3127 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
3128 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
3129 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
3130
3131 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
3132 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
3133 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
3134
3135 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
3136 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
3137 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
3138
3139 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
3140 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
3141 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
3142
3143 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
3144 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
3145 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
3146
3147 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
3148 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
3149 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
3150
3151 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
3152 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
3153 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
3154
3155 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
3156 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
3157 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
3158
3159 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
3160 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
3161 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
3162
3163 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3164 ext $h3.16b, $h3.16b, $h3.16b, #8
3165 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3166 ext $h4.16b, $h4.16b, $h4.16b, #8
3167 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
3168 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
3169
3170 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3171 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 3172 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
3173 ext $h2.16b, $h2.16b, $h2.16b, #8
3174 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
3175 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
3176
3177 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
3178 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
3179 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
3180
3181 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
3182 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
3183 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
3184
3185 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
3186 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
3187 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
3188
3189 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
3190 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
3191 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
3192
3193 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
3194 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
3195 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
3196 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
3197
3198 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
3199 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
3200 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
3201
3202 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
3203 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
3204 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
3205
3206 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
3207 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
3208
3209 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
3210 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
3211 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
3212
3213 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
3214 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
3215 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
3216
3217 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
3218 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
3219 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
3220
3221 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
3222 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
3223 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
3224
3225 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
3226 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
3227
3228 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
3229 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
3230 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
3231
3232 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
3233 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
3234 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
3235
3236 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
3237 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
3238 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
3239
3240 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
3241 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
3242 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
3243
3244 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
3245 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
3246 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
3247
3248 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
3249 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
3250 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
3251
3252 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
3253 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
3254 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
3255
3256 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
3257 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
3258
3259 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
3260 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
3261
3262 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
3263 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3264 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
3265 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3266
3267 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
3268 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
3269
3270 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
3271 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
3272 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
3273
3274 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
3275 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
3276 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
3277
3278 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
3279 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
3280 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
3281
3282 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
3283 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
3284 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
3285
3286 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
3287 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
3288
3289 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3290 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
3291 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
3292 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
3293
3294 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3295 ldr $rk12q, [$cc, #192] @ load rk12
3296
3297 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
3298 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
3299 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
3300
3301 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
3302 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
3303 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
3304
3305 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
3306 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
3307
3308 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
3309 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
3310
3311 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
3312 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
3313
3314 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
3315 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
3316 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
3317
3318 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
3319 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
3320 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
3321
3322.L192_enc_tail: @ TAIL
3323
3324 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h, h6k | h5k
3325 ext $h5.16b, $h5.16b, $h5.16b, #8
3326 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3327
3328 ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
3329
3b5b9199 3330 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k, h8l | h8h
954f45ba
X
3331 ext $h8.16b, $h8.16b, $h8.16b, #8
3332
3333 mov $t1.16b, $rk12
3334
3335 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h, h7l | h7h
3336 ext $h6.16b, $h6.16b, $h6.16b, #8
3337 ext $h7.16b, $h7.16b, $h7.16b, #8
3338 cmp $main_end_input_ptr, #112
3339
3340 eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
3341 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3342 b.gt .L192_enc_blocks_more_than_7
3343
3344 cmp $main_end_input_ptr, #96
3345 mov $ctr7b, $ctr6b
3346 movi $acc_h.8b, #0
3347
3348 mov $ctr6b, $ctr5b
3349 movi $acc_l.8b, #0
3350 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3351
3352 mov $ctr5b, $ctr4b
3353 mov $ctr4b, $ctr3b
3354 mov $ctr3b, $ctr2b
3355
3356 mov $ctr2b, $ctr1b
3357 movi $acc_m.8b, #0
3358 b.gt .L192_enc_blocks_more_than_6
3359
3360 mov $ctr7b, $ctr6b
3361 cmp $main_end_input_ptr, #80
3362
3363 mov $ctr6b, $ctr5b
3364 mov $ctr5b, $ctr4b
3365 mov $ctr4b, $ctr3b
3366
3367 mov $ctr3b, $ctr1b
3368 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3369 b.gt .L192_enc_blocks_more_than_5
3370
3371 cmp $main_end_input_ptr, #64
3372 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3373
3374 mov $ctr7b, $ctr6b
3375 mov $ctr6b, $ctr5b
3376 mov $ctr5b, $ctr4b
3377
3378 mov $ctr4b, $ctr1b
3379 b.gt .L192_enc_blocks_more_than_4
3380
3381 mov $ctr7b, $ctr6b
3382 mov $ctr6b, $ctr5b
3383 mov $ctr5b, $ctr1b
3384
3385 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3386 cmp $main_end_input_ptr, #48
3387 b.gt .L192_enc_blocks_more_than_3
3388
3389 mov $ctr7b, $ctr6b
3390 mov $ctr6b, $ctr1b
3391 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3392
3393 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
3394 cmp $main_end_input_ptr, #32
3395 b.gt .L192_enc_blocks_more_than_2
3396
3397 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3398
3399 cmp $main_end_input_ptr, #16
3400 mov $ctr7b, $ctr1b
3401 b.gt .L192_enc_blocks_more_than_1
3402
3403 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3404 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
3405 b .L192_enc_blocks_less_than_1
3406.L192_enc_blocks_more_than_7: @ blocks left > 7
3407 st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
3408
3409 rev64 $res0b, $res1b @ GHASH final-7 block
3410 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
3411
3412 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3413
3414 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
3415
3416 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
3417
3418 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
eb4129e1 3419 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3420 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
3421
3422 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
3423
3424 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
3425 eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
3426.L192_enc_blocks_more_than_6: @ blocks left > 6
3427
3428 st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
3429
3430 rev64 $res0b, $res1b @ GHASH final-6 block
3431
3432 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
3433
3434 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3435
3436 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
3437
3438 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
3439 eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
3440
eb4129e1 3441 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3442 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
3443 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
3444
3445 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
3446
3447 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
3448 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
3449
3450 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
3451.L192_enc_blocks_more_than_5: @ blocks left > 5
3452
3453 st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
3454
3455 rev64 $res0b, $res1b @ GHASH final-5 block
3456
3457 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3458
3459 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
3460
3461 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
3462 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
3463
3464 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
3465 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
3466
3467 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
3468 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
3469
3470 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
3471 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
3472
3473 eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
eb4129e1 3474 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3475
3476 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
3477.L192_enc_blocks_more_than_4: @ blocks left > 4
3478
3479 st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
3480
3481 rev64 $res0b, $res1b @ GHASH final-4 block
3482
3483 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3484
3485 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
3486 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
3487 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
3488
3489 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
3490 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
3491
3492 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
3493
eb4129e1 3494 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3495 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
3496
3497 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
3498
3499 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
3500 eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
3501.L192_enc_blocks_more_than_3: @ blocks left > 3
3502
3503 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
3504 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
3505
3506 rev64 $res0b, $res1b @ GHASH final-3 block
3507
3508 eor $res0b, $res0b, $t0.16b @ feed in partial tag
eb4129e1 3509 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3510
3511 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
3512 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3513 ext $h4.16b, $h4.16b, $h4.16b, #8
3514
3515 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
3516
3517 eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
3518 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3519
3520 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
3521 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
3522
3523 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
3524 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
3525
3526 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
3527
3528 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
3529 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
3530.L192_enc_blocks_more_than_2: @ blocks left > 2
3531
3532 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
3533
3534 rev64 $res0b, $res1b @ GHASH final-2 block
3535 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3536 ext $h3.16b, $h3.16b, $h3.16b, #8
3537
3538 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3539
3540 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
3541 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
3542
3543 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3544
3545 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3546 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
eb4129e1 3547 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3548
3549 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3550
3551 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3552 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3553
3554 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
3555 eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
3556.L192_enc_blocks_more_than_1: @ blocks left > 1
3557
3558 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3559 ext $h2.16b, $h2.16b, $h2.16b, #8
3560 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
3561
3562 rev64 $res0b, $res1b @ GHASH final-1 block
3563
3564 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3565
3566 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
3567 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3568
3569 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3570 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3571 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3572
3573 ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
3574 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
3575
3576 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3577
3578 eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
3579 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3580
eb4129e1 3581 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
3582
3583 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
3584 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3585.L192_enc_blocks_less_than_1: @ blocks left <= 1
3586
3587 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
3588 and $bit_length, $bit_length, #127 @ bit_length %= 128
3589
3590 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3591
3592 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3593
3594 and $bit_length, $bit_length, #127 @ bit_length %= 128
3595
3596 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
3597 cmp $bit_length, #64
3598 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
3599
3600 csel $temp2_x, $temp1_x, $temp0_x, lt
3601 csel $temp3_x, $temp0_x, xzr, lt
3602
3603 mov $ctr0.d[1], $temp3_x
3604 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3605 ext $h1.16b, $h1.16b, $h1.16b, #8
3606
3607 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
3608 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
3609
3610 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3611
3612 rev64 $res0b, $res1b @ GHASH final block
3613 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
3614
3615 st1 { $res1b}, [$output_ptr] @ store all 16B
3616
3617 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3618
3619 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
3620 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
3621
3622 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3623 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3624
3625 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3626
3627 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3628
3629 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
3630 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
3631
3632 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3633 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3634
3635 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
3636
3637 str $rtmp_ctrq, [$counter] @ store the updated counter
3638 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
3639
3640 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3641
3642 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
3643
3644 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3645 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3646
3647 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
3648 ext $acc_lb, $acc_lb, $acc_lb, #8
3649 rev64 $acc_lb, $acc_lb
3650 st1 { $acc_l.16b }, [$current_tag]
3651
4596c20b 3652 mov x0, $byte_length @ return sizes
954f45ba
X
3653
3654 ldp d10, d11, [sp, #16]
3655 ldp d12, d13, [sp, #32]
3656 ldp d14, d15, [sp, #48]
3657 ldp d8, d9, [sp], #80
3658 ret
3659
3660.L192_enc_ret:
3661 mov w0, #0x0
3662 ret
3663.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
3664___
3665
3666#########################################################################################
3667# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
3668# size_t len,
3669# unsigned char *out,
3670# u64 *Xi,
3671# unsigned char ivec[16],
3672# const void *key);
3673#
3674$code.=<<___;
3675.global unroll8_eor3_aes_gcm_dec_192_kernel
3676.type unroll8_eor3_aes_gcm_dec_192_kernel,%function
3677.align 4
3678unroll8_eor3_aes_gcm_dec_192_kernel:
3679 AARCH64_VALID_CALL_TARGET
3680 cbz x1, .L192_dec_ret
3681 stp d8, d9, [sp, #-80]!
4596c20b 3682 lsr $byte_length, $bit_length, #3
954f45ba
X
3683 mov $counter, x4
3684 mov $cc, x5
3685 stp d10, d11, [sp, #16]
3686 stp d12, d13, [sp, #32]
3687 stp d14, d15, [sp, #48]
3688 mov x5, #0xc200000000000000
3689 stp x5, xzr, [sp, #64]
3690 add $modulo_constant, sp, #64
3691
4596c20b 3692 mov $main_end_input_ptr, $byte_length
954f45ba
X
3693 ld1 { $ctr0b}, [$counter] @ CTR block 0
3694 ld1 { $acc_lb}, [$current_tag]
3695
3696 mov $constant_temp, #0x100000000 @ set up counter increment
3697 movi $rctr_inc.16b, #0x0
3698 mov $rctr_inc.d[1], $constant_temp
3699
3700 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
3701
3702 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
3703
3704 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
3705 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
3706
3707 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
3708 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
3709
3710 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
3711 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
3712
3713 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
3714 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
3715
3716 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
3717 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
3718 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
3719
3720 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
3721 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
3722
3723 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
3724
3725 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3726 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
3727 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
3728
3729 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3730 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3731 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
3732
3733 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3734 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
3735 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
3736
3737 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
3738
3739 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3740
3741 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3742 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3743 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
3744
3745 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
3746 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
3747
3748 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
3749 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3750 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
3751
3752 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
3753 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3754 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3755
3756 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3757 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
3758 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
3759
3760 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
3761
3762 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
3763 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3764 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
3765
3766 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
3767 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
3768
3769 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
3770 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3771 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
3772
3773 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3774 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3775 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
3776
3777 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3778 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
3779 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
3780
3781 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3782 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
3783 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
3784
3785 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
3786 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
3787
3788 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3789 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
3790 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3791
3792 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3793 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3794 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
3795
3796 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3797
3798 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
3799 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
3800 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3801
3802 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3803 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3804 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
3805
3806 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
3807 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3808 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
3809
3810 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
3811
3812 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3813 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
3814
3815 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3816 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3817 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
3818
3819 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
3820 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3821 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
3822
3823 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3824 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3825 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3826
3827 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
3828 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
3829 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
3830
3831 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
3832 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3833 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3834
3835 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3836 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
3837
3838 ld1 { $acc_lb}, [$current_tag]
3839 ext $acc_lb, $acc_lb, $acc_lb, #8
3840 rev64 $acc_lb, $acc_lb
3841
3842 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
3843
3844 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3845 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3846
3847 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3848 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
3849 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
3850
3851 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3852 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3853
3854 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
3855 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3856
3857 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3858 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3859 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
3860
3861 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
3862 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3863 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3864
3865 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
3866 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
3867 ldr $rk12q, [$cc, #192] @ load rk12
3868
3869 aese $ctr0b, $rk11 @ AES block 0 - round 11
3870 aese $ctr1b, $rk11 @ AES block 1 - round 11
3871 aese $ctr4b, $rk11 @ AES block 4 - round 11
3872
3873 aese $ctr6b, $rk11 @ AES block 6 - round 11
3874 aese $ctr5b, $rk11 @ AES block 5 - round 11
3875 aese $ctr7b, $rk11 @ AES block 7 - round 11
3876
3877 aese $ctr2b, $rk11 @ AES block 2 - round 11
3878 aese $ctr3b, $rk11 @ AES block 3 - round 11
3879 b.ge .L192_dec_tail @ handle tail
3880
3881 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
3882
3883 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
3884
3885 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
3886
3887 eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result
3888 eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result
3889 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
3890
3891 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
3892 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
3893
3894 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
3895 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
3896 eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result
3897
3898 eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result
3899 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
3900 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
3901
3902 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
3903 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
3904
3905 eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result
3906
3907 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
3908 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
3909
3910 eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result
3911 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
3912 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3913
3914 eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result
3915 eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result
3916 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
3917
3918 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
3919 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
3920 b.ge .L192_dec_prepretail @ do prepretail
3921
3922.L192_dec_main_loop: @ main loop start
3923 rev64 $res1b, $res1b @ GHASH block 8k+1
3924 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
3925 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3926
3927 rev64 $res0b, $res0b @ GHASH block 8k
3928 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
3929 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
3930
3931 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
3932 ext $h7.16b, $h7.16b, $h7.16b, #8
3933 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
3934 ext $h8.16b, $h8.16b, $h8.16b, #8
3935 rev64 $res4b, $res4b @ GHASH block 8k+4
3936 rev64 $res3b, $res3b @ GHASH block 8k+3
3937
3938 eor $res0b, $res0b, $acc_lb @ PRE 1
3939 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
3940 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
3941
3942 rev64 $res5b, $res5b @ GHASH block 8k+5
3943
3944 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
3945 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
3946 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
3947
3948 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
3949 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
3950 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
3951
3952 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
3953 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
3954 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
3955
3956 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
3957 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
3958 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
3959
3960 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
3961 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
3962 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
3963 ext $h5.16b, $h5.16b, $h5.16b, #8
3964 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
3965 ext $h6.16b, $h6.16b, $h6.16b, #8
3966
3967 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
3968 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
3969 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
3970
3971 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
3972 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
3973 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
3974
3975 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3976 rev64 $res2b, $res2b @ GHASH block 8k+2
3977 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
3978
3979 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
3980 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
3981 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
3982 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3983
3984 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
3985 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
3986 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
3987
3988 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
3989 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
3990 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
3991
3992 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
3993 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
3994 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
3995
3996 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
3997 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
3998 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
3999
4000 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
4001 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
4002 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
4003
4004 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4005 ext $h3.16b, $h3.16b, $h3.16b, #8
4006 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4007 ext $h4.16b, $h4.16b, $h4.16b, #8
4008 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
4009 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
4010
4011 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
4012 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4013 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4014
4015 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
4016 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
4017
4018 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
4019 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
4020 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
4021
4022 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4023 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
4024 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
4025
4026 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4027 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
4028
4029 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
4030 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
4031 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
4032
4033 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
4034 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
4035 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
4036
4037 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
4038 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
4039 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
4040
4041 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
4042 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
4043 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
4044
4045 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
4046 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
4047 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
4048
4049 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4050 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 4051 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
4052 ext $h2.16b, $h2.16b, $h2.16b, #8
4053 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
4054 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
4055
4056 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
4057 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
4058 rev64 $res7b, $res7b @ GHASH block 8k+7
4059
4060 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
4061 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4062 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
4063
4064 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
4065 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4066 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
4067
4068 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
4069 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
4070 rev64 $res6b, $res6b @ GHASH block 8k+6
4071
4072 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4073 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4074 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
4075 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
4076
4077 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
4078 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4079 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4080
4081 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
4082 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
4083 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
4084
4085 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
4086 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
4087 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
4088
4089 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
4090 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
4091 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
4092
4093 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
4094 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
4095 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
4096
4097 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
4098 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4099 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
4100
4101 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
4102 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
4103 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
4104
4105 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4106 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
4107 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
4108
4109 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
4110 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
4111 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
4112
4113 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4114 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
4115 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
4116
4117 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
4118 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
4119 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
4120
4121 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
4122 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
4123 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
4124
4125 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
4126 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
4127 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
4128
4129 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
4130 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
4131 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
4132
4133 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
4134 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
4135 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
4136
4137 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
4138 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4139 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
4140
4141 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
4142 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
4143 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
4144
4145 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
4146 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
4147
4148 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
4149 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
4150 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
4151
4152 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
4153 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4154 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
4155
4156 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
4157 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
4158 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4159
4160 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
4161 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
4162 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
4163
4164 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
4165 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
4166 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
4167
4168 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
4169 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
4170 ldr $rk12q, [$cc, #192] @ load rk12
4171
4172 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
4173 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
4174 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
4175
4176 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
4177 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4178 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
4179
4180 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
4181 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
4182 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
4183
4184 eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result
4185 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
4186 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
4187
4188 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
4189 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
4190 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
4191
4192 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
4193 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
4194 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4195
4196 eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result
4197 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
4198 eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result
4199
4200 eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result
4201 eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result
4202 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
4203
4204 eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result
4205 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
4206 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
4207
4208 eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result
4209 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
4210 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4211
4212 eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result
4213 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
4214 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
4215
4216 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
4217 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
4218
4219 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
4220 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
4221 b.lt .L192_dec_main_loop
4222
4223.L192_dec_prepretail: @ PREPRETAIL
4224 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
4225 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
4226 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
4227
4228 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
4229 ext $h7.16b, $h7.16b, $h7.16b, #8
4230 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
4231 ext $h8.16b, $h8.16b, $h8.16b, #8
4232 rev64 $res0b, $res0b @ GHASH block 8k
4233 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4234
4235 rev64 $res3b, $res3b @ GHASH block 8k+3
4236 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
4237 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
4238
4239 eor $res0b, $res0b, $acc_lb @ PRE 1
4240 rev64 $res2b, $res2b @ GHASH block 8k+2
4241 rev64 $res1b, $res1b @ GHASH block 8k+1
4242
4243 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
4244 ext $h5.16b, $h5.16b, $h5.16b, #8
4245 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
4246 ext $h6.16b, $h6.16b, $h6.16b, #8
4247 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
4248
4249 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
4250 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
4251 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
4252
4253 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
4254 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
4255 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
4256
4257 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
4258 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
4259 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
4260
4261 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
4262 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
4263 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
4264
4265 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
4266 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
4267 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
4268
4269 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
4270 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
4271 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
4272
4273 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
4274 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
4275 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
4276
4277 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
4278 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
4279 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
4280
4281 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
4282 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
4283 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
4284
4285 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
4286 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
4287 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
4288 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
4289
4290 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
4291 rev64 $res5b, $res5b @ GHASH block 8k+5
4292 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
4293
4294 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
4295 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
4296 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
4297
4298 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4299 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
4300 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
4301
4302 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
4303 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
4304 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4305
4306 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
4307 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
4308 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
4309
4310 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
4311 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4312 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
4313
4314 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
4315 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
4316 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
4317
4318 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
4319 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
4320 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
4321
4322 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4323 ext $h3.16b, $h3.16b, $h3.16b, #8
4324 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4325 ext $h4.16b, $h4.16b, $h4.16b, #8
4326 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
4327 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
4328
4329 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4330 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 4331 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
4332 ext $h2.16b, $h2.16b, $h2.16b, #8
4333 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
4334 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
4335
4336 rev64 $res7b, $res7b @ GHASH block 8k+7
4337
4338 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4339 rev64 $res4b, $res4b @ GHASH block 8k+4
4340
4341 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
4342 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
4343 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
4344
4345 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
4346 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
4347 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
4348
4349 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
4350 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
4351 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
4352
4353 rev64 $res6b, $res6b @ GHASH block 8k+6
4354 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4355 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4356 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4357
4358 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
4359 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
4360 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
4361
4362 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
4363 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
4364 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
4365
4366 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
4367 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
4368 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
4369
4370 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
4371
4372 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
4373 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4374 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
4375
4376 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
4377 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4378 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
4379
4380 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4381 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
4382 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4383
4384 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
4385 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
4386
4387 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4388 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
4389 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
4390
4391 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
4392 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
4393 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
4394
4395 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
4396 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
4397 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
4398
4399 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4400 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
4401 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
4402
4403 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
4404 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
4405 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
4406
4407 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
4408 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
4409 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
4410
4411 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
4412 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
4413 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
4414
4415 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
4416 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
4417 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
4418
4419 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
4420 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
4421 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4422
4423 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
4424 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
4425
4426 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
4427 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4428 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
4429
4430 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
4431 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
4432 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
4433
4434 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
4435 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4436 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
4437
4438 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
4439 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
4440 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
4441
4442 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
4443 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
4444 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
4445
4446 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
4447 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
4448 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
4449
4450 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
4451 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
4452 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
4453
4454 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4455 ldr $rk12q, [$cc, #192] @ load rk12
4456 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4457
4458 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
4459 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
4460 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
4461
4462 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
4463 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
4464 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
4465
4466 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
4467 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
4468 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
4469
4470 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
4471 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
4472 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
4473
4474 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
4475 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
4476 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
4477
4478 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
4479 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
4480 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
4481
4482.L192_dec_tail: @ TAIL
4483
4484 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4485
4486 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
4487 ext $h5.16b, $h5.16b, $h5.16b, #8
4488 ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
4489
3b5b9199 4490 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
954f45ba
X
4491 ext $h8.16b, $h8.16b, $h8.16b, #8
4492
4493 mov $t1.16b, $rk12
4494
4495 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
4496 ext $h6.16b, $h6.16b, $h6.16b, #8
4497 ext $h7.16b, $h7.16b, $h7.16b, #8
4498 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4499
4500 eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
4501 cmp $main_end_input_ptr, #112
4502 b.gt .L192_dec_blocks_more_than_7
4503
4504 mov $ctr7b, $ctr6b
4505 movi $acc_h.8b, #0
4506 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4507
4508 mov $ctr6b, $ctr5b
4509 mov $ctr5b, $ctr4b
4510 mov $ctr4b, $ctr3b
4511
4512 cmp $main_end_input_ptr, #96
4513 movi $acc_l.8b, #0
4514 mov $ctr3b, $ctr2b
4515
4516 mov $ctr2b, $ctr1b
4517 movi $acc_m.8b, #0
4518 b.gt .L192_dec_blocks_more_than_6
4519
4520 mov $ctr7b, $ctr6b
4521 mov $ctr6b, $ctr5b
4522 mov $ctr5b, $ctr4b
4523
4524 mov $ctr4b, $ctr3b
4525 mov $ctr3b, $ctr1b
4526
4527 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4528 cmp $main_end_input_ptr, #80
4529 b.gt .L192_dec_blocks_more_than_5
4530
4531 mov $ctr7b, $ctr6b
4532 mov $ctr6b, $ctr5b
4533
4534 mov $ctr5b, $ctr4b
4535 mov $ctr4b, $ctr1b
4536 cmp $main_end_input_ptr, #64
4537
4538 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4539 b.gt .L192_dec_blocks_more_than_4
4540
4541 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4542 mov $ctr7b, $ctr6b
4543 mov $ctr6b, $ctr5b
4544
4545 mov $ctr5b, $ctr1b
4546 cmp $main_end_input_ptr, #48
4547 b.gt .L192_dec_blocks_more_than_3
4548
4549 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4550 mov $ctr7b, $ctr6b
4551 cmp $main_end_input_ptr, #32
4552
4553 mov $ctr6b, $ctr1b
4554 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4555 b.gt .L192_dec_blocks_more_than_2
4556
4557 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4558
4559 mov $ctr7b, $ctr1b
4560 cmp $main_end_input_ptr, #16
4561 b.gt .L192_dec_blocks_more_than_1
4562
4563 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4564 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4565 b .L192_dec_blocks_less_than_1
4566.L192_dec_blocks_more_than_7: @ blocks left > 7
4567 rev64 $res0b, $res1b @ GHASH final-7 block
4568
4569 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
4570 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4571
4572 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
4573 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
4574 ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
4575
4576 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
4577
4578 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
4579 st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
4580
4581 eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
4582
4583 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
eb4129e1 4584 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4585.L192_dec_blocks_more_than_6: @ blocks left > 6
4586
4587 rev64 $res0b, $res1b @ GHASH final-6 block
4588
4589 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4590
4591 ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
4592 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
4593
4594 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
eb4129e1 4595 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4596 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
4597
4598 st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
4599 eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
4600
4601 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
4602 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
4603 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
4604
4605 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
4606 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
4607.L192_dec_blocks_more_than_5: @ blocks left > 5
4608
4609 rev64 $res0b, $res1b @ GHASH final-5 block
4610
4611 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4612
4613 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
4614
4615 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
4616
4617 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
4618 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
4619
4620 ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
4621
4622 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
4623 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
4624
4625 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
4626
4627 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
eb4129e1 4628 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4629 st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
4630
4631 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
4632 eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
4633.L192_dec_blocks_more_than_4: @ blocks left > 4
4634
4635 rev64 $res0b, $res1b @ GHASH final-4 block
4636
4637 eor $res0b, $res0b, $t0.16b @ feed in partial tag
eb4129e1 4638 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4639
4640 ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
4641 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
4642 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
4643
4644 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
4645
4646 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
4647
4648 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
4649 st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
4650 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
4651
4652 eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
4653
4654 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
4655 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
4656.L192_dec_blocks_more_than_3: @ blocks left > 3
4657
4658 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4659 ext $h4.16b, $h4.16b, $h4.16b, #8
4660 rev64 $res0b, $res1b @ GHASH final-3 block
4661 ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
4662
4663 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4664
4665 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
4666 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
4667
4668 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
eb4129e1 4669 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4670 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
4671
4672 st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
4673 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4674 eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
4675
4676 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
4677 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4678
4679 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
4680
4681 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
4682
4683 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
4684.L192_dec_blocks_more_than_2: @ blocks left > 2
4685
4686 rev64 $res0b, $res1b @ GHASH final-2 block
4687 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4688 ext $h3.16b, $h3.16b, $h3.16b, #8
4689
4690 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4691
4692 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
4693 ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
4694
4695 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4696
4697 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4698
4699 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4700 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4701
4702 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eb4129e1 4703 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4704
4705 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4706 st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
4707
4708 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4709 eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
4710.L192_dec_blocks_more_than_1: @ blocks left > 1
4711
4712 rev64 $res0b, $res1b @ GHASH final-1 block
4713 ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
4714 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4715 ext $h2.16b, $h2.16b, $h2.16b, #8
4716
4717 eor $res0b, $res0b, $t0.16b @ feed in partial tag
eb4129e1 4718 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
4719 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4720
4721 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4722 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
4723 st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
4724
4725 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4726
4727 eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
4728
4729 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4730
4731 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4732
4733 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4734
4735 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4736
4737 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4738 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4739.L192_dec_blocks_less_than_1: @ blocks left <= 1
4740
4741 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
4742 and $bit_length, $bit_length, #127 @ bit_length %= 128
4743
4744 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4745 str $rtmp_ctrq, [$counter] @ store the updated counter
4746
4747 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4748 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
4749
4750 and $bit_length, $bit_length, #127 @ bit_length %= 128
4751
4752 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
4753 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
4754 cmp $bit_length, #64
4755
4756 csel $temp2_x, $temp1_x, $temp0_x, lt
4757 csel $temp3_x, $temp0_x, xzr, lt
4758 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4759 ext $h1.16b, $h1.16b, $h1.16b, #8
4760
4761 mov $ctr0.d[1], $temp3_x
4762 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4763
4764 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
4765
4766 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4767 bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4768
4769 rev64 $res0b, $res1b @ GHASH final block
4770
4771 st1 { $res4b}, [$output_ptr] @ store all 16B
4772
4773 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4774
4775 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
4776 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4777
4778 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4779 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4780 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4781
4782 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4783 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4784
4785 eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
4786 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4787 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
4788
4789 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4790 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4791
4792 eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
4793
4794 eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
4795
4796 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4797 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4798
4799 eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
4800 ext $acc_lb, $acc_lb, $acc_lb, #8
4801 rev64 $acc_lb, $acc_lb
4802 st1 { $acc_l.16b }, [$current_tag]
4803
4596c20b
TC
4804 mov x0, $byte_length
4805
954f45ba
X
4806 ldp d10, d11, [sp, #16]
4807 ldp d12, d13, [sp, #32]
4808 ldp d14, d15, [sp, #48]
4809 ldp d8, d9, [sp], #80
4810 ret
4811
4812.L192_dec_ret:
4813 mov w0, #0x0
4814 ret
4815.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
4816___
4817}
4818
4819{
4820
# Symbolic register names used by the assembly templates in this scope.
# Several Perl names below map to the same physical register, so within the
# generated code only one member of each aliased group can hold a live value
# at any point.

# General-purpose scratch registers: x4..x7 and x13..x14.
4821my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
4822my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0..v7 are the eight counter/AES-state blocks and v8..v15 the eight result
# blocks; the same registers are named four ways (.16b view, bare vN, dN, qN).
4823my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
4824my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
4825my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
4826my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
4827my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
4828
# $ctr_t0..$ctr_t7 are v8..v15 as well, i.e. they alias $res0..$res7.
4829my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
4830my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
4831my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
4832
# GHASH accumulators (high / mid / low) live in v17..v19.
4833my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
4834my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
4835
# Both hash-key groups (h1..h4 with h12k/h34k, and h5..h8 with h56k/h78k) are
# assigned v20..v25, so the two groups share registers pairwise
# (h1/h5 = v20, h12k/h56k = v21, ..., h4/h8 = v25).
4836my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
4837my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
4838my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
4839my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
4840
# Temporaries. Only $t0 (v16), $t1 (v29) and $t11 (v21) name registers
# directly; the rest reuse result registers or other temporaries.
4841my $t0="v16";
4842my $t0d="d16";
4843
4844my $t1="v29";
4845my $t2=$res1;
4846my $t3=$t1;
4847
4848my $t4=$res0;
4849my $t5=$res2;
4850my $t6=$t0;
4851
4852my $t7=$res3;
4853my $t8=$res4;
4854my $t9=$res5;
4855
# $t11 is v21, the same register as $h12k / $h56k above.
4856my $t10=$res6;
4857my $t11="v21";
4858my $t12=$t1;
4859
# v30 holds the working counter value and v31 the counter increment.
4860my $rtmp_ctr="v30";
4861my $rtmp_ctrq="q30";
4862my $rctr_inc="v31";
4863my $rctr_incd="d31";
4864
# The modulo constant is kept in $t0 (v16), so it also aliases $t6.
4865my $mod_constantd=$t0d;
4866my $mod_constant=$t0;
4867
# All round keys rk0..rk14 are mapped onto just three registers, v26..v28:
# rk(3n) = v26, rk(3n+1) = v27, rk(3n+2) = v28.
4868my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
4869my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
4870my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
4871my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
4872my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
4873my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
4874my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
4875my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
4876my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
4877my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Alternate views of the same round-key registers: v28 as .1q (same register
# as $rk2), v26 as .1q (same register as $rk3), and bare v27 (same as $rk4).
4878my $rk2q1="v28.1q";
4879my $rk3q1="v26.1q";
4880my $rk4v="v27";
4881#########################################################################################
4882# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
4883# size_t len,
4884# unsigned char *out,
4885# const void *key,
4886# unsigned char ivec[16],
4887# u64 *Xi);
4888#
4889$code.=<<___;
4890.global unroll8_eor3_aes_gcm_enc_256_kernel
4891.type unroll8_eor3_aes_gcm_enc_256_kernel,%function
4892.align 4
4893unroll8_eor3_aes_gcm_enc_256_kernel:
4894 AARCH64_VALID_CALL_TARGET
4895 cbz x1, .L256_enc_ret
4896 stp d8, d9, [sp, #-80]!
4596c20b 4897 lsr $byte_length, $bit_length, #3
954f45ba
X
4898 mov $counter, x4
4899 mov $cc, x5
4900 stp d10, d11, [sp, #16]
4901 stp d12, d13, [sp, #32]
4902 stp d14, d15, [sp, #48]
4903 mov x5, #0xc200000000000000
4904 stp x5, xzr, [sp, #64]
4905 add $modulo_constant, sp, #64
4906
4907 ld1 { $ctr0b}, [$counter] @ CTR block 0
4908
4596c20b 4909 mov $main_end_input_ptr, $byte_length
954f45ba
X
4910
4911 mov $constant_temp, #0x100000000 @ set up counter increment
4912 movi $rctr_inc.16b, #0x0
4913 mov $rctr_inc.d[1], $constant_temp
4914 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4915
4916 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4917
4918 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4919
4920 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
4921
4922 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
4923
4924 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
4925 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
4926
4927 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
4928 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
4929
4930 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
4931 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
4932
4933 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
4934 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
4935
4936 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
4937 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
4938 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
4939
4940 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
4941 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
4942
4943 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
4944
4945 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4946 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
4947 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4948
4949 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4950 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4951 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
4952
4953 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
4954 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
4955 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
4956
4957 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
4958 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4959 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4960
4961 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
4962 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
4963
4964 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4965
4966 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
4967
4968 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4969 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4970 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4971
4972 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
4973 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
4974 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
4975
4976 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
4977 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4978 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4979
4980 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
4981 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4982 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
4983
4984 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
4985
4986 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4987 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
4988 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
4989
4990 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4991 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4992
4993 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
4994 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
4995 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4996
4997 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4998 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4999
5000 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
5001 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
5002 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
5003
5004 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
5005 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
5006 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
5007
5008 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
5009 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
5010 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
5011
5012 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
5013 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
5014 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
5015
5016 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
5017 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
5018 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
5019
5020 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
5021 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
5022 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
5023
5024 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
5025 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
5026 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
5027
5028 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
5029 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
5030
5031 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
5032 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
5033 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
5034
5035 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
5036 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
5037
5038 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
5039
5040 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
5041 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
5042
5043 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
5044 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
5045 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
5046
5047 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
5048 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
5049 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
5050
5051 ld1 { $acc_lb}, [$current_tag]
5052 ext $acc_lb, $acc_lb, $acc_lb, #8
5053 rev64 $acc_lb, $acc_lb
5054 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
5055
5056 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
5057 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
5058 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
5059
5060 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
5061 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
5062 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
5063
5064 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
5065
5066 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
5067 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
5068 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
5069
5070 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
5071 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
5072 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
5073
5074 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
5075 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
5076 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
5077
5078 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
5079 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
5080 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
5081
5082 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
5083 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
5084 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
5085
5086 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
5087 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
5088 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
5089
5090 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
5091 ldr $rk14q, [$cc, #224] @ load rk14
5092
5093 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
5094 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
5095 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
5096
5097 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
5098 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
5099 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
5100
5101 aese $ctr2b, $rk13 @ AES block 2 - round 13
5102 aese $ctr1b, $rk13 @ AES block 1 - round 13
5103 aese $ctr4b, $rk13 @ AES block 4 - round 13
5104
5105 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
5106 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
5107
5108 aese $ctr0b, $rk13 @ AES block 0 - round 13
5109 aese $ctr5b, $rk13 @ AES block 5 - round 13
5110
5111 aese $ctr6b, $rk13 @ AES block 6 - round 13
5112 aese $ctr7b, $rk13 @ AES block 7 - round 13
5113 aese $ctr3b, $rk13 @ AES block 3 - round 13
5114
5115 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
5116 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
5117 b.ge .L256_enc_tail @ handle tail
5118
5119 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
5120
5121 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
5122
5123 eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result
5124 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
5125 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
5126
5127 eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result
5128 eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result
5129
5130 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
5131 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
5132 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
5133
5134 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
5135 eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result
5136 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
5137
5138 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
5139 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
5140 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
5141
5142 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
5143
5144 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
5145 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
5146
5147 eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
5148
5149 eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
5150 eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
5151 eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
5152
5153 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
5154 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
5155
5156 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
5157 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
5158 b.ge .L256_enc_prepretail @ do prepretail
5159
5160.L256_enc_main_loop: @ main loop start - each pass runs AES on counter blocks 8k+8..8k+15 and GHASHes the eight results already stored (blocks 8k..8k+7)
5161 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
5162
5163 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
5164 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
5165 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
5166 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
5167
5168 rev64 $res3b, $res3b @ GHASH block 8k+3
5169 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
5170 ext $h5.16b, $h5.16b, $h5.16b, #8
5171 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
5172 ext $h6.16b, $h6.16b, $h6.16b, #8
5173 rev64 $res1b, $res1b @ GHASH block 8k+1
5174
5175 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
5176 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
5177 rev64 $res0b, $res0b @ GHASH block 8k
5178
5179 rev64 $res4b, $res4b @ GHASH block 8k+4
5180 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5181 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
5182 ext $h7.16b, $h7.16b, $h7.16b, #8
5183 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
5184 ext $h8.16b, $h8.16b, $h8.16b, #8
5185
5186 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
5187 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
5188 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
5189
5190 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
5191 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
5192 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
5193
5194 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
5195 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
5196 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
5197
5198 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
5199 eor $res0b, $res0b, $acc_lb @ PRE 1
5200 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
5201
5202 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
5203 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
5204 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
5205
5206 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
5207 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
5208 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
5209
5210 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
5211 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
5212 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
5213
5214 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5215 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5216 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
5217
5218 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
5219 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
5220 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
5221
5222 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
5223 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
5224 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
5225
5226 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
5227 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
5228 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
5229
5230 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
5231 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
5232 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
5233
5234 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
5235 rev64 $res6b, $res6b @ GHASH block 8k+6
5236 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
5237
5238 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
5239 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
5240 rev64 $res2b, $res2b @ GHASH block 8k+2
5241
5242 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
5243 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
5244 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
5245
5246 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
5247 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
5248 rev64 $res5b, $res5b @ GHASH block 8k+5
5249
5250 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
5251 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
5252 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5253 ext $h3.16b, $h3.16b, $h3.16b, #8
5254 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5255 ext $h4.16b, $h4.16b, $h4.16b, #8
5256
5257 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5258 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
5259 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
5260
5261 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
5262 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
5263 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
5264
5265 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
5266 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
5267 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
5268
5269 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5270 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
5271 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
5272
5273 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5274 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
5275 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
5276
5277 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
5278 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
5279 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
5280
5281 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5282 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
5283 rev64 $res7b, $res7b @ GHASH block 8k+7
5284
5285 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
5286 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
5287 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
5288
5289 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
5290 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
5291 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
5292
5293 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
5294 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
5295 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
5296
5297 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
5298 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
5299 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
5300
5301 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
5302 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
5303 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
5304
5305 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
5306 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
5307 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
5308
5309 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
5310 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
5311 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
5312
5313 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
5314 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 5315 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
5316 ext $h2.16b, $h2.16b, $h2.16b, #8
5317 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
5318 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5319
5320 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
5321 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5322 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
5323 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
5324
5325 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
5326 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
5327 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
5328
5329 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5330 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
5331 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
5332
5333 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
5334 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
5335 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
5336
5337 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
5338 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5339 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5340
5341 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
5342 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
5343 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
5344
5345 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
5346 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
5347 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
5348
5349 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
5350 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
5351 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
5352
5353 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
5354 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5355 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
5356
5357 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5358 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
5359 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
5360
5361 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5362 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
5363 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
5364
5365 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
5366 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
5367 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
5368
5369 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
5370 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
5371 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
5372
5373 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
5374 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
5375 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
5376
5377 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
5378
5379 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5380 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
5381 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
5382
5383 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
5384 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
5385 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
5386
5387 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
5388 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
5389 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
5390
5391 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
5392 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
5393 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
5394
5395 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
5396
5397 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
5398 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 (h1-h4 are free once their GHASH low products are folded into the accumulator; they stage the next ctr0-ctr3)
5399
5400 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5401 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
5402 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
5403
5404 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
5405 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
5406 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
5407
5408 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
5409 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
5410
5411 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5412 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
5413
5414 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
5415 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
5416
5417 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
5418 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
5419 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
5420
5421 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
5422 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
5423 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
5424
5425 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
5426 ldr $rk14q, [$cc, #224] @ load rk14
5427 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
5428
5429 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
5430 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
5431 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
5432
5433 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
5434 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
5435 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
5436
5437 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
5438 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
5439 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
5440
5441 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
5442 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
5443 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
5444
5445 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
5446 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
5447 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5448
5449 eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result
5450 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
5451 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
5452
5453 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
5454 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
5455 eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 8k+13 - result
5456
5457 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5458 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5459 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
5460
5461 eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 8k+12 - result
5462 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
5463 eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result
5464
5465 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
5466 eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result
5467 eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result
5468
5469 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
5470 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
5471 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
5472
5473 eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 8k+15 - result
5474 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
5475 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
5476
5477 eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 8k+14 - result
5478 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
5479 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
5480
5481 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
5482 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
5483 b.lt .L256_enc_main_loop @ repeat while input_ptr < main_end_input_ptr (flags from the LOOP CONTROL cmp)
5484
5485.L256_enc_prepretail: @ PREPRETAIL
5486 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
5487 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
5488 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
5489
5490 rev64 $res2b, $res2b @ GHASH block 8k+2
5491
5492 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
5493 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
5494
5495 rev64 $res5b, $res5b @ GHASH block 8k+5
5496 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
5497 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
5498
5499 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
5500
5501 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
5502 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
5503 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
5504
5505 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
5506 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
5507
5508 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
5509 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
5510 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
5511
5512 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5513 rev64 $res0b, $res0b @ GHASH block 8k
5514 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
5515
5516 rev64 $res1b, $res1b @ GHASH block 8k+1
5517 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
5518 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
5519
5520 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
5521 ext $h7.16b, $h7.16b, $h7.16b, #8
5522 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
5523 ext $h8.16b, $h8.16b, $h8.16b, #8
5524 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
5525
5526 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
5527 ext $h5.16b, $h5.16b, $h5.16b, #8
5528 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
5529 ext $h6.16b, $h6.16b, $h6.16b, #8
5530 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
5531 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
5532
5533 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
5534 eor $res0b, $res0b, $acc_lb @ PRE 1
5535
5536 rev64 $res3b, $res3b @ GHASH block 8k+3
5537 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
5538
5539 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
5540 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
5541 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
5542
5543 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
5544 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
5545 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
5546
5547 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
5548 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
5549 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
5550
5551 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
5552 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5553 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
5554
5555 rev64 $res6b, $res6b @ GHASH block 8k+6
5556 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
5557 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
5558
5559 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
5560 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
5561 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5562
5563 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
5564 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
5565
5566 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
5567 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
5568 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
5569
5570 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
5571 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
5572 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
5573
5574 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
5575 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
5576 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
5577
5578 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
5579 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
5580 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
5581
5582 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
5583 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
5584 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
5585
5586 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
5587 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
5588 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
5589
5590 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
5591 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5592 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5593
5594 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
5595 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
5596 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
5597
5598 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
5599 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
5600 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5601
5602 rev64 $res4b, $res4b @ GHASH block 8k+4
5603 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
5604 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
5605
5606 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
5607 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
5608 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
5609
5610 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5611 ext $h3.16b, $h3.16b, $h3.16b, #8
5612 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5613 ext $h4.16b, $h4.16b, $h4.16b, #8
5614 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
5615 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
5616
5617 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
5618 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
5619
5620 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
5621 rev64 $res7b, $res7b @ GHASH block 8k+7
5622 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5623
5624 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
5625 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
5626 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5627
5628 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
5629 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
5630 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
5631
5632 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
5633 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5634 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
5635 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
5636
5637 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
5638 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
5639 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
5640
5641 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
5642 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
5643 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
5644 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 5645 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
5646 ext $h2.16b, $h2.16b, $h2.16b, #8
5647
5648 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
5649 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
5650 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
5651
5652 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
5653 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5654
5655 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
5656 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
5657 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
5658
5659 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
5660 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
5661 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5662
5663 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
5664 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
5665 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
5666
5667 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5668 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5669 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
5670
5671 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
5672 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
5673 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
5674
5675 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
5676 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
5677 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
5678
5679 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
5680 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5681 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
5682
5683 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
5684 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
5685 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
5686
5687 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
5688 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
5689 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
5690
5691 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
5692 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5693 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
5694
5695 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
5696 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
5697 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
5698
5699 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
5700 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5701 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
5702
5703 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
5704
5705 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
5706 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
5707 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
5708
5709 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
5710 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
5711
5712 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
5713 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
5714 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
5715
5716 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
5717 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
5718 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
5719
5720 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
5721 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
5722 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
5723
5724 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5725 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
5726 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
5727
5728 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
5729 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5730 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
5731
5732 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
5733 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
5734 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
5735
5736 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
5737 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
5738 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
5739
5740 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5741 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
5742 ldr $rk14q, [$cc, #224] @ load rk14
5743
5744 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
5745 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
5746 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
5747
5748 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
5749 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
5750 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5751
5752 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
5753 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
5754
5755 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
5756 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
5757 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
5758
5759 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
5760 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
5761 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
5762
5763 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
5764 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
5765 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
5766
5767 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
5768 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
5769.L256_enc_tail: @ TAIL
5770
5771 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
5772 ext $h8.16b, $h8.16b, $h8.16b, #8
5773 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5774
5775 ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
5776
5777 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
5778 ext $h5.16b, $h5.16b, $h5.16b, #8
5779
5780 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5781 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
5782 ext $h6.16b, $h6.16b, $h6.16b, #8
5783 ext $h7.16b, $h7.16b, $h7.16b, #8
5784 mov $t1.16b, $rk14
5785
5786 cmp $main_end_input_ptr, #112
5787 eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
5788 b.gt .L256_enc_blocks_more_than_7
5789
5790 movi $acc_l.8b, #0
5791 mov $ctr7b, $ctr6b
5792 movi $acc_h.8b, #0
5793
5794 mov $ctr6b, $ctr5b
5795 mov $ctr5b, $ctr4b
5796 mov $ctr4b, $ctr3b
5797
5798 mov $ctr3b, $ctr2b
5799 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5800 mov $ctr2b, $ctr1b
5801
5802 movi $acc_m.8b, #0
5803 cmp $main_end_input_ptr, #96
5804 b.gt .L256_enc_blocks_more_than_6
5805
5806 mov $ctr7b, $ctr6b
5807 mov $ctr6b, $ctr5b
5808 cmp $main_end_input_ptr, #80
5809
5810 mov $ctr5b, $ctr4b
5811 mov $ctr4b, $ctr3b
5812 mov $ctr3b, $ctr1b
5813
5814 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5815 b.gt .L256_enc_blocks_more_than_5
5816
5817 mov $ctr7b, $ctr6b
5818 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5819
5820 mov $ctr6b, $ctr5b
5821 mov $ctr5b, $ctr4b
5822
5823 cmp $main_end_input_ptr, #64
5824 mov $ctr4b, $ctr1b
5825 b.gt .L256_enc_blocks_more_than_4
5826
5827 cmp $main_end_input_ptr, #48
5828 mov $ctr7b, $ctr6b
5829 mov $ctr6b, $ctr5b
5830
5831 mov $ctr5b, $ctr1b
5832 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5833 b.gt .L256_enc_blocks_more_than_3
5834
5835 cmp $main_end_input_ptr, #32
5836 mov $ctr7b, $ctr6b
5837 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5838
5839 mov $ctr6b, $ctr1b
5840 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5841 b.gt .L256_enc_blocks_more_than_2
5842
5843 mov $ctr7b, $ctr1b
5844
5845 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5846 cmp $main_end_input_ptr, #16
5847 b.gt .L256_enc_blocks_more_than_1
5848
5849 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5850 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
5851 b .L256_enc_blocks_less_than_1
5852.L256_enc_blocks_more_than_7: @ blocks left > 7
5853 st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
5854
5855 rev64 $res0b, $res1b @ GHASH final-7 block
5856
5857 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5858
5859 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
5860
5861 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
5862 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
5863 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
5864
eb4129e1 5865 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
5866
5867 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
5868 eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
5869
5870 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
5871 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
5872.L256_enc_blocks_more_than_6: @ blocks left > 6
5873
5874 st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
5875
5876 rev64 $res0b, $res1b @ GHASH final-6 block
5877
5878 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5879
5880 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
5881 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
5882 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
5883
5884 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
5885
5886 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
5887
5888 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
5889
5890 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
5891 eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
5892
eb4129e1 5893 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
5894
5895 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
5896 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
5897.L256_enc_blocks_more_than_5: @ blocks left > 5
5898
5899 st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
5900
5901 rev64 $res0b, $res1b @ GHASH final-5 block
5902
5903 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5904
5905 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
5906
5907 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
5908
5909 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
5910 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
5911
5912 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
5913
5914 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
3b5b9199 5915 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
954f45ba
X
5916
5917 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
eb4129e1 5918 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
5919 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
5920
5921 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
5922 eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
5923.L256_enc_blocks_more_than_4: @ blocks left > 4
5924
5925 st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
5926
5927 rev64 $res0b, $res1b @ GHASH final-4 block
5928
5929 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
5930
5931 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5932
5933 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
5934 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
5935
5936 eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
5937 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
5938
5939 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
5940 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
5941
5942 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
5943
eb4129e1 5944 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
5945
5946 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
5947 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
5948.L256_enc_blocks_more_than_3: @ blocks left > 3
5949
5950 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
5951
5952 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5953 ext $h4.16b, $h4.16b, $h4.16b, #8
5954 rev64 $res0b, $res1b @ GHASH final-3 block
5955
5956 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5957
5958 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
5959 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
5960
5961 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
5962 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5963 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5964
5965 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
5966 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
5967
5968 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
5969 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
5970
5971 eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
eb4129e1 5972 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
5973
5974 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
5975 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
5976.L256_enc_blocks_more_than_2: @ blocks left > 2
5977
5978 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5979 ext $h3.16b, $h3.16b, $h3.16b, #8
5980
5981 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
5982
5983 rev64 $res0b, $res1b @ GHASH final-2 block
5984 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
5985
5986 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5987
5988 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
5989
eb4129e1 5990 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
5991
5992 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5993 eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
5994
5995 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5996
5997 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5998
5999 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
6000 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
6001
6002 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
6003 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
6004.L256_enc_blocks_more_than_1: @ blocks left > 1
6005
6006 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
6007
3b5b9199 6008 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
6009 ext $h2.16b, $h2.16b, $h2.16b, #8
6010 rev64 $res0b, $res1b @ GHASH final-1 block
6011 ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
6012
6013 eor $res0b, $res0b, $t0.16b @ feed in partial tag
eb4129e1 6014 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
6015
6016 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
6017 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
6018
6019 eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
6020 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
6021
6022 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
6023 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
6024
6025 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
6026
6027 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
6028 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
6029
6030 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
6031
6032 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
6033.L256_enc_blocks_less_than_1: @ blocks left <= 1
6034
6035 and $bit_length, $bit_length, #127 @ bit_length %= 128
6036
6037 sub $bit_length, $bit_length, #128 @ bit_length -= 128
6038
6039 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
6040
6041 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
6042 and $bit_length, $bit_length, #127 @ bit_length %= 128
6043
6044 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
6045 cmp $bit_length, #64
6046 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
6047
6048 csel $temp3_x, $temp0_x, xzr, lt
6049 csel $temp2_x, $temp1_x, $temp0_x, lt
6050
6051 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
6052 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
6053 ext $h1.16b, $h1.16b, $h1.16b, #8
6054
6055 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
6056 mov $ctr0.d[1], $temp3_x
6057
6058 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
6059
6060 rev64 $res0b, $res1b @ GHASH final block
6061
6062 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
6063 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
6064 str $rtmp_ctrq, [$counter] @ store the updated counter
6065
6066 eor $res0b, $res0b, $t0.16b @ feed in partial tag
6067 st1 { $res1b}, [$output_ptr] @ store all 16B
6068
6069 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
6070 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
6071 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
6072
6073 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
6074 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
6075
6076 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
6077
6078 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
6079
6080 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
6081 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
6082
6083 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
6084
6085 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
6086 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
6087
6088 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
6089
6090 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6091 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6092
6093 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
6094 ext $acc_lb, $acc_lb, $acc_lb, #8
6095 rev64 $acc_lb, $acc_lb
6096 st1 { $acc_l.16b }, [$current_tag]
4596c20b 6097 mov x0, $byte_length @ return sizes
954f45ba
X
6098
6099 ldp d10, d11, [sp, #16]
6100 ldp d12, d13, [sp, #32]
6101 ldp d14, d15, [sp, #48]
6102 ldp d8, d9, [sp], #80
6103 ret
6104
6105.L256_enc_ret:
6106 mov w0, #0x0
6107 ret
6108.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6109___
6110
6111{
6112#########################################################################################
6113# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
6114# size_t len, /* length in bits; byte count is len >> 3 */
6115# unsigned char *out,
6116# u64 *Xi,
6117# unsigned char ivec[16],
6118# const void *key);
6119#
6120$code.=<<___;
6121.global unroll8_eor3_aes_gcm_dec_256_kernel
6122.type unroll8_eor3_aes_gcm_dec_256_kernel,%function
6123.align 4
6124unroll8_eor3_aes_gcm_dec_256_kernel:
6125 AARCH64_VALID_CALL_TARGET
6126 cbz x1, .L256_dec_ret
6127 stp d8, d9, [sp, #-80]!
4596c20b 6128 lsr $byte_length, $bit_length, #3
954f45ba
X
6129 mov $counter, x4
6130 mov $cc, x5
6131 stp d10, d11, [sp, #16]
6132 stp d12, d13, [sp, #32]
6133 stp d14, d15, [sp, #48]
6134 mov x5, #0xc200000000000000
6135 stp x5, xzr, [sp, #64]
6136 add $modulo_constant, sp, #64
6137
6138 ld1 { $ctr0b}, [$counter] @ CTR block 0
6139
6140 mov $constant_temp, #0x100000000 @ set up counter increment
6141 movi $rctr_inc.16b, #0x0
6142 mov $rctr_inc.d[1], $constant_temp
4596c20b 6143 mov $main_end_input_ptr, $byte_length
954f45ba
X
6144
6145 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
6146
6147 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
6148
6149 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
6150
6151 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
6152 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
6153
6154 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
6155 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
6156 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
6157
6158 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
6159 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
6160
6161 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
6162 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
6163
6164 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
6165
6166 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
6167 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
6168
6169 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
6170 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
6171
6172 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
6173 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
6174
6175 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
6176 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
6177
6178 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
6179 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
6180
6181 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
6182 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
6183 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
6184
6185 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
6186 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
6187 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
6188
6189 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
6190 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
6191 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
6192
6193 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
6194 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
6195
6196 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
6197 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
6198 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
6199
6200 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
6201 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
6202 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
6203
6204 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
6205 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
3b5b9199 6206 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
954f45ba
X
6207
6208 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
6209 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
6210
6211 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
6212 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
6213
6214 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
6215 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
6216 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
6217
6218 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
6219
6220 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
6221 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
6222
6223 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
6224 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
6225 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
6226
6227 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
6228 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
6229 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
6230
6231 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
6232 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
6233
6234 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
6235 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
6236 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
6237
6238 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
6239
6240 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
6241 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
6242
6243 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
6244
6245 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
6246 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
6247 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
6248
6249 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
6250 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
6251 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
6252
6253 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
6254 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
6255 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
6256
6257 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
6258 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
6259
6260 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
6261 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
6262 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
6263
6264 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
6265 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
6266 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
6267
6268 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
6269 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
6270 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
6271
6272 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
6273 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
6274 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
6275
6276 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
6277 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
6278 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
6279
6280 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
6281
6282 ld1 { $acc_lb}, [$current_tag]
6283 ext $acc_lb, $acc_lb, $acc_lb, #8
6284 rev64 $acc_lb, $acc_lb
6285 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
6286 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
6287 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
6288
6289 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
6290 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
6291
6292 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
6293 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
6294
6295 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
6296
6297 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
6298 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
6299
6300 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
6301 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
6302 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
6303
6304 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
6305 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
6306 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
6307
6308 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
6309 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
6310 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
6311
6312 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
6313 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
6314
6315 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
6316 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
6317 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
6318
6319 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
6320 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
6321 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
6322
6323 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
6324 ldr $rk14q, [$cc, #224] @ load rk14
6325
6326 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
6327 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
6328 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
6329
6330 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
6331 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
6332 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
6333
6334 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
6335 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
6336 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
6337
6338 aese $ctr5b, $rk13 @ AES block 5 - round 13
6339 aese $ctr1b, $rk13 @ AES block 1 - round 13
6340 aese $ctr2b, $rk13 @ AES block 2 - round 13
6341
6342 aese $ctr0b, $rk13 @ AES block 0 - round 13
6343 aese $ctr4b, $rk13 @ AES block 4 - round 13
6344 aese $ctr6b, $rk13 @ AES block 6 - round 13
6345
6346 aese $ctr3b, $rk13 @ AES block 3 - round 13
6347 aese $ctr7b, $rk13 @ AES block 7 - round 13
6348 b.ge .L256_dec_tail @ handle tail
6349
6350 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
6351
6352 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
6353
6354 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
6355
6356 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
6357 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
6358
6359 eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result
6360 eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result
6361 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
6362
6363 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
6364 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
6365 eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result
6366
6367 eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result
6368
6369 eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result
6370 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
6371 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
6372
6373 eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result
6374 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
6375
6376 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
6377 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
6378
6379 eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result
6380
6381 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
6382 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
6383 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
6384
6385 eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result
6386 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
6387
6388 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
6389 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
6390 b.ge .L256_dec_prepretail @ do prepretail
6391
6392.L256_dec_main_loop: @ main loop start
6393 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
6394 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
6395 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
6396
6397 rev64 $res1b, $res1b @ GHASH block 8k+1
6398 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
6399 ext $h7.16b, $h7.16b, $h7.16b, #8
6400 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
6401 ext $h8.16b, $h8.16b, $h8.16b, #8
6402
6403 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
6404 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
6405 rev64 $res0b, $res0b @ GHASH block 8k
6406
6407 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
6408 rev64 $res4b, $res4b @ GHASH block 8k+4
6409 rev64 $res3b, $res3b @ GHASH block 8k+3
6410
6411 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
6412 rev64 $res7b, $res7b @ GHASH block 8k+7
6413
6414 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
6415 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
6416 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
6417
6418 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
6419 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
6420 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
6421
6422 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
6423 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
6424 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
6425
6426 eor $res0b, $res0b, $acc_lb @ PRE 1
6427 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
6428 ext $h5.16b, $h5.16b, $h5.16b, #8
6429 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
6430 ext $h6.16b, $h6.16b, $h6.16b, #8
6431 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
6432
6433 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
6434 rev64 $res2b, $res2b @ GHASH block 8k+2
6435 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
6436
6437 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
6438 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
6439 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
6440
6441 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6442 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
6443 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
6444
6445 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
6446 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
6447 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
6448
6449 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
6450 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
6451 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
6452
6453 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
6454 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
6455 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
6456
6457 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
6458 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
6459 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
6460
6461 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
6462 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
6463 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
6464
6465 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
6466 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
6467 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
6468
6469 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
6470 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
6471 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6472
6473 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
6474 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
6475 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
6476
6477 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
6478 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
6479 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
6480
6481 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
6482 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
6483 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
6484
6485 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
6486 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
6487 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
6488
6489 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
6490 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
6491 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
6492 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
6493
6494 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
6495 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
6496 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
6497
6498 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
6499 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
6500 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
6501
6502 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
6503 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
6504 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
6505
6506 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
6507 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6508 rev64 $res5b, $res5b @ GHASH block 8k+5
6509
6510 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
6511 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
6512 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6513
6514 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
6515 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
6516 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
6517
6518 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6519 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
6520 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
6521
6522 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6523 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
6524 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
6525
6526 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
6527 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
6528 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
6529
6530 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
6531 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
6532 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
6533
6534 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
6535 ext $h3.16b, $h3.16b, $h3.16b, #8
6536 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
6537 ext $h4.16b, $h4.16b, $h4.16b, #8
6538 rev64 $res6b, $res6b @ GHASH block 8k+6
6539 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
6540
6541 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
6542 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
6543 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
6544
6545 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
6546 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 6547 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
6548 ext $h2.16b, $h2.16b, $h2.16b, #8
6549 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6550 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
6551
6552 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
6553 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
6554 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
6555
6556 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
6557 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
6558 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
6559 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
6560
6561 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
6562 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
6563 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6564
6565 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
6566 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
6567 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
6568
6569 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
6570 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
6571 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
6572
6573 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
6574 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
6575 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
6576
6577 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6578 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
6579 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
6580
6581 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
6582 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
6583 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6584
6585 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
6586 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
6587 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
6588
6589 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
6590 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6591 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
6592
6593 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
6594 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6595 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
6596
6597 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
6598 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
6599 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
6600
6601 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
6602 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
6603 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
6604
6605 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
6606 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
6607 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
6608
6609 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
6610 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
6611 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
6612
6613 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
6614 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6615 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
6616
6617 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
6618 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
6619 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
6620
6621 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
6622 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
6623 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
6624
6625 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
6626 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
6627 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
6628
6629 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
6630 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
6631 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
6632
6633 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
6634 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
6635
6636 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6637 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
6638 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
6639
6640 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
6641 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
6642 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
6643
6644 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
6645 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
6646 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
6647
6648 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
6649 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
6650 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
6651
6652 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
6653 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
6654 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
6655
6656 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
6657 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
6658 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
6659
6660 ldr $rk14q, [$cc, #224] @ load rk14
6661 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
6662 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
6663
6664 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
6665 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
6666 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
6667
6668 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
6669 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
6670 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
6671
6672 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
6673 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
6674 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
6675
6676 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
6677 eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result
6678 eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result
6679
6680 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6681 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
6682
6683 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
6684 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6685 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
6686
6687 eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result
6688 eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result
6689 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
6690
6691 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
6692 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
6693 eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result
6694
6695 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
6696 eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result
6697 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
6698
6699 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
6700 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
6701 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
6702
6703 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
6704 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
6705 eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result
6706
6707 eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result
6708 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
6709 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
6710
6711 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
6712 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
6713 b.lt .L256_dec_main_loop
6714
6715.L256_dec_prepretail: @ PREPRETAIL
6716 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
6717 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
6718 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
6719
6720 rev64 $res4b, $res4b @ GHASH block 8k+4
6721 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
6722 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
6723
6724 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
6725 rev64 $res0b, $res0b @ GHASH block 8k
6726 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
6727
6728 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
6729 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
6730 ext $h7.16b, $h7.16b, $h7.16b, #8
6731 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
6732 ext $h8.16b, $h8.16b, $h8.16b, #8
6733 rev64 $res1b, $res1b @ GHASH block 8k+1
6734
6735 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
6736 rev64 $res2b, $res2b @ GHASH block 8k+2
6737 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
6738 ext $h5.16b, $h5.16b, $h5.16b, #8
6739 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
6740 ext $h6.16b, $h6.16b, $h6.16b, #8
6741
6742 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
6743 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
6744 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
6745
6746 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
6747 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
6748 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
6749
6750 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
6751 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
6752 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
6753
6754 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
6755 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
6756 eor $res0b, $res0b, $acc_lb @ PRE 1
6757
6758 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
6759 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
6760 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
6761
6762 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
6763 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
6764 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
6765
6766 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
6767 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6768 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
6769
6770 rev64 $res3b, $res3b @ GHASH block 8k+3
6771 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
6772
6773 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
6774 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
6775 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
6776
6777 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
6778 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
6779 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
6780
6781 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
6782 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
6783
6784 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
6785 rev64 $res6b, $res6b @ GHASH block 8k+6
6786
6787 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
6788 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
6789 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
6790
6791 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
6792 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6793 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
6794
6795 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
6796 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
6797 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
6798
6799 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
6800 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
6801 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
6802
6803 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
6804 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
6805 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
6806
6807 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
6808 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6809 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6810
6811 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
6812 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
6813 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
6814
6815 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
6816 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
6817 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
6818
6819 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
6820 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
6821 ext $h1.16b, $h1.16b, $h1.16b, #8
3b5b9199 6822 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
6823 ext $h2.16b, $h2.16b, $h2.16b, #8
6824 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
6825
6826 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
6827 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
6828 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
6829
6830 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6831 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
6832 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
6833
6834 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
6835 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
6836 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
6837
6838 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
6839 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
6840 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
6841
6842 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
6843 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
6844 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
6845
6846 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
6847 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
6848 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
6849
6850 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
6851 ext $h3.16b, $h3.16b, $h3.16b, #8
6852 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
6853 ext $h4.16b, $h4.16b, $h4.16b, #8
6854 rev64 $res7b, $res7b @ GHASH block 8k+7
6855 rev64 $res5b, $res5b @ GHASH block 8k+5
6856
6857 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6858
6859 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6860
6861 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
6862 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
6863 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
6864 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
6865
6866 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
6867 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
6868
6869 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
6870 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
6871 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
6872
6873 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6874 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
6875 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6876
6877 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
6878 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
6879 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
6880
6881 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
6882 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
6883 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
6884
6885 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
6886 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
6887 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
6888
6889 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
6890 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
6891
6892 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
6893 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
6894 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
6895
6896 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
6897 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6898 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
6899
6900 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
6901 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
6902 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
6903
6904 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
6905 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
6906 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
6907
6908 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
6909 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
6910 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6911
6912 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
6913 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
6914 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6915
6916 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
6917 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
6918 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
6919
6920 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
6921 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
6922 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
6923
6924 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
6925 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
6926 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
6927
6928 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
6929 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
6930 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6931
6932 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
6933 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
6934 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
6935
6936 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
6937 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
6938 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
6939
6940 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6941
6942 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
6943 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
6944 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
6945
6946 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
6947 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
6948 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
6949
6950 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
6951
6952 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
6953 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
6954 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
6955
6956 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
6957
6958 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
6959 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
6960 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
6961
6962 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
6963 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
6964
6965 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
6966 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
6967 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
6968
6969 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
6970 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
6971
6972 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
6973
6974 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
6975 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
6976 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
6977
6978 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6979 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
6980 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
6981
6982 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
6983 ldr $rk14q, [$cc, #224] @ load rk14
6984 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
6985
6986 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
6987 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6988 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
6989
6990 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
6991 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
6992 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
6993
6994 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
6995 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
6996 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
6997
6998 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
6999 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
7000.L256_dec_tail: @ TAIL
7001
7002 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
7003 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
7004 cmp $main_end_input_ptr, #112
7005
7006 ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
7007
3b5b9199 7008 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
954f45ba
X
7009 ext $h8.16b, $h8.16b, $h8.16b, #8
7010 mov $t1.16b, $rk14
7011
7012 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
7013 ext $h5.16b, $h5.16b, $h5.16b, #8
7014
7015 eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
7016 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
7017 ext $h6.16b, $h6.16b, $h6.16b, #8
7018 ext $h7.16b, $h7.16b, $h7.16b, #8
7019 b.gt .L256_dec_blocks_more_than_7
7020
7021 mov $ctr7b, $ctr6b
7022 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7023 mov $ctr6b, $ctr5b
7024
7025 mov $ctr5b, $ctr4b
7026 mov $ctr4b, $ctr3b
7027 movi $acc_l.8b, #0
7028
7029 movi $acc_h.8b, #0
7030 movi $acc_m.8b, #0
7031 mov $ctr3b, $ctr2b
7032
7033 cmp $main_end_input_ptr, #96
7034 mov $ctr2b, $ctr1b
7035 b.gt .L256_dec_blocks_more_than_6
7036
7037 mov $ctr7b, $ctr6b
7038 mov $ctr6b, $ctr5b
7039
7040 mov $ctr5b, $ctr4b
7041 cmp $main_end_input_ptr, #80
7042 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7043
7044 mov $ctr4b, $ctr3b
7045 mov $ctr3b, $ctr1b
7046 b.gt .L256_dec_blocks_more_than_5
7047
7048 cmp $main_end_input_ptr, #64
7049 mov $ctr7b, $ctr6b
7050 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7051
7052 mov $ctr6b, $ctr5b
7053
7054 mov $ctr5b, $ctr4b
7055 mov $ctr4b, $ctr1b
7056 b.gt .L256_dec_blocks_more_than_4
7057
7058 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7059 mov $ctr7b, $ctr6b
7060 cmp $main_end_input_ptr, #48
7061
7062 mov $ctr6b, $ctr5b
7063 mov $ctr5b, $ctr1b
7064 b.gt .L256_dec_blocks_more_than_3
7065
7066 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
7067 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7068 mov $ctr7b, $ctr6b
7069
7070 cmp $main_end_input_ptr, #32
7071 mov $ctr6b, $ctr1b
7072 b.gt .L256_dec_blocks_more_than_2
7073
7074 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7075
7076 mov $ctr7b, $ctr1b
7077 cmp $main_end_input_ptr, #16
7078 b.gt .L256_dec_blocks_more_than_1
7079
7080 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7081 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
7082 b .L256_dec_blocks_less_than_1
7083.L256_dec_blocks_more_than_7: @ blocks left > 7
7084 rev64 $res0b, $res1b @ GHASH final-7 block
7085 ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
7086 st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
7087
7088 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
7089
7090 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7091
7092 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
7093 eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
7094
7095 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
7096
7097 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
eb4129e1 7098 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7099
7100 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
7101 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
7102.L256_dec_blocks_more_than_6: @ blocks left > 6
7103
7104 rev64 $res0b, $res1b @ GHASH final-6 block
7105
7106 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7107 ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
eb4129e1 7108 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7109
7110 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
7111 st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
7112 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
7113
7114 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
7115
7116 eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
7117 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
7118 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
7119
7120 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
7121
7122 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
7123 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
7124.L256_dec_blocks_more_than_5: @ blocks left > 5
7125
7126 rev64 $res0b, $res1b @ GHASH final-5 block
7127
7128 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7129
7130 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
7131 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
7132
7133 ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
7134
7135 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
7136 st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
7137
7138 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
7139 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
7140
7141 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
7142
7143 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
7144 eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
7145 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
7146
7147 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
eb4129e1 7148 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7149.L256_dec_blocks_more_than_4: @ blocks left > 4
7150
7151 rev64 $res0b, $res1b @ GHASH final-4 block
7152
7153 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7154
7155 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
7156 ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
7157
eb4129e1 7158 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7159
7160 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
7161 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
7162
7163 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
7164
7165 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
7166
7167 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
7168
7169 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
7170 st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
7171
7172 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
7173 eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
7174.L256_dec_blocks_more_than_3: @ blocks left > 3
7175
7176 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
7177 ext $h4.16b, $h4.16b, $h4.16b, #8
7178 rev64 $res0b, $res1b @ GHASH final-3 block
7179
7180 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7181 ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
7182 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
7183
7184 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
7185 st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
7186
7187 eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
7188
7189 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
7190
7191 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
7192 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
7193 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
7194
eb4129e1 7195 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7196 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
7197 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
7198
7199 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
7200
7201 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
7202.L256_dec_blocks_more_than_2: @ blocks left > 2
7203
7204 rev64 $res0b, $res1b @ GHASH final-2 block
7205
7206 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
7207 ext $h3.16b, $h3.16b, $h3.16b, #8
7208 ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
7209
7210 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7211
7212 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
7213
7214 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
7215 st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
7216 eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
7217
7218 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
7219 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eb4129e1 7220 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7221
7222 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
7223 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
7224
7225 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
7226 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
7227.L256_dec_blocks_more_than_1: @ blocks left > 1
7228
7229 rev64 $res0b, $res1b @ GHASH final-1 block
7230
7231 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7232
7233 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
3b5b9199 7234 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
954f45ba
X
7235 ext $h2.16b, $h2.16b, $h2.16b, #8
7236
7237 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
7238 ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
7239 st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
7240
7241 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
7242 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
7243
7244 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
7245
7246 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
7247
7248 eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
7249 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
7250
7251 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
7252
eb4129e1 7253 movi $t0.8b, #0 @ suppress further partial tag feed in
954f45ba
X
7254 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
7255
7256 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
7257.L256_dec_blocks_less_than_1: @ blocks left <= 1
7258
7259 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
7260 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
7261 and $bit_length, $bit_length, #127 @ bit_length %= 128
7262
7263 sub $bit_length, $bit_length, #128 @ bit_length -= 128
7264 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
7265 str $rtmp_ctrq, [$counter] @ store the updated counter
7266
7267 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
7268
7269 and $bit_length, $bit_length, #127 @ bit_length %= 128
7270
7271 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
7272 cmp $bit_length, #64
7273 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
7274
7275 csel $temp3_x, $temp0_x, xzr, lt
7276 csel $temp2_x, $temp1_x, $temp0_x, lt
7277
7278 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
7279 mov $ctr0.d[1], $temp3_x
7280
7281 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
7282 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
7283 ext $h1.16b, $h1.16b, $h1.16b, #8
7284 bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
7285
7286 rev64 $res0b, $res1b @ GHASH final block
7287
7288 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7289
7290 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
7291 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
7292
7293 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
7294
7295 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
7296 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
7297
7298 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
7299
7300 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
7301 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
7302 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
7303
7304 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
7305 eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
7306
7307 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
7308 st1 { $res4b}, [$output_ptr] @ store all 16B
7309
7310 eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
7311
7312 eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid
7313 eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid
7314
7315 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
7316
7317 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
7318 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
7319
7320 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
7321 ext $acc_lb, $acc_lb, $acc_lb, #8
7322 rev64 $acc_lb, $acc_lb
7323 st1 { $acc_l.16b }, [$current_tag]
4596c20b 7324 mov x0, $byte_length
954f45ba
X
7325
7326 ldp d10, d11, [sp, #16]
7327 ldp d12, d13, [sp, #32]
7328 ldp d14, d15, [sp, #48]
7329 ldp d8, d9, [sp], #80
7330 ret
7331
7332.L256_dec_ret:
7333 mov w0, #0x0
7334 ret
7335.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
7336___
7337}
7338}
7339
# Trailer appended to the generated assembly text accumulated in $code:
# an identification string, a 4-byte alignment directive (.align 2), and
# the #endif that closes a preprocessor conditional.
# NOTE(review): the matching #if is presumably emitted near the top of
# $code, outside this part of the file -- confirm.
# The heredoc interpolates, so "\@" yields a literal "@" in the output.
$code.=<<___;
.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
.align 2
#endif
___
7345
# Post-processing pass: walks the generated assembly in $code line by line,
# converts comment style, evaluates `...` expressions, hand-encodes the
# SHA3-extension instructions as raw .inst words, and prints the result.
{
    # Base opcode of each SHA3-extension instruction; unsha3() ORs the
    # operand fields into it.
    my %opcode = (
        "rax1" => 0xce608c00, "eor3" => 0xce000000,
        "bcax" => 0xce200000, "xar"  => 0xce800000 );

    # unsha3($mnemonic, $arg)
    #
    # Encode one rax1/eor3/bcax/xar instruction as a ".inst 0x........"
    # directive, keeping the original text as a trailing "//" comment.
    #
    #   $mnemonic - key into %opcode
    #   $arg      - operand text, e.g. "v1.16b, v2.16b, v3.16b, v4.16b"
    #
    # The number of the 1st operand lands in bits 0.., the 2nd is shifted
    # left by 5, the 3rd by 16 and the 4th (a register number or a "#imm"
    # expression) by 10.  Missing 3rd/4th operands contribute zero.
    #
    # Returns the ".inst" line.  If the operands cannot be parsed the
    # instruction is returned as plain "mnemonic<TAB>operands" instead of
    # an empty string, so that it is never silently dropped from the
    # output; the assembler then gets to accept or reject it.
    sub unsha3 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/) {
            # Copy the captures before anything else can clobber them.
            my ($op1,$op2,$op3,$op4) = ($1,$2,$3,$4);

            $op3 = 0 unless defined $op3;

            # The 4th operand may be an arithmetic expression such as
            # "64-21".  The capture is limited to digits and '-', which
            # bounds what the string eval can execute.
            my $field4 = defined $op4 ? eval($op4) : 0;
            $field4 = 0 unless defined $field4;   # unparsable expression

            return sprintf ".inst\t0x%08x\t//%s %s",
                $opcode{$mnemonic}|$op1|($op2<<5)|($op3<<16)|($field4<<10),
                $mnemonic,$arg;
        }

        return "$mnemonic\t$arg";
    }

    # unvmov($arg)
    #
    # Translate a "qN#lo|hi, qM#lo|hi" operand pair into an
    # "ins vX.d[i],vY.d[j]" instruction: "lo" selects lane 0, "hi" lane 1,
    # and register numbers of 8 and above are offset by 8.
    # Returns a false value when $arg does not have that form.
    # Not called from the loop below.
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/ &&
        sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }

    foreach(split("\n",$code)) {
        s/@\s/\/\//;                    # old->new style commentary
        s/\`([^\`]*)\`/eval($1)/ge;     # evaluate `...` expressions

        # ld1r lines get their .16b arrangement rewritten to .2d; every
        # other line (and ld1r lines without .16b) is checked for a
        # SHA3-extension instruction to hand-encode.
        m/\bld1r\b/ and s/\.16b/.2d/g or
        s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
        print $_,"\n";
    }
}
7377
# Closing STDOUT explicitly flushes the buffered assembly output and turns
# a failed write into a fatal error rather than silent truncation.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush