]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
Fix incorrect comments in aes-gcm-armv8-unroll8_64.pl
[thirdparty/openssl.git] / crypto / modes / asm / aes-gcm-armv8-unroll8_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 #========================================================================
11 # Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
12 # derived from https://github.com/ARM-software/AArch64cryptolib, original
13 # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14 # licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
15 # obtain it.
16 #========================================================================
17 #
18 # Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
19 # Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
20 # intermediate hashes from the 8 blocks.
21 #
22 # ____________________________________________________
23 # | |
24 # | PRE |
25 # |____________________________________________________|
26 # | | | |
27 # | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
28 # |________________|________________|__________________|
29 # | | | |
30 # | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
31 # |________________|________________|__________________|
32 # | | | |
33 # | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
34 # |________________|________________|__________________|
35 # | | | |
36 # | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
37 # |________________|________________|__________________|
38 # | | | |
39 # | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
40 # |________________|________________|__________________|
41 # | | | |
42 # | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
43 # |________________|________________|__________________|
44 # | | | |
45 # | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
46 # |________________|________________|__________________|
47 # | | | |
48 # | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
49 # |________________|____(mostly)____|__________________|
50 # | |
51 # | MODULO |
52 # |____________________________________________________|
53 #
54 # PRE:
55 # Ensure previous generated intermediate hash is aligned and merged with result for GHASH 8k+0
56 # EXT low_acc, low_acc, low_acc, #8
57 # EOR res_curr (8k+0), res_curr (8k+0), low_acc
58 #
59 # CTR block:
60 # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
61 # REV ctr32, rev_ctr32
62 # ORR ctr64, constctr96_top32, ctr32, LSL #32
63 # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
64 # INS ctr_next.d[1], ctr64X
65 # ADD rev_ctr32, #1
66 #
67 # AES block:
68 # Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below as an example.
69 # Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
70 # Given we are very constrained in our ASIMD registers this is quite important
71 #
72 # Encrypt:
73 # LDR input_low, [ input_ptr ], #8
74 # LDR input_high, [ input_ptr ], #8
75 # EOR input_low, k14_low
76 # EOR input_high, k14_high
77 # INS res_curr.d[0], input_low
78 # INS res_curr.d[1], input_high
79 # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
80 # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
81 # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
82 # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
83 # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
84 # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
85 # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
86 # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
87 # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
88 # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
89 # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
90 # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
91 # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
92 # AESE ctr_curr, k13
93 # EOR res_curr, res_curr, ctr_curr
94 # ST1 { res_curr.16b }, [ output_ptr ], #16
95 #
96 # Decrypt:
97 # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
98 # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
99 # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
100 # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
101 # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
102 # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
103 # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
104 # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
105 # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
106 # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
107 # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
108 # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
109 # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
110 # AESE ctr_curr, k13
111 # LDR res_curr, [ input_ptr ], #16
112 # EOR res_curr, res_curr, ctr_curr
113 # MOV output_low, res_curr.d[0]
114 # MOV output_high, res_curr.d[1]
115 # EOR output_low, k14_low
116 # EOR output_high, k14_high
117 # STP output_low, output_high, [ output_ptr ], #16
118
119 # GHASH block X:
120 # Do 128b karatsuba polynomial multiplication on block
121 # We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
122 #
123 # multiplication:
124 # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
125 #
126 # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
127 # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
128 #
129 # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
130 # multiplying with "twisted" powers of H
131 #
132 # Note: We can PMULL directly into the acc_x in first GHASH of the loop
133 # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
134 # path latency dominates the performance
135 #
136 # This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
137 # than indicated here
138 # REV64 res_curr, res_curr
139 # INS t_m.d[0], res_curr.d[1]
140 # EOR t_m.8B, t_m.8B, res_curr.8B
141 # PMULL2 t_h, res_curr, HX
142 # PMULL t_l, res_curr, HX
143 # PMULL t_m, t_m, HX_k
144 # EOR acc_h, acc_h, t_h
145 # EOR acc_l, acc_l, t_l
146 # EOR acc_m, acc_m, t_m
147 #
148 # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
149 # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
150 # with a reversed constant
151 # EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing
152 # PMULL t_mod, acc_h, mod_constant
153 # EXT acc_h, acc_h, acc_h, #8
154 # EOR3 acc_m, acc_m, t_mod, acc_h
155 # PMULL acc_h, acc_m, mod_constant
156 # EXT acc_m, acc_m, acc_m, #8
157 # EOR3 acc_l, acc_l, acc_m, acc_h
158
# Command line: [flavour] [output]
#   $output  - last argument, taken only when it ends in a ".ext" suffix;
#              undef otherwise.
#   $flavour - first remaining argument, taken only when it contains no dot;
#              undef otherwise.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first next to this script, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# This generator only emits AArch64 code.
die "only for 64 bit" if $flavour !~ /64/;

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# $output is quoted so that paths containing spaces survive the shell, and a
# failure to start the translator is reported instead of being ignored.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
171
# Opening lines of the generated assembly: pull in arm_arch.h, open the
# __ARM_MAX_ARCH__ guard, select the target architecture extension and
# switch to the text section.
$code = join("\n",
    '#include "arm_arch.h"',
    '',
    '#if __ARM_MAX_ARCH__>=8',
    '.arch armv8.2-a+crypto',
    '.text',
    '');
178
# Symbolic names for the general-purpose registers used by the generated code.
# x0-x3 form the argument block.
($input_ptr, $bit_length, $output_ptr, $current_tag) = map { "x$_" } 0 .. 3;

# Remaining scalar roles: round-key pointer, modulo-constant pointer,
# constant scratch and counter pointer.
($cc, $modulo_constant) = ("x8", "x10");
($constant_temp, $counter) = ("x15", "x16");
187 {
# Scalar scratch registers.
my ($end_input_ptr, $main_end_input_ptr, $temp0_x, $temp1_x) = map { "x$_" } 4 .. 7;
my ($temp2_x, $temp3_x) = map { "x$_" } 13 .. 14;

# v0-v7 are the eight counter blocks, v8-v15 the eight result blocks.
# Each register gets one name per operand spelling used in the assembly text.
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $ctr4b, $ctr5b, $ctr6b, $ctr7b,
    $res0b, $res1b, $res2b, $res3b, $res4b, $res5b, $res6b, $res7b)
    = map { "v$_.16b" } 0 .. 15;
my ($ctr0, $ctr1, $ctr2, $ctr3, $ctr4, $ctr5, $ctr6, $ctr7,
    $res0, $res1, $res2, $res3, $res4, $res5, $res6, $res7)
    = map { "v$_" } 0 .. 15;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $ctr4d, $ctr5d, $ctr6d, $ctr7d)
    = map { "d$_" } 0 .. 7;
my ($ctr0q, $ctr1q, $ctr2q, $ctr3q, $ctr4q, $ctr5q, $ctr6q, $ctr7q)
    = map { "q$_" } 0 .. 7;
my ($res0q, $res1q, $res2q, $res3q, $res4q, $res5q, $res6q, $res7q)
    = map { "q$_" } 8 .. 15;

# ctr_t0..ctr_t7 are further names for the same v8-v15 as res0..res7.
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3, $ctr_t4, $ctr_t5, $ctr_t6, $ctr_t7)
    = map { "v$_" } 8 .. 15;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b, $ctr_t4b, $ctr_t5b, $ctr_t6b, $ctr_t7b)
    = map { "v$_.16b" } 8 .. 15;
my ($ctr_t0q, $ctr_t1q, $ctr_t2q, $ctr_t3q, $ctr_t4q, $ctr_t5q, $ctr_t6q, $ctr_t7q)
    = map { "q$_" } 8 .. 15;

# GHASH accumulators: high, middle and low parts in v17-v19.
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 17 .. 19;
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 17 .. 19;

# Hash-key powers. The h1..h4 group and the h5..h8 group are two sets of
# names for the same six registers v20-v25.
my ($h1, $h12k, $h2, $h3, $h34k, $h4) = map { "v$_" } 20 .. 25;
my ($h5, $h56k, $h6, $h7, $h78k, $h8) = map { "v$_" } 20 .. 25;
my ($h1q, $h12kq, $h2q, $h3q, $h34kq, $h4q) = map { "q$_" } 20 .. 25;
my ($h5q, $h56kq, $h6q, $h7q, $h78kq, $h8q) = map { "q$_" } 20 .. 25;

# Temporaries. Only v16 and v29 are dedicated; the rest alias result
# registers, and $t11 is v21, which is also $h12k/$h56k.
my ($t0, $t0d) = ("v16", "d16");
my $t1 = "v29";
my ($t2, $t3) = ($res1, $t1);
my ($t4, $t5, $t6) = ($res0, $res2, $t0);
my ($t7, $t8, $t9) = ($res3, $res4, $res5);
my ($t10, $t11, $t12) = ($res6, "v21", $t1);

# Byte-reversed running counter and its per-block increment.
my ($rtmp_ctr, $rtmp_ctrq) = ("v30", "q30");
my ($rctr_inc, $rctr_incd) = ("v31", "d31");

# The modulo-reduction constant shares v16 with $t0.
my ($mod_constant, $mod_constantd) = ($t0, $t0d);

# All round keys rk0..rk14 are named onto the same three registers v26-v28,
# in consecutive groups of three.
my ($rk0, $rk1, $rk2) = map { "v$_.16b" } 26 .. 28;
my ($rk3, $rk4, $rk5) = map { "v$_.16b" } 26 .. 28;
my ($rk6, $rk7, $rk8) = map { "v$_.16b" } 26 .. 28;
my ($rk9, $rk10, $rk11) = map { "v$_.16b" } 26 .. 28;
my ($rk12, $rk13, $rk14) = map { "v$_.16b" } 26 .. 28;
my ($rk0q, $rk1q, $rk2q) = map { "q$_" } 26 .. 28;
my ($rk3q, $rk4q, $rk5q) = map { "q$_" } 26 .. 28;
my ($rk6q, $rk7q, $rk8q) = map { "q$_" } 26 .. 28;
my ($rk9q, $rk10q, $rk11q) = map { "q$_" } 26 .. 28;
my ($rk12q, $rk13q, $rk14q) = map { "q$_" } 26 .. 28;

# Extra operand spellings of individual round-key registers.
my ($rk2q1, $rk3q1, $rk4v) = ("v28.1q", "v26.1q", "v27");
248
249
250 #########################################################################################
251 # size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
252 # size_t bit_len,
253 # unsigned char *out,
254 # u64 *Xi,
255 # unsigned char ivec[16],
256 # const void *key);
257 #
258 $code.=<<___;
259 .global unroll8_eor3_aes_gcm_enc_128_kernel
260 .type unroll8_eor3_aes_gcm_enc_128_kernel,%function
261 .align 4
262 unroll8_eor3_aes_gcm_enc_128_kernel:
263 AARCH64_VALID_CALL_TARGET
264 cbz x1, .L128_enc_ret
265 stp d8, d9, [sp, #-80]!
266 mov $counter, x4
267 mov $cc, x5
268 stp d10, d11, [sp, #16]
269 stp d12, d13, [sp, #32]
270 stp d14, d15, [sp, #48]
271 mov x5, #0xc200000000000000
272 stp x5, xzr, [sp, #64]
273 add $modulo_constant, sp, #64
274
275 mov $constant_temp, #0x100000000 @ set up counter increment
276 movi $rctr_inc.16b, #0x0
277 mov $rctr_inc.d[1], $constant_temp
278 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
279 ld1 { $ctr0b}, [$counter] @ CTR block 0
280
281 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
282
283 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
284
285 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
286
287 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
288
289 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
290 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
291
292 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
293 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
294
295 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
296 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
297
298 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
299 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
300
301 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
302 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
303 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
304
305 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
306 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
307
308 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
309 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
310
311 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
312 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
313 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
314
315 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
316 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
317 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
318
319 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
320 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
321 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
322
323 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
324
325 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
326 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
327 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
328
329 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
330 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
331 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
332
333 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
334 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
335 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
336
337 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
338 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
339 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
340
341 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
342 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
343 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
344
345 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
346
347 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
348 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
349 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
350
351 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
352 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
353 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
354
355 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
356
357 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
358 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
359 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
360
361 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
362 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
363 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
364
365 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
366 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
367 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
368
369 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
370 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
371 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
372
373 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
374 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
375 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
376
377 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
378 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
379 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
380
381 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
382 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
383 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
384
385 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
386 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
387 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
388
389 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
390 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
391 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
392
393 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
394
395 ld1 { $acc_lb}, [$current_tag]
396 ext $acc_lb, $acc_lb, $acc_lb, #8
397 rev64 $acc_lb, $acc_lb
398
399 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
400
401 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
402 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
403 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
404
405 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
406 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
407 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
408
409 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
410 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
411 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
412
413 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
414 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
415 ldr $rk10q, [$cc, #160] @ load rk10
416
417 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
418 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
419 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
420
421 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
422 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
423 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
424
425 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
426 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
427 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
428
429 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
430 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
431 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
432
433 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
434 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
435 b.ge .L128_enc_tail @ handle tail
436
437 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
438
439 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
440
441 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
442
443 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
444 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
445
446 eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result
447 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
448 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
449
450 eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result
451 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
452
453 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
454 eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
455 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
456
457 eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result
458 eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
459 eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
460
461 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
462 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
463
464 eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result
465 eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result
466 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
467
468 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
469 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
470 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
471
472 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
473
474 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
475 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
476 b.ge .L128_enc_prepretail @ do prepretail
477
478 .L128_enc_main_loop: @ main loop start
479 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
480 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
481 ext $h5.16b, $h5.16b, $h5.16b, #8
482 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
483 ext $h6.16b, $h6.16b, $h6.16b, #8
484 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
485
486 rev64 $res1b, $res1b @ GHASH block 8k+1
487 rev64 $res0b, $res0b @ GHASH block 8k
488 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
489 ext $h7.16b, $h7.16b, $h7.16b, #8
490 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
491 ext $h8.16b, $h8.16b, $h8.16b, #8
492
493 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
494 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
495 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
496
497 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
498 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
499 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
500 rev64 $res3b, $res3b @ GHASH block 8k+3
501
502 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
503 eor $res0b, $res0b, $acc_lb @ PRE 1
504 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
505
506 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
507
508 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
509 rev64 $res2b, $res2b @ GHASH block 8k+2
510 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
511
512 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
513 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
514 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
515
516 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
517 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
518 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
519
520 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
521 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
522 ext $h3.16b, $h3.16b, $h3.16b, #8
523 ldr $h4q, [$current_tag, #112] @ load h3l | h3h
524 ext $h4.16b, $h4.16b, $h4.16b, #8
525 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
526
527 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
528 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
529 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
530
531 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
532 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
533 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
534
535 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
536 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
537 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
538
539 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
540 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
541 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
542
543 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
544 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
545 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
546
547 eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high
548 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
549 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
550
551 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
552 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
553 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
554
555 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
556 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
557 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
558
559 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
560 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
561 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
562
563 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
564 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
565
566 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
567 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
568 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
569
570 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
571 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
572 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
573
574 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
575 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
576 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
577
578 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
579 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
580 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
581
582 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
583 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
584 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
585 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
586
587 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
588 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
589 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
590
591 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
592 ext $h1.16b, $h1.16b, $h1.16b, #8
593 ldr $h2q, [$current_tag, #64] @ load h1l | h1h
594 ext $h2.16b, $h2.16b, $h2.16b, #8
595 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
596 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
597
598 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
599 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
600
601 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
602 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
603
604 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
605 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
606
607 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
608 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
609
610 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
611 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
612 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
613
614 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
615 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
616 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
617
618 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
619 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
620 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
621
622 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
623 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
624 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
625
626 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
627 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
628 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
629
630 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
631 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
632 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
633
634 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
635 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
636 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
637
638 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
639 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
640
641 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
642 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
643 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
644
645 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
646 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
647 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
648
649 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
650 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
651 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
652
653 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
654 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
655
656 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
657 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
658 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
659
660 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
661 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
662 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
663
664 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
665 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
666 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
667
668 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
669 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
670 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
671
672 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
673 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
674 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
675
676 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
677 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
678 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
679
680 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
681 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
682 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
683
684 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
685 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
686 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
687
688 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
689 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
690
691 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
692 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
693 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
694
695 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
696 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
697 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
698
699 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
700 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
701 ldr $rk10q, [$cc, #160] @ load rk10
702
703 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
704 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
705 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
706 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
707
708 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
709 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
710 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
711
712 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
713 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
714 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
715
716 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
717 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
718 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
719
720 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
721 eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 8k+12 - result
722 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
723
724 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
725 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
726
727 eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result
728
729 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
730 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
731
732 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
733 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
734
735 eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 8k+15 - result
736 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
737 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
738
739 eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result
740 eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result
741 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
742
743 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
744 eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 8k+13 - result
745 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
746
747 eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result
748 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
749 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
750
751 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
752 eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 8k+14 - result
753
754 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
755 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
756
757 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
758 b.lt .L128_enc_main_loop
759
760 .L128_enc_prepretail: @ PREPRETAIL
761 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
762 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
763 ext $h7.16b, $h7.16b, $h7.16b, #8
764 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
765 ext $h8.16b, $h8.16b, $h8.16b, #8
766 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
767
768 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
769 ext $h5.16b, $h5.16b, $h5.16b, #8
770 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
771 ext $h6.16b, $h6.16b, $h6.16b, #8
772 rev64 $res0b, $res0b @ GHASH block 8k
773 rev64 $res1b, $res1b @ GHASH block 8k+1
774
775 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
776 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
777 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
778 rev64 $res3b, $res3b @ GHASH block 8k+3
779
780 rev64 $res2b, $res2b @ GHASH block 8k+2
781 eor $res0b, $res0b, $acc_lb @ PRE 1
782
783 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
784
785 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
786 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
787 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
788
789 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
790 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
791
792 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
793 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
794 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
795
796 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
797 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
798
799 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
800 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
801
802 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
803 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
804
805 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
806 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
807
808 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
809
810 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
811
812 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
813
814 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
815
816 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
817 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
818
819 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
820 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
821
822 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
823 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
824
825 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
826 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
827 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
828
829 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
830 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
831
832 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
833 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
834 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
835
836 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
837 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
838
839 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
840 ext $h3.16b, $h3.16b, $h3.16b, #8
841 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
842 ext $h4.16b, $h4.16b, $h4.16b, #8
843
844 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
845 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
846 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
847
848 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
849 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
850
851 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
852 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
853
854 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
855 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
856 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
857 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
858
859 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
860 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
861
862 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
863 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
864 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
865
866 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
867 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
868 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
869
870 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
871 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
872
873 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
874 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
875 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
876
877 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
878 ext $h1.16b, $h1.16b, $h1.16b, #8
879 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
880 ext $h2.16b, $h2.16b, $h2.16b, #8
881 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
882 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
883
884 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
885 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
886 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
887
888 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
889 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
890 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
891
892 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
893 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
894
895 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
896 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
897 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
898
899 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
900 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
901 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
902
903 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
904 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
905 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
906
907 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
908 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
909 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
910
911 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
912 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
913 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
914
915 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
916 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
917 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
918
919 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
920 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
921
922 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
923 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
924 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
925
926 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
927 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
928 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
929
930 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
931 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
932 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
933
934 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
935 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
936 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
937
938 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
939 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
940
941 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
942 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
943 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
944
945 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
946 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
947
948 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
949 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
950 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
951
952 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
953 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
954 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
955
956 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
957 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
958 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
959
960 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
961 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
962 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
963
964 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
965 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
966 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
967 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
968
969 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
970 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
971 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
972
973 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
974 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
975
976 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
977 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
978
979 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
980 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
981 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
982 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
983
984 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
985 eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low
986 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
987
988 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
989 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
990 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
991
992 ldr $rk10q, [$cc, #160] @ load rk10
993 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
994 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
995
996 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
997 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
998
999 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
1000 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
1001
1002 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
1003 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
1004 .L128_enc_tail: @ TAIL
1005
1006 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1007 ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
1008
1009 mov $t1.16b, $rk10
1010 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
1011 ext $h5.16b, $h5.16b, $h5.16b, #8
1012
1013 eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
1014 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
1015 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
1016 ext $h6.16b, $h6.16b, $h6.16b, #8
1017 ext $h7.16b, $h7.16b, $h7.16b, #8
1018
1019 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
1020 ext $h8.16b, $h8.16b, $h8.16b, #8
1021 cmp $main_end_input_ptr, #112
1022 b.gt .L128_enc_blocks_more_than_7
1023
1024 mov $ctr7b, $ctr6b
1025 mov $ctr6b, $ctr5b
1026 movi $acc_h.8b, #0
1027
1028 cmp $main_end_input_ptr, #96
1029 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1030 mov $ctr5b, $ctr4b
1031
1032 mov $ctr4b, $ctr3b
1033 mov $ctr3b, $ctr2b
1034 mov $ctr2b, $ctr1b
1035
1036 movi $acc_l.8b, #0
1037 movi $acc_m.8b, #0
1038 b.gt .L128_enc_blocks_more_than_6
1039
1040 mov $ctr7b, $ctr6b
1041 cmp $main_end_input_ptr, #80
1042
1043 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1044 mov $ctr6b, $ctr5b
1045 mov $ctr5b, $ctr4b
1046
1047 mov $ctr4b, $ctr3b
1048 mov $ctr3b, $ctr1b
1049 b.gt .L128_enc_blocks_more_than_5
1050
1051 cmp $main_end_input_ptr, #64
1052 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1053
1054 mov $ctr7b, $ctr6b
1055 mov $ctr6b, $ctr5b
1056
1057 mov $ctr5b, $ctr4b
1058 mov $ctr4b, $ctr1b
1059 b.gt .L128_enc_blocks_more_than_4
1060
1061 mov $ctr7b, $ctr6b
1062 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1063 mov $ctr6b, $ctr5b
1064
1065 mov $ctr5b, $ctr1b
1066 cmp $main_end_input_ptr, #48
1067 b.gt .L128_enc_blocks_more_than_3
1068
1069 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1070 mov $ctr7b, $ctr6b
1071 mov $ctr6b, $ctr1b
1072
1073 cmp $main_end_input_ptr, #32
1074 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1075 b.gt .L128_enc_blocks_more_than_2
1076
1077 cmp $main_end_input_ptr, #16
1078
1079 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1080 mov $ctr7b, $ctr1b
1081 b.gt .L128_enc_blocks_more_than_1
1082
1083 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1084 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1085 b .L128_enc_blocks_less_than_1
1086 .L128_enc_blocks_more_than_7: @ blocks left > 7
1087 st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
1088
1089 rev64 $res0b, $res1b @ GHASH final-7 block
1090 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
1091
1092 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1093
1094 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
1095
1096 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
1097
1098 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
1099
1100 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
1101 movi $t0.8b, #0 @ suppress further partial tag feed in
1102
1103 eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
1104
1105 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
1106 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
1107 .L128_enc_blocks_more_than_6: @ blocks left > 6
1108
1109 st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
1110
1111 rev64 $res0b, $res1b @ GHASH final-6 block
1112 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
1113
1114 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1115
1116 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
1117
1118 eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
1119 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
1120
1121 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
1122 movi $t0.8b, #0 @ suppress further partial tag feed in
1123
1124 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
1125 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
1126
1127 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
1128
1129 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
1130 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
1131 .L128_enc_blocks_more_than_5: @ blocks left > 5
1132
1133 st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
1134
1135 rev64 $res0b, $res1b @ GHASH final-5 block
1136
1137 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1138
1139 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
1140 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
1141 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
1142
1143 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
1144
1145 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
1146
1147 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
1148
1149 eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
1150 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
1151 movi $t0.8b, #0 @ suppress further partial tag feed in
1152
1153 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
1154 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
1155
1156 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
1157 .L128_enc_blocks_more_than_4: @ blocks left > 4
1158
1159 st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
1160
1161 rev64 $res0b, $res1b @ GHASH final-4 block
1162
1163 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
1164
1165 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1166
1167 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
1168 movi $t0.8b, #0 @ suppress further partial tag feed in
1169 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
1170
1171 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
1172
1173 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
1174
1175 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
1176 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
1177
1178 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
1179
1180 eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
1181 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
1182 .L128_enc_blocks_more_than_3: @ blocks left > 3
1183
1184 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
1185
1186 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1187 ext $h4.16b, $h4.16b, $h4.16b, #8
1188
1189 rev64 $res0b, $res1b @ GHASH final-3 block
1190
1191 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1192 movi $t0.8b, #0 @ suppress further partial tag feed in
1193
1194 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
1195 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1196 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
1197
1198 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
1199
1200 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1201
1202 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
1203 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
1204
1205 eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
1206
1207 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
1208 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
1209
1210 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
1211 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
1212 .L128_enc_blocks_more_than_2: @ blocks left > 2
1213
1214 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
1215
1216 rev64 $res0b, $res1b @ GHASH final-2 block
1217
1218 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1219
1220 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
1221
1222 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
1223 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1224 ext $h3.16b, $h3.16b, $h3.16b, #8
1225 movi $t0.8b, #0 @ suppress further partial tag feed in
1226
1227 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1228 eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
1229
1230 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1231
1232 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1233 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1234
1235 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1236
1237 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1238 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1239 .L128_enc_blocks_more_than_1: @ blocks left > 1
1240
1241 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
1242
1243 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1244 ext $h2.16b, $h2.16b, $h2.16b, #8
1245 rev64 $res0b, $res1b @ GHASH final-1 block
1246 ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
1247
1248 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1249
1250 movi $t0.8b, #0 @ suppress further partial tag feed in
1251 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
1252 eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
1253
1254 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1255
1256 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1257
1258 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1259
1260 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1261
1262 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1263 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1264
1265 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1266
1267 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1268 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1269 .L128_enc_blocks_less_than_1: @ blocks left <= 1
1270
1271 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
1272 str $rtmp_ctrq, [$counter] @ store the updated counter
1273 and $bit_length, $bit_length, #127 @ bit_length %= 128
1274
1275 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1276
1277 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1278
1279 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
1280 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1281 and $bit_length, $bit_length, #127 @ bit_length %= 128
1282
1283 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
1284 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
1285 cmp $bit_length, #64
1286
1287 csel $temp2_x, $temp1_x, $temp0_x, lt
1288 csel $temp3_x, $temp0_x, xzr, lt
1289
1290 mov $ctr0.d[1], $temp3_x
1291 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
1292
1293 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1294
1295 rev64 $res0b, $res1b @ GHASH final block
1296
1297 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1298 st1 { $res1b}, [$output_ptr] @ store all 16B
1299
1300 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1301
1302 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
1303
1304 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1305 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1306 ext $h1.16b, $h1.16b, $h1.16b, #8
1307
1308 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1309
1310 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1311 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1312 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
1313
1314 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1315
1316 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1317
1318 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1319
1320 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1321 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1322
1323 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
1324
1325 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
1326
1327 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1328 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1329
1330 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
1331 ext $acc_lb, $acc_lb, $acc_lb, #8
1332 rev64 $acc_lb, $acc_lb
1333 st1 { $acc_l.16b }, [$current_tag]
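1334 lsr x0, $bit_length, #3 @ return value - NOTE(review): bit_length was overwritten above with the padding bit count (masked to 7 bits), so x0 is not the processed input size; confirm callers ignore it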
1335
1336 ldp d10, d11, [sp, #16]
1337 ldp d12, d13, [sp, #32]
1338 ldp d14, d15, [sp, #48]
1339 ldp d8, d9, [sp], #80
1340 ret
1341
1342 .L128_enc_ret:
1343 mov w0, #0x0
1344 ret
1345 .size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1346 ___
1347
1348 #########################################################################################
1349 # size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
1350 # size_t len,
1351 # unsigned char *out,
1352 # u64 *Xi,
1353 # unsigned char ivec[16],
1354 # const void *key);
1355 #
1356 $code.=<<___;
1357 .global unroll8_eor3_aes_gcm_dec_128_kernel
1358 .type unroll8_eor3_aes_gcm_dec_128_kernel,%function
1359 .align 4
1360 unroll8_eor3_aes_gcm_dec_128_kernel:
1361 AARCH64_VALID_CALL_TARGET
1362 cbz x1, .L128_dec_ret
1363 stp d8, d9, [sp, #-80]!
1364 mov $counter, x4
1365 mov $cc, x5
1366 stp d10, d11, [sp, #16]
1367 stp d12, d13, [sp, #32]
1368 stp d14, d15, [sp, #48]
1369 mov x5, #0xc200000000000000
1370 stp x5, xzr, [sp, #64]
1371 add $modulo_constant, sp, #64
1372
1373 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
1374 ld1 { $ctr0b}, [$counter] @ CTR block 0
1375
1376 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
1377 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1378
1379 mov $constant_temp, #0x100000000 @ set up counter increment
1380 movi $rctr_inc.16b, #0x0
1381 mov $rctr_inc.d[1], $constant_temp
1382 ld1 { $acc_lb}, [$current_tag]
1383 ext $acc_lb, $acc_lb, $acc_lb, #8
1384 rev64 $acc_lb, $acc_lb
1385
1386 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
1387
1388 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1389
1390 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
1391
1392 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
1393 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
1394
1395 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1396
1397 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
1398 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
1399 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1400
1401 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
1402 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
1403
1404 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1405 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1406
1407 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
1408 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
1409
1410 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
1411 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
1412
1413 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1414
1415 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
1416 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
1417 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
1418
1419 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1420 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
1421
1422 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
1423
1424 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
1425 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1426
1427 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
1428
1429 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
1430
1431 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
1432 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
1433
1434 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
1435 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
1436
1437 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
1438 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1439 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
1440
1441 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
1442 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1443 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
1444
1445 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
1446 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1447 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
1448
1449 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
1450 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1451
1452 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
1453 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
1454
1455 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1456 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
1457
1458 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1459 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1460
1461 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1462 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
1463 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
1464
1465 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
1466 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1467 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1468
1469 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
1470 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
1471 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1472
1473 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
1474 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1475 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1476
1477 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
1478 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1479
1480 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
1481 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
1482
1483 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
1484
1485 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1486 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
1487 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1488
1489 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
1490 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
1491 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1492
1493 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1494 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
1495 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
1496
1497 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1498 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
1499 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1500
1501 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
1502 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
1503 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
1504
1505 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
1506 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1507 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1508
1509 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1510 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
1511
1512 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
1513 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
1514
1515 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1516 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
1517 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1518
1519 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
1520 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1521 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
1522
1523 aese $ctr0b, $rk9 @ AES block 0 - round 9
1524 aese $ctr1b, $rk9 @ AES block 1 - round 9
1525 aese $ctr6b, $rk9 @ AES block 6 - round 9
1526
1527 ldr $rk10q, [$cc, #160] @ load rk10
1528 aese $ctr4b, $rk9 @ AES block 4 - round 9
1529 aese $ctr3b, $rk9 @ AES block 3 - round 9
1530
1531 aese $ctr2b, $rk9 @ AES block 2 - round 9
1532 aese $ctr5b, $rk9 @ AES block 5 - round 9
1533 aese $ctr7b, $rk9 @ AES block 7 - round 9
1534
1535 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1536 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1537 b.ge .L128_dec_tail @ handle tail
1538
1539 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
1540
1541 eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result
1542 eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result
1543 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
1544
1545 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
1546 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
1547 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
1548
1549 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
1550
1551 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
1552 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
1553 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
1554
1555 eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result
1556 eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result
1557 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
1558
1559 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
1560 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
1561
1562 eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result
1563
1564 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
1565 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
1566
1567 eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result
1568 eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result
1569 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
1570
1571 eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result
1572 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
1573 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
1574
1575 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1576 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
1577 b.ge .L128_dec_prepretail @ do prepretail
1578
1579 .L128_dec_main_loop: @ main loop start
1580 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
1581 ext $h7.16b, $h7.16b, $h7.16b, #8
1582 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
1583 ext $h8.16b, $h8.16b, $h8.16b, #8
1584
1585 rev64 $res1b, $res1b @ GHASH block 8k+1
1586 rev64 $res0b, $res0b @ GHASH block 8k
1587 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1588
1589 rev64 $res6b, $res6b @ GHASH block 8k+6
1590 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
1591 ext $h5.16b, $h5.16b, $h5.16b, #8
1592 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
1593 ext $h6.16b, $h6.16b, $h6.16b, #8
1594
1595 eor $res0b, $res0b, $acc_lb @ PRE 1
1596 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
1597 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
1598
1599 rev64 $res2b, $res2b @ GHASH block 8k+2
1600 rev64 $res4b, $res4b @ GHASH block 8k+4
1601 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
1602
1603 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
1604 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
1605 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
1606 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
1607
1608 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
1609 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
1610 rev64 $res3b, $res3b @ GHASH block 8k+3
1611
1612 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
1613 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1614 rev64 $res5b, $res5b @ GHASH block 8k+5
1615
1616 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
1617 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
1618 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1619
1620 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
1621 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
1622 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
1623
1624 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
1625 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
1626 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
1627
1628 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
1629 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
1630 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
1631
1632 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
1633 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
1634 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
1635
1636 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
1637 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
1638 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
1639
1640 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
1641 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1642 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
1643
1644 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
1645 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1646 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
1647
1648 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1649 ext $h3.16b, $h3.16b, $h3.16b, #8
1650 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1651 ext $h4.16b, $h4.16b, $h4.16b, #8
1652 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
1653 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
1654
1655 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
1656 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
1657 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
1658
1659 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
1660 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
1661 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
1662
1663 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
1664 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
1665 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
1666
1667 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
1668 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1669 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1670 ext $h1.16b, $h1.16b, $h1.16b, #8
1671 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1672 ext $h2.16b, $h2.16b, $h2.16b, #8
1673
1674 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
1675 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
1676 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
1677
1678 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1679 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
1680 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
1681
1682 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
1683 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
1684 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
1685
1686 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
1687 rev64 $res7b, $res7b @ GHASH block 8k+7
1688 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
1689
1690 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
1691 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
1692 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1693
1694 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1695 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1696 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
1697 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1698
1699 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
1700 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
1701 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
1702
1703 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
1704 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
1705 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
1706
1707 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
1708 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
1709 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
1710
1711 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
1712 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
1713 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
1714
1715 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
1716 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1717 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
1718
1719 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
1720 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
1721 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
1722
1723 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
1724 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
1725 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1726
1727 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
1728 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
1729 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
1730
1731 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
1732 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
1733 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
1734
1735 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
1736 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
1737 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
1738
1739 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
1740 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
1741 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
1742
1743 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
1744 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
1745 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
1746
1747 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
1748 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
1749 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
1750
1751 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
1752 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
1753 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
1754
1755 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
1756 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
1757 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
1758
1759 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
1760 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
1761 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
1762
1763 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
1764 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
1765 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
1766
1767 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
1768 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
1769 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
1770
1771 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
1772 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
1773 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
1774
1775 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
1776 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
1777 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
1778
1779 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
1780 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
1781 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
1782
1783 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
1784 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1785 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1786
1787 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
1788 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
1789 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
1790
1791 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
1792 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
1793 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
1794
1795 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
1796 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
1797 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
1798
1799 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
1800 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
1801 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
1802
1803 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
1804 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
1805 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
1806
1807 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
1808 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
1809 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
1810
1811 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
1812 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
1813 ldr $rk10q, [$cc, #160] @ load rk10
1814
1815 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
1816 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1817 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
1818
1819 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
1820 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
1821 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1822
1823 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
1824 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
1825
1826 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
1827 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
1828 eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result
1829
1830 eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result
1831 eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result
1832 eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result
1833
1834 eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result
1835 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
1836 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
1837
1838 eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result
1839 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
1840 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
1841
1842 eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result
1843 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
1844 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
1845
1846 eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result
1847 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
1848
1849 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
1850 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
1851 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
1852
1853 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
1854 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
1855 b.lt .L128_dec_main_loop
1856
1857 .L128_dec_prepretail: @ PREPRETAIL
1858 rev64 $res3b, $res3b @ GHASH block 8k+3
1859 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1860 rev64 $res0b, $res0b @ GHASH block 8k
1861
1862 rev64 $res2b, $res2b @ GHASH block 8k+2
1863 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
1864 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
1865
1866 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
1867 ext $h7.16b, $h7.16b, $h7.16b, #8
1868 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
1869 ext $h8.16b, $h8.16b, $h8.16b, #8
1870 eor $res0b, $res0b, $acc_lb @ PRE 1
1871 rev64 $res1b, $res1b @ GHASH block 8k+1
1872
1873 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
1874 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
1875 ext $h5.16b, $h5.16b, $h5.16b, #8
1876 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
1877 ext $h6.16b, $h6.16b, $h6.16b, #8
1878 rev64 $res5b, $res5b @ GHASH block 8k+5
1879
1880 rev64 $res4b, $res4b @ GHASH block 8k+4
1881
1882 rev64 $res6b, $res6b @ GHASH block 8k+6
1883
1884 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
1885 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
1886 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
1887 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
1888
1889 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
1890 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
1891 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
1892
1893 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1894 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
1895 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
1896
1897 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
1898 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
1899 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
1900
1901 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
1902 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
1903 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
1904
1905 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
1906 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
1907 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
1908
1909 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
1910 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1911 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
1912
1913 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
1914 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
1915 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
1916
1917 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
1918 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
1919 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
1920
1921 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
1922 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
1923 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
1924
1925 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
1926 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1927 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
1928
1929 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
1930 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
1931 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
1932
1933 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
1934 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
1935 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
1936
1937 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1938 ext $h3.16b, $h3.16b, $h3.16b, #8
1939 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1940 ext $h4.16b, $h4.16b, $h4.16b, #8
1941 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
1942 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
1943
1944 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
1945 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
1946 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
1947
1948 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1949 ext $h1.16b, $h1.16b, $h1.16b, #8
1950 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1951 ext $h2.16b, $h2.16b, $h2.16b, #8
1952 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
1953
1954 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
1955 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
1956 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
1957
1958 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
1959 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1960 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
1961
1962 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
1963 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
1964 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
1965
1966 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
1967 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
1968 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
1969
1970 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
1971 rev64 $res7b, $res7b @ GHASH block 8k+7
1972 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
1973
1974 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
1975 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
1976 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
1977 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
1978
1979 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
1980 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
1981 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1982
1983 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
1984 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
1985 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
1986
1987 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
1988 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
1989 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
1990
1991 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
1992 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
1993 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
1994
1995 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
1996 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
1997 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
1998
1999 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2000 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
2001 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
2002
2003 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
2004 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
2005 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
2006
2007 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
2008 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
2009 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
2010
2011 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
2012 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
2013 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
2014
2015 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
2016 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
2017 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
2018
2019 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
2020 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
2021 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
2022
2023 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
2024 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
2025 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
2026
2027 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
2028 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
2029 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
2030
2031 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
2032 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2033 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
2034
2035 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
2036 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
2037 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
2038
2039 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
2040 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
2041 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
2042
2043 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
2044 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
2045 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
2046
2047 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
2048 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
2049 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
2050
2051 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2052 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
2053 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2054
2055 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
2056 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
2057 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
2058
2059 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
2060 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
2061 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
2062
2063 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
2064 ldr $rk10q, [$cc, #160] @ load rk10
2065
2066 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
2067 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
2068
2069 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2070 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
2071 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2072
2073 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
2074 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
2075 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
2076
2077 aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
2078 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
2079 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
2080
2081 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
2082 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
2083 aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
2084
2085 aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
2086 aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
2087 aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
2088
2089 aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
2090 aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
2091 aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
2092
2093 .L128_dec_tail: @ TAIL - dispatch on the number of blocks left (up to 8, the last one possibly partial)
2094
2095 mov $t1.16b, $rk10 @ keep rk10 in t1; it is folded into every eor3 that produces a tail result
2096 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2097
2098 cmp $main_end_input_ptr, #112 @ more than 7 blocks left?
2099
2100 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
2101 ext $h8.16b, $h8.16b, $h8.16b, #8
2102 ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
2103
2104 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
2105 ext $h5.16b, $h5.16b, $h5.16b, #8
2106 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2107
2108 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
2109 ext $h6.16b, $h6.16b, $h6.16b, #8
2110 ext $h7.16b, $h7.16b, $h7.16b, #8
2111
2112 eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
2113 b.gt .L128_dec_blocks_more_than_7 @ all 8 keystream blocks are needed, no shifting required
2114
2115 cmp $main_end_input_ptr, #96 @ more than 6 blocks left?
2116 mov $ctr7b, $ctr6b @ fewer than 8 blocks: shift keystream up one register so the final block always uses ctr7
2117 movi $acc_l.8b, #0 @ GHASH accumulators start from zero when the final-7 block is skipped
2118
2119 movi $acc_h.8b, #0
2120 mov $ctr6b, $ctr5b
2121 mov $ctr5b, $ctr4b
2122
2123 mov $ctr4b, $ctr3b
2124 mov $ctr3b, $ctr2b
2125 mov $ctr2b, $ctr1b
2126
2127 movi $acc_m.8b, #0
2128 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2129 b.gt .L128_dec_blocks_more_than_6
2130
2131 cmp $main_end_input_ptr, #80 @ more than 5 blocks left?
2132 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2133
2134 mov $ctr7b, $ctr6b @ shift keystream up by one more register
2135 mov $ctr6b, $ctr5b
2136 mov $ctr5b, $ctr4b
2137
2138 mov $ctr4b, $ctr3b
2139 mov $ctr3b, $ctr1b @ ctr2 already equals ctr1 after the previous shift
2140 b.gt .L128_dec_blocks_more_than_5
2141
2142 cmp $main_end_input_ptr, #64 @ more than 4 blocks left?
2143
2144 mov $ctr7b, $ctr6b @ shift keystream up by one more register
2145 mov $ctr6b, $ctr5b
2146 mov $ctr5b, $ctr4b
2147
2148 mov $ctr4b, $ctr1b
2149 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2150 b.gt .L128_dec_blocks_more_than_4
2151
2152 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2153 mov $ctr7b, $ctr6b @ shift keystream up by one more register
2154 mov $ctr6b, $ctr5b
2155
2156 mov $ctr5b, $ctr1b
2157 cmp $main_end_input_ptr, #48 @ more than 3 blocks left?
2158 b.gt .L128_dec_blocks_more_than_3
2159
2160 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2161 mov $ctr7b, $ctr6b @ shift keystream up by one more register
2162 cmp $main_end_input_ptr, #32 @ more than 2 blocks left?
2163
2164 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k (the final-3 section that normally loads it is skipped)
2165 mov $ctr6b, $ctr1b
2166 b.gt .L128_dec_blocks_more_than_2
2167
2168 cmp $main_end_input_ptr, #16 @ more than 1 block left?
2169
2170 mov $ctr7b, $ctr1b @ only ctr7 is still needed: keystream for the final block
2171 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2172 b.gt .L128_dec_blocks_more_than_1
2173
2174 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the saved counter back by one block
2175 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
2176 b .L128_dec_blocks_less_than_1
2177 .L128_dec_blocks_more_than_7: @ blocks left > 7
2178 rev64 $res0b, $res1b @ GHASH final-7 block
2179
2180 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2181
2182 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
2183
2184 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
2185 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
2186
2187 movi $t0.8b, #0 @ suppress further partial tag feed in
2188 ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
2189
2190 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
2191
2192 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
2193 st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
2194 eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
2195
2196 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
2197 .L128_dec_blocks_more_than_6: @ blocks left > 6
2198
2199 rev64 $res0b, $res1b @ GHASH final-6 block
2200
2201 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2202
2203 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
2204
2205 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
2206
2207 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
2208 ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
2209 movi $t0.8b, #0 @ suppress further partial tag feed in
2210
2211 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
2212 st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
2213 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
2214
2215 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
2216 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
2217
2218 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
2219 eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
2220 .L128_dec_blocks_more_than_5: @ blocks left > 5
2221
2222 rev64 $res0b, $res1b @ GHASH final-5 block
2223
2224 ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
2225 st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
2226
2227 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2228
2229 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
2230
2231 eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
2232
2233 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
2234
2235 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
2236 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
2237 movi $t0.8b, #0 @ suppress further partial tag feed in
2238
2239 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
2240 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
2241 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
2242
2243 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
2244 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
2245 .L128_dec_blocks_more_than_4: @ blocks left > 4
2246
2247 rev64 $res0b, $res1b @ GHASH final-4 block
2248
2249 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2250 ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
2251
2252 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
2253 movi $t0.8b, #0 @ suppress further partial tag feed in
2254 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
2255
2256 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
2257
2258 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
2259
2260 st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
2261 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
2262
2263 eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
2264 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
2265
2266 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
2267
2268 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
2269 .L128_dec_blocks_more_than_3: @ blocks left > 3
2270
2271 st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
2272 rev64 $res0b, $res1b @ GHASH final-3 block
2273
2274 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2275
2276 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
2277
2278 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2279 ext $h4.16b, $h4.16b, $h4.16b, #8
2280 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
2281
2282 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2283
2284 ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
2285
2286 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
2287 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
2288 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
2289
2290 movi $t0.8b, #0 @ suppress further partial tag feed in
2291 eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
2292 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
2293
2294 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
2295
2296 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
2297 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
2298 .L128_dec_blocks_more_than_2: @ blocks left > 2
2299
2300 rev64 $res0b, $res1b @ GHASH final-2 block
2301
2302 st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
2303
2304 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2305 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2306 ext $h3.16b, $h3.16b, $h3.16b, #8
2307 movi $t0.8b, #0 @ suppress further partial tag feed in
2308
2309 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
2310
2311 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2312
2313 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2314
2315 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2316 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2317 ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
2318
2319 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2320
2321 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2322
2323 eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
2324 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2325 .L128_dec_blocks_more_than_1: @ blocks left > 1
2326
2327 st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
2328 rev64 $res0b, $res1b @ GHASH final-1 block
2329
2330 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2331 ext $h2.16b, $h2.16b, $h2.16b, #8
2332
2333 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2334
2335 movi $t0.8b, #0 @ suppress further partial tag feed in
2336
2337 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
2338
2339 ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
2340 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2341
2342 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2343 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2344 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
2345
2346 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2347 eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
2348
2349 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2350
2351 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2352
2353 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2354
2355 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2356 .L128_dec_blocks_less_than_1: @ blocks left <= 1
2357
2358 and $bit_length, $bit_length, #127 @ bit_length %= 128
2359
2360 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2361
2362 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2363
2364 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
2365 and $bit_length, $bit_length, #127 @ bit_length %= 128
2366
2367 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
2368 cmp $bit_length, #64
2369 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
2370
2371 csel $temp2_x, $temp1_x, $temp0_x, lt
2372 csel $temp3_x, $temp0_x, xzr, lt
2373
2374 mov $ctr0.d[1], $temp3_x
2375 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
2376
2377 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2378 ext $h1.16b, $h1.16b, $h1.16b, #8
2379 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2380
2381 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2382
2383 rev64 $res0b, $res1b @ GHASH final block
2384
2385 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2386
2387 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2388 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
2389
2390 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2391 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2392
2393 bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2394
2395 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2396 st1 { $res4b}, [$output_ptr] @ store all 16B
2397
2398 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2399
2400 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2401 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
2402
2403 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2404
2405 eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
2406
2407 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2408 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2409
2410 eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
2411
2412 eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
2413
2414 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2415 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2416
2417 eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
2418 ext $acc_lb, $acc_lb, $acc_lb, #8
2419 rev64 $acc_lb, $acc_lb
2420 st1 { $acc_l.16b }, [$current_tag]
2421 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
2422
2423 str $rtmp_ctrq, [$counter] @ store the updated counter
2424
2425 lsr x0, $bit_length, #3
2426
2427 ldp d10, d11, [sp, #16]
2428 ldp d12, d13, [sp, #32]
2429 ldp d14, d15, [sp, #48]
2430 ldp d8, d9, [sp], #80
2431 ret
2432 .L128_dec_ret:
2433 mov w0, #0x0
2434 ret
2435 .size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2436 ___
2437 }
2438
2439 {
2440 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); # general-purpose registers x4..x7
2441 my ($temp2_x,$temp3_x)=map("x$_",(13..14)); # general-purpose registers x13, x14
2442 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); # .16b views: counter blocks in v0..v7, result blocks in v8..v15
2443 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); # same registers, no arrangement suffix
2444 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); # 64-bit scalar names of v0..v7
2445 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); # 128-bit scalar names of v0..v7
2446 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); # 128-bit scalar names of v8..v15
2447
2448 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); # $ctr_t* name the same v8..v15 registers as $res*
2449 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); # .16b views of v8..v15
2450 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); # 128-bit scalar names of v8..v15
2451
2452 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); # GHASH accumulators (high, mid, low) in v17..v19, .16b views
2453 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); # same accumulators, no arrangement suffix
2454
2455 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); # hash-key names for v20..v25 ...
2456 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); # ... and a second set of names for the same v20..v25, so only one set is live at a time
2457 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); # 128-bit scalar names of v20..v25
2458 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); # second set of names for q20..q25
2459
2460 my $t0="v16"; # temporary in v16
2461 my $t0d="d16"; # 64-bit scalar name of v16
2462
2463 my $t1="v29"; # temporary in v29
2464 my $t2=$res1; # temporary sharing v9 with $res1
2465 my $t3=$t1; # second name for v29
2466
2467 my $t4=$res0; # temporary sharing v8 with $res0
2468 my $t5=$res2; # temporary sharing v10 with $res2
2469 my $t6=$t0; # second name for v16
2470
2471 my $t7=$res3; # temporary sharing v11 with $res3
2472 my $t8=$res4; # temporary sharing v12 with $res4
2473 my $t9=$res5; # temporary sharing v13 with $res5
2474
2475 my $t10=$res6; # temporary sharing v14 with $res6
2476 my $t11="v21"; # same register as $h12k / $h56k
2477 my $t12=$t1; # third name for v29
2478
2479 my $rtmp_ctr="v30"; # running counter, kept in rev32-reversed form
2480 my $rtmp_ctrq="q30"; # 128-bit scalar name of v30
2481 my $rctr_inc="v31"; # per-block counter increment
2482 my $rctr_incd="d31"; # 64-bit scalar name of v31
2483
2484 my $mod_constantd=$t0d; # modulo constant shares d16 with $t0d
2485 my $mod_constant=$t0; # modulo constant shares v16 with $t0
2486
2487 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); # all round-key names map onto v26..v28: key N lives in v(26 + N mod 3)
2488 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); # so a key must be consumed before the ldp/ldr that reuses its register
2489 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
2490 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
2491 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
2492 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); # 128-bit scalar names of v26..v28, same N mod 3 mapping
2493 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
2494 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
2495 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
2496 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
2497 my $rk2q1="v28.1q"; # v28 with .1q arrangement
2498 my $rk3q1="v26.1q"; # v26 with .1q arrangement
2499 my $rk4v="v27"; # v27 without arrangement suffix
2500
2501 #########################################################################################
2502 # size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
2503 # size_t len,
2504 # unsigned char *out,
2505 # const void *key,
2506 # unsigned char ivec[16],
2507 # u64 *Xi);
2508 #
2509 $code.=<<___;
2510 .global unroll8_eor3_aes_gcm_enc_192_kernel
2511 .type unroll8_eor3_aes_gcm_enc_192_kernel,%function
2512 .align 4
2513 unroll8_eor3_aes_gcm_enc_192_kernel:
2514 AARCH64_VALID_CALL_TARGET
2515 cbz x1, .L192_enc_ret
2516 stp d8, d9, [sp, #-80]!
2517 mov $counter, x4
2518 mov $cc, x5
2519 stp d10, d11, [sp, #16]
2520 stp d12, d13, [sp, #32]
2521 stp d14, d15, [sp, #48]
2522 mov x5, #0xc200000000000000
2523 stp x5, xzr, [sp, #64]
2524 add $modulo_constant, sp, #64
2525
2526 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2527 ld1 { $ctr0b}, [$counter] @ CTR block 0
2528
2529 mov $constant_temp, #0x100000000 @ set up counter increment
2530 movi $rctr_inc.16b, #0x0
2531 mov $rctr_inc.d[1], $constant_temp
2532
2533 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
2534
2535 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
2536
2537 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
2538 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
2539
2540 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
2541 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
2542
2543 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
2544 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
2545
2546 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
2547 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
2548 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2549
2550 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2551
2552 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
2553 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
2554 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
2555
2556 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2557
2558 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
2559 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
2560
2561 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
2562
2563 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
2564 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
2565 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2566
2567 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2568 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2569 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
2570
2571 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
2572 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2573 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
2574
2575 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
2576 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
2577
2578 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2579 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2580 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
2581
2582 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
2583 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
2584 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2585
2586 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2587 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
2588 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2589
2590 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2591 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2592
2593 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2594 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
2595 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
2596
2597 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
2598 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
2599
2600 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
2601 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2602 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2603
2604 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2605
2606 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2607
2608 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
2609
2610 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2611 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2612 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
2613
2614 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2615 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2616 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
2617
2618 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
2619 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
2620 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
2621
2622 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2623 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
2624 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2625
2626 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
2627 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
2628 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2629
2630 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
2631 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
2632 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2633
2634 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
2635
2636 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
2637 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
2638 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2639
2640 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2641 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
2642 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2643
2644 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2645 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
2646 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
2647
2648 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
2649 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2650
2651 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
2652 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2653
2654 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
2655 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2656
2657 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2658 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
2659
2660 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
2661 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2662
2663 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
2664 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2665 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
2666
2667 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2668 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2669 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
2670
2671 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2672 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2673 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2674
2675 ld1 { $acc_lb}, [$current_tag]
2676 ext $acc_lb, $acc_lb, $acc_lb, #8
2677 rev64 $acc_lb, $acc_lb
2678 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
2679
2680 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
2681 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2682
2683 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
2684 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2685
2686 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2687 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
2688
2689 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
2690 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
2691 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
2692
2693 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
2694 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
2695 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
2696
2697 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
2698 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
2699 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
2700
2701 aese $ctr6b, $rk11 @ AES block 6 - round 11
2702 aese $ctr3b, $rk11 @ AES block 3 - round 11
2703
2704 aese $ctr4b, $rk11 @ AES block 4 - round 11
2705 aese $ctr7b, $rk11 @ AES block 7 - round 11
2706 ldr $rk12q, [$cc, #192] @ load rk12
2707
2708 aese $ctr1b, $rk11 @ AES block 1 - round 11
2709 aese $ctr5b, $rk11 @ AES block 5 - round 11
2710
2711 aese $ctr2b, $rk11 @ AES block 2 - round 11
2712 aese $ctr0b, $rk11 @ AES block 0 - round 11
2713 b.ge .L192_enc_tail @ handle tail
2714
2715 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
2716
2717 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
2718
2719 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
2720
2721 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
2722
2723 eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result
2724 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
2725 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
2726
2727 eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 3 - result
2728 eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result
2729
2730 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
2731 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
2732 eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
2733
2734 eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
2735 eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
2736 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
2737
2738 eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result
2739 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
2740 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
2741
2742 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
2743 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2744
2745 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
2746 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
2747 eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
2748
2749 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
2750
2751 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
2752 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
2753 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
2754
2755 b.ge .L192_enc_prepretail @ do prepretail
2756
2757 .L192_enc_main_loop: @ main loop start
2758 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
2759 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
2760 rev64 $res2b, $res2b @ GHASH block 8k+2
2761
2762 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
2763 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
2764 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
2765 ext $h7.16b, $h7.16b, $h7.16b, #8
2766 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
2767 ext $h8.16b, $h8.16b, $h8.16b, #8
2768
2769 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2770 rev64 $res0b, $res0b @ GHASH block 8k
2771 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
2772 ext $h5.16b, $h5.16b, $h5.16b, #8
2773 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
2774 ext $h6.16b, $h6.16b, $h6.16b, #8
2775
2776 rev64 $res1b, $res1b @ GHASH block 8k+1
2777 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
2778 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
2779
2780 eor $res0b, $res0b, $acc_lb @ PRE 1
2781 rev64 $res3b, $res3b @ GHASH block 8k+3
2782 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
2783
2784 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
2785 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
2786 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
2787
2788 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
2789 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
2790 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
2791
2792 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
2793 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
2794 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
2795
2796 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
2797 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
2798 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
2799
2800 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
2801 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
2802 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
2803
2804 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
2805 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
2806 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
2807 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
2808
2809 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
2810 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
2811 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
2812
2813 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
2814 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
2815 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
2816
2817 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
2818 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
2819 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
2820
2821 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
2822 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
2823 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
2824
2825 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
2826 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
2827 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
2828
2829 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
2830 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
2831 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
2832
2833 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
2834 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
2835 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
2836
2837 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
2838 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
2839 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
2840
2841 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
2842 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
2843 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
2844
2845 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
2846 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
2847 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2848 ext $h3.16b, $h3.16b, $h3.16b, #8
2849 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2850 ext $h4.16b, $h4.16b, $h4.16b, #8
2851
2852 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
2853 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
2854 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
2855
2856 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
2857 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
2858 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
2859
2860 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
2861 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
2862 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
2863
2864 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
2865 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
2866 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
2867
2868 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
2869 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
2870 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
2871
2872 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
2873 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
2874 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
2875
2876 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
2877 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
2878 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
2879
2880 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
2881 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
2882 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2883 ext $h1.16b, $h1.16b, $h1.16b, #8
2884 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2885 ext $h2.16b, $h2.16b, $h2.16b, #8
2886
2887 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
2888 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
2889 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
2890
2891 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
2892 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
2893 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
2894
2895 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
2896 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
2897
2898 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
2899 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
2900 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
2901
2902 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
2903 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
2904 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
2905
2906 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
2907 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
2908 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
2909
2910 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
2911 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
2912 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
2913
2914 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
2915 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
2916 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
2917
2918 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
2919 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
2920 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
2921
2922 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
2923 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
2924
2925 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
2926 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
2927 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
2928
2929 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
2930 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2931
2932 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
2933 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
2934 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
2935
2936 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
2937 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
2938 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
2939
2940 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
2941 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
2942 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
2943
2944 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
2945 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
2946 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
2947
2948 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
2949 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
2950 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
2951
2952 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
2953 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
2954 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
2955
2956 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
2957 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
2958 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
2959
2960 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
2961 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
2962 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
2963
2964 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
2965 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
2966 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
2967
2968 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
2969 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
2970 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
2971
2972 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
2973 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
2974 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
2975
2976 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
2977 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
2978 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
2979
2980 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2981 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
2982 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
2983
2984 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
2985 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
2986 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
2987
2988 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
2989 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
2990 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
2991
2992 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
2993 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
2994 ldr $rk12q, [$cc, #192] @ load rk12
2995 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2996
2997 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
2998 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
2999 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
3000
3001 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
3002 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
3003 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
3004
3005 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
3006 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
3007 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
3008
3009 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
3010 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
3011
3012 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
3013 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3014
3015 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
3016 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
3017 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
3018
3019 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
3020 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
3021 eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
3022
3023 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
3024 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
3025 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
3026
3027 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
3028 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
3029 eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
3030
3031 eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result
3032 eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result
3033 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
3034
3035 eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result
3036 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
3037 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
3038 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3039
3040 eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
3041 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
3042 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
3043
3044 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
3045 eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
3046 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
3047
3048 eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result
3049 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
3050
3051 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
3052
3053 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
3054
3055 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3056 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
3057 b.lt .L192_enc_main_loop
3058
3059 .L192_enc_prepretail: @ PREPRETAIL
3060 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
3061 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 (PREPRETAIL: AES rounds 0-11 on ctr0-ctr7 interleaved with GHASH + MODULO of res0-res7; no input load or output store)
3062 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
3063
3064 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
3065 ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap 64b halves of h7
3066 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
3067 ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap 64b halves of h8
3068 rev64 $res0b, $res0b @ GHASH block 8k
3069 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap 64b halves of acc_l before it is xored into block 8k
3070
3071 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
3072 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
3073 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
3074 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
3075
3076 rev64 $res3b, $res3b @ GHASH block 8k+3
3077 rev64 $res2b, $res2b @ GHASH block 8k+2
3078 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
3079 ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap 64b halves of h5
3080 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
3081 ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap 64b halves of h6
3082
3083 eor $res0b, $res0b, $acc_lb @ PRE 1
3084 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
3085 rev64 $res1b, $res1b @ GHASH block 8k+1
3086
3087 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
3088 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
3089 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
3090
3091 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
3092 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
3093 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
3094
3095 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
3096 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
3097 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
3098
3099 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
3100 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
3101 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3102
3103 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3104 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
3105 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
3106
3107 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
3108 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
3109 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
3110
3111 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
3112 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
3113 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
3114
3115 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
3116 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
3117 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
3118
3119 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
3120 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
3121 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
3122
3123 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
3124 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
3125 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
3126
3127 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
3128 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
3129 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
3130
3131 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
3132 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
3133 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
3134
3135 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
3136 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
3137 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
3138
3139 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
3140 rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
3141 rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
3142
3143 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
3144 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
3145 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
3146
3147 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
3148 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
3149 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
3150
3151 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
3152 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
3153 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
3154
3155 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
3156 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
3157 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
3158
3159 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3160 ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap 64b halves of h3
3161 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3162 ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap 64b halves of h4
3163 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
3164 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
3165
3166 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3167 ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap 64b halves of h1
3168 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3169 ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap 64b halves of h2
3170 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
3171 rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
3172
3173 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
3174 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
3175 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
3176
3177 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
3178 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
3179 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
3180
3181 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
3182 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
3183 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
3184
3185 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
3186 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
3187 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
3188
3189 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
3190 rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
3191 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
3192 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
3193
3194 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
3195 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
3196 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
3197
3198 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
3199 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
3200 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
3201
3202 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
3203 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
3204
3205 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
3206 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
3207 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
3208
3209 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
3210 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
3211 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
3212
3213 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
3214 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
3215 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
3216
3217 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
3218 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
3219 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
3220
3221 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
3222 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
3223
3224 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
3225 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
3226 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
3227
3228 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
3229 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
3230 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
3231
3232 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
3233 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
3234 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
3235
3236 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
3237 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
3238 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
3239
3240 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
3241 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
3242 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
3243
3244 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
3245 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
3246 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
3247
3248 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
3249 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
3250 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
3251
3252 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
3253 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
3254
3255 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
3256 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
3257
3258 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
3259 ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3260 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
3261 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3262
3263 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
3264 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
3265
3266 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
3267 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
3268 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
3269
3270 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
3271 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
3272 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
3273
3274 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
3275 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
3276 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
3277
3278 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
3279 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
3280 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
3281
3282 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
3283 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
3284
3285 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3286 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
3287 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
3288 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
3289
3290 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3291 ldr $rk12q, [$cc, #192] @ load rk12 (last round key; no aese uses it here, it is xored in by the eor3 result steps of the tail)
3292
3293 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
3294 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
3295 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
3296
3297 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
3298 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
3299 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
3300
3301 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
3302 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
3303
3304 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
3305 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
3306
3307 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
3308 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
3309
3310 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
3311 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
3312 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
3313
3314 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
3315 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
3316 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
3317
3318 .L192_enc_tail: @ TAIL
3319
3320 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k (TAIL: reload hash keys, encrypt block 8k+8, then branch on the number of bytes left)
3321 ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap 64b halves of h5
3322 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3323
3324 ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
3325
3326 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
3327 ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap 64b halves of h8
3328
3329 mov $t1.16b, $rk12
3330
3331 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
3332 ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap 64b halves of h6
3333 ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap 64b halves of h7
3334 cmp $main_end_input_ptr, #112 @ more than 7 blocks (112 bytes) left?
3335
3336 eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
3337 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3338 b.gt .L192_enc_blocks_more_than_7
3339
3340 cmp $main_end_input_ptr, #96 @ more than 6 blocks (96 bytes) left?
3341 mov $ctr7b, $ctr6b
3342 movi $acc_h.8b, #0 @ zero GHASH high accumulator
3343
3344 mov $ctr6b, $ctr5b
3345 movi $acc_l.8b, #0 @ zero GHASH low accumulator
3346 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3347
3348 mov $ctr5b, $ctr4b
3349 mov $ctr4b, $ctr3b
3350 mov $ctr3b, $ctr2b
3351
3352 mov $ctr2b, $ctr1b
3353 movi $acc_m.8b, #0 @ zero GHASH mid accumulator
3354 b.gt .L192_enc_blocks_more_than_6
3355
3356 mov $ctr7b, $ctr6b
3357 cmp $main_end_input_ptr, #80 @ more than 5 blocks (80 bytes) left?
3358
3359 mov $ctr6b, $ctr5b
3360 mov $ctr5b, $ctr4b
3361 mov $ctr4b, $ctr3b
3362
3363 mov $ctr3b, $ctr1b
3364 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3365 b.gt .L192_enc_blocks_more_than_5
3366
3367 cmp $main_end_input_ptr, #64 @ more than 4 blocks (64 bytes) left?
3368 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3369
3370 mov $ctr7b, $ctr6b
3371 mov $ctr6b, $ctr5b
3372 mov $ctr5b, $ctr4b
3373
3374 mov $ctr4b, $ctr1b
3375 b.gt .L192_enc_blocks_more_than_4
3376
3377 mov $ctr7b, $ctr6b
3378 mov $ctr6b, $ctr5b
3379 mov $ctr5b, $ctr1b
3380
3381 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3382 cmp $main_end_input_ptr, #48 @ more than 3 blocks (48 bytes) left?
3383 b.gt .L192_enc_blocks_more_than_3
3384
3385 mov $ctr7b, $ctr6b
3386 mov $ctr6b, $ctr1b
3387 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3388
3389 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
3390 cmp $main_end_input_ptr, #32 @ more than 2 blocks (32 bytes) left?
3391 b.gt .L192_enc_blocks_more_than_2
3392
3393 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3394
3395 cmp $main_end_input_ptr, #16 @ more than 1 block (16 bytes) left?
3396 mov $ctr7b, $ctr1b
3397 b.gt .L192_enc_blocks_more_than_1
3398
3399 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3400 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
3401 b .L192_enc_blocks_less_than_1
3402 .L192_enc_blocks_more_than_7: @ blocks left > 7
3403 st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result (then GHASH it with h8 and the high 64b of h78k)
3404
3405 rev64 $res0b, $res1b @ GHASH final-7 block
3406 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
3407
3408 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3409
3410 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
3411
3412 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
3413
3414 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
3415 movi $t0.8b, #0 @ suppress further partial tag feed in
3416 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
3417
3418 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
3419
3420 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
3421 eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
3422 .L192_enc_blocks_more_than_6: @ blocks left > 6
3423
3424 st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result (then GHASH it with h7 and the low 64b of h78k)
3425
3426 rev64 $res0b, $res1b @ GHASH final-6 block
3427
3428 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
3429
3430 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3431
3432 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
3433
3434 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
3435 eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
3436
3437 movi $t0.8b, #0 @ suppress further partial tag feed in
3438 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
3439 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
3440
3441 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
3442
3443 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
3444 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
3445
3446 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
3447 .L192_enc_blocks_more_than_5: @ blocks left > 5
3448
3449 st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result (then GHASH it with h6 and the high 64b of h56k)
3450
3451 rev64 $res0b, $res1b @ GHASH final-5 block
3452
3453 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3454
3455 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
3456
3457 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
3458 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
3459
3460 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
3461 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
3462
3463 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
3464 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
3465
3466 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
3467 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
3468
3469 eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
3470 movi $t0.8b, #0 @ suppress further partial tag feed in
3471
3472 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
3473 .L192_enc_blocks_more_than_4: @ blocks left > 4
3474
3475 st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result (then GHASH it with h5 and the low 64b of h56k)
3476
3477 rev64 $res0b, $res1b @ GHASH final-4 block
3478
3479 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3480
3481 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
3482 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
3483 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
3484
3485 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
3486 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
3487
3488 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
3489
3490 movi $t0.8b, #0 @ suppress further partial tag feed in
3491 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
3492
3493 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
3494
3495 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
3496 eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
3497 .L192_enc_blocks_more_than_3: @ blocks left > 3
3498
3499 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
3500 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result (then GHASH it with h4 and the high 64b of h34k)
3501
3502 rev64 $res0b, $res1b @ GHASH final-3 block
3503
3504 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3505 movi $t0.8b, #0 @ suppress further partial tag feed in
3506
3507 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
3508 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3509 ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap 64b halves of h4
3510
3511 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
3512
3513 eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
3514 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3515
3516 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
3517 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
3518
3519 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
3520 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
3521
3522 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
3523
3524 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
3525 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
3526 .L192_enc_blocks_more_than_2: @ blocks left > 2
3527
3528 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result (then GHASH it with h3 and the low 64b of h34k)
3529
3530 rev64 $res0b, $res1b @ GHASH final-2 block
3531 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3532 ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap 64b halves of h3
3533
3534 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3535
3536 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
3537 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
3538
3539 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3540
3541 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3542 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
3543 movi $t0.8b, #0 @ suppress further partial tag feed in
3544
3545 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3546
3547 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3548 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3549
3550 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
3551 eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
3552 .L192_enc_blocks_more_than_1: @ blocks left > 1
3553
3554 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3555 ext $h2.16b, $h2.16b, $h2.16b, #8
3556 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
3557
3558 rev64 $res0b, $res1b @ GHASH final-1 block
3559
3560 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3561
3562 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
3563 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3564
3565 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3566 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3567 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3568
3569 ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
3570 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
3571
3572 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3573
3574 eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
3575 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3576
3577 movi $t0.8b, #0 @ suppress further partial tag feed in
3578
3579 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
3580 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3581 .L192_enc_blocks_less_than_1: @ blocks left <= 1
3582
3583 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
3584 and $bit_length, $bit_length, #127 @ bit_length %= 128
3585
3586 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3587
3588 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3589
3590 and $bit_length, $bit_length, #127 @ bit_length %= 128
3591
3592 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
3593 cmp $bit_length, #64
3594 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
3595
3596 csel $temp2_x, $temp1_x, $temp0_x, lt
3597 csel $temp3_x, $temp0_x, xzr, lt
3598
3599 mov $ctr0.d[1], $temp3_x
3600 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3601 ext $h1.16b, $h1.16b, $h1.16b, #8
3602
3603 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
3604 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
3605
3606 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3607
3608 rev64 $res0b, $res1b @ GHASH final block
3609 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
3610
3611 st1 { $res1b}, [$output_ptr] @ store all 16B
3612
3613 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3614
3615 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
3616 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
3617
3618 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3619 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3620
3621 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3622
3623 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3624
3625 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
3626 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
3627
3628 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3629 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3630
3631 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
3632
3633 str $rtmp_ctrq, [$counter] @ store the updated counter
3634 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
3635
3636 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3637
3638 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
3639
3640 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3641 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3642
3643 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
3644 ext $acc_lb, $acc_lb, $acc_lb, #8
3645 rev64 $acc_lb, $acc_lb
3646 st1 { $acc_l.16b }, [$current_tag]
3647
3648 lsr x0, $bit_length, #3 @ return sizes
3649
3650 ldp d10, d11, [sp, #16]
3651 ldp d12, d13, [sp, #32]
3652 ldp d14, d15, [sp, #48]
3653 ldp d8, d9, [sp], #80
3654 ret
3655
3656 .L192_enc_ret:
3657 mov w0, #0x0
3658 ret
3659 .size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
3660 ___
3661
3662 #########################################################################################
3663 # size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
3664 # size_t len, /* length in bits */
3665 # unsigned char *out,
3666 # u64 *Xi,
3667 # unsigned char ivec[16],
3668 # const void *key);
3669 #
3670 $code.=<<___;
3671 .global unroll8_eor3_aes_gcm_dec_192_kernel
3672 .type unroll8_eor3_aes_gcm_dec_192_kernel,%function
3673 .align 4
3674 unroll8_eor3_aes_gcm_dec_192_kernel:
3675 AARCH64_VALID_CALL_TARGET
3676 cbz x1, .L192_dec_ret
3677 stp d8, d9, [sp, #-80]!
3678 mov $counter, x4
3679 mov $cc, x5
3680 stp d10, d11, [sp, #16]
3681 stp d12, d13, [sp, #32]
3682 stp d14, d15, [sp, #48]
3683 mov x5, #0xc200000000000000
3684 stp x5, xzr, [sp, #64]
3685 add $modulo_constant, sp, #64
3686
3687 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
3688 ld1 { $ctr0b}, [$counter] @ CTR block 0
3689 ld1 { $acc_lb}, [$current_tag]
3690
3691 mov $constant_temp, #0x100000000 @ set up counter increment
3692 movi $rctr_inc.16b, #0x0
3693 mov $rctr_inc.d[1], $constant_temp
3694
3695 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
3696
3697 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
3698
3699 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
3700 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
3701
3702 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
3703 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
3704
3705 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
3706 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
3707
3708 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
3709 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
3710
3711 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
3712 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
3713 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
3714
3715 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
3716 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
3717
3718 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
3719
3720 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3721 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
3722 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
3723
3724 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3725 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3726 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
3727
3728 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3729 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
3730 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
3731
3732 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
3733
3734 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3735
3736 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3737 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3738 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
3739
3740 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
3741 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
3742
3743 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
3744 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3745 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
3746
3747 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
3748 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3749 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3750
3751 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3752 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
3753 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
3754
3755 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
3756
3757 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
3758 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3759 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
3760
3761 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
3762 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
3763
3764 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
3765 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3766 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
3767
3768 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3769 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3770 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
3771
3772 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3773 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
3774 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
3775
3776 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3777 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
3778 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
3779
3780 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
3781 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
3782
3783 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3784 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
3785 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3786
3787 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3788 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3789 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
3790
3791 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3792
3793 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
3794 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
3795 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3796
3797 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3798 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3799 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
3800
3801 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
3802 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3803 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
3804
3805 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
3806
3807 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3808 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
3809
3810 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3811 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3812 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
3813
3814 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
3815 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3816 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
3817
3818 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3819 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3820 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3821
3822 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
3823 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
3824 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
3825
3826 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
3827 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3828 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3829
3830 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3831 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
3832
3833 ld1 { $acc_lb}, [$current_tag]
3834 ext $acc_lb, $acc_lb, $acc_lb, #8
3835 rev64 $acc_lb, $acc_lb
3836
3837 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
3838
3839 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3840 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3841
3842 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3843 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
3844 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
3845
3846 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3847 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3848
3849 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
3850 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3851
3852 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3853 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3854 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
3855
3856 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
3857 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3858 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3859
3860 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
3861 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
3862 ldr $rk12q, [$cc, #192] @ load rk12
3863
3864 aese $ctr0b, $rk11 @ AES block 0 - round 11
3865 aese $ctr1b, $rk11 @ AES block 1 - round 11
3866 aese $ctr4b, $rk11 @ AES block 4 - round 11
3867
3868 aese $ctr6b, $rk11 @ AES block 6 - round 11
3869 aese $ctr5b, $rk11 @ AES block 5 - round 11
3870 aese $ctr7b, $rk11 @ AES block 7 - round 11
3871
3872 aese $ctr2b, $rk11 @ AES block 2 - round 11
3873 aese $ctr3b, $rk11 @ AES block 3 - round 11
3874 b.ge .L192_dec_tail @ handle tail
3875
3876 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
3877
3878 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
3879
3880 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
3881
3882 eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result
3883 eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result
3884 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
3885
3886 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
3887 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
3888
3889 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
3890 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
3891 eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result
3892
3893 eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result
3894 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
3895 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
3896
3897 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
3898 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
3899
3900 eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result
3901
3902 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
3903 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
3904
3905 eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result
3906 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
3907 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3908
3909 eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result
3910 eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result
3911 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
3912
3913 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
3914 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
3915 b.ge .L192_dec_prepretail @ do prepretail
3916
3917 .L192_dec_main_loop: @ main loop start
3918 rev64 $res1b, $res1b @ GHASH block 8k+1
3919 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
3920 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3921
3922 rev64 $res0b, $res0b @ GHASH block 8k
3923 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
3924 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
3925
3926 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
3927 ext $h7.16b, $h7.16b, $h7.16b, #8
3928 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
3929 ext $h8.16b, $h8.16b, $h8.16b, #8
3930 rev64 $res4b, $res4b @ GHASH block 8k+4
3931 rev64 $res3b, $res3b @ GHASH block 8k+3
3932
3933 eor $res0b, $res0b, $acc_lb @ PRE 1
3934 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
3935 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
3936
3937 rev64 $res5b, $res5b @ GHASH block 8k+5
3938
3939 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
3940 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
3941 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
3942
3943 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
3944 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
3945 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
3946
3947 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
3948 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
3949 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
3950
3951 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
3952 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
3953 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
3954
3955 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
3956 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
3957 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
3958 ext $h5.16b, $h5.16b, $h5.16b, #8
3959 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
3960 ext $h6.16b, $h6.16b, $h6.16b, #8
3961
3962 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
3963 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
3964 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
3965
3966 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
3967 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
3968 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
3969
3970 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3971 rev64 $res2b, $res2b @ GHASH block 8k+2
3972 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
3973
3974 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
3975 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
3976 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
3977 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
3978
3979 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
3980 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
3981 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
3982
3983 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
3984 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
3985 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
3986
3987 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
3988 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
3989 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
3990
3991 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
3992 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
3993 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
3994
3995 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
3996 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
3997 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
3998
3999 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4000 ext $h3.16b, $h3.16b, $h3.16b, #8
4001 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4002 ext $h4.16b, $h4.16b, $h4.16b, #8
4003 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
4004 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
4005
4006 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
4007 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4008 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4009
4010 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
4011 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
4012
4013 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
4014 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
4015 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
4016
4017 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4018 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
4019 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
4020
4021 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4022 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
4023
4024 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
4025 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
4026 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
4027
4028 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
4029 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
4030 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
4031
4032 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
4033 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
4034 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
4035
4036 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
4037 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
4038 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
4039
4040 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
4041 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
4042 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
4043
4044 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4045 ext $h1.16b, $h1.16b, $h1.16b, #8
4046 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4047 ext $h2.16b, $h2.16b, $h2.16b, #8
4048 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
4049 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
4050
4051 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
4052 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
4053 rev64 $res7b, $res7b @ GHASH block 8k+7
4054
4055 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
4056 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4057 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
4058
4059 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
4060 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4061 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
4062
4063 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
4064 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
4065 rev64 $res6b, $res6b @ GHASH block 8k+6
4066
4067 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4068 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4069 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
4070 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
4071
4072 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
4073 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4074 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4075
4076 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
4077 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
4078 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
4079
4080 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
4081 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
4082 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
4083
4084 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
4085 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
4086 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
4087
4088 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
4089 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
4090 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
4091
4092 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
4093 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4094 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
4095
4096 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
4097 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
4098 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
4099
4100 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4101 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
4102 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
4103
4104 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
4105 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
4106 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
4107
4108 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4109 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
4110 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
4111
4112 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
4113 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
4114 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
4115
4116 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
4117 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
4118 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
4119
4120 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
4121 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
4122 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
4123
4124 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
4125 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
4126 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
4127
4128 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
4129 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
4130 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
4131
4132 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
4133 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4134 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
4135
4136 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
4137 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
4138 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
4139
4140 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
4141 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
4142
4143 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
4144 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
4145 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
4146
4147 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
4148 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4149 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
4150
4151 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
4152 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
4153 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4154
4155 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
4156 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
4157 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
4158
4159 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
4160 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
4161 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
4162
4163 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
4164 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
4165 ldr $rk12q, [$cc, #192] @ load rk12
4166
4167 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
4168 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
4169 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
4170
4171 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
4172 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4173 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
4174
4175 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
4176 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
4177 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
4178
4179 eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result
4180 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
4181 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
4182
4183 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
4184 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
4185 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
4186
4187 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
4188 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
4189 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4190
4191 eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result
4192 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
4193 eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result
4194
4195 eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result
4196 eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result
4197 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
4198
4199 eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result
4200 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
4201 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
4202
4203 eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result
4204 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
4205 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4206
4207 eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result
4208 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
4209 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
4210
4211 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
4212 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
4213
4214 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
4215 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
4216 b.lt .L192_dec_main_loop
4217
4218 .L192_dec_prepretail: @ PREPRETAIL
4219 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
4220 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
4221 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
4222
4223 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
4224 ext $h7.16b, $h7.16b, $h7.16b, #8
4225 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
4226 ext $h8.16b, $h8.16b, $h8.16b, #8
4227 rev64 $res0b, $res0b @ GHASH block 8k
4228 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4229
4230 rev64 $res3b, $res3b @ GHASH block 8k+3
4231 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
4232 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
4233
4234 eor $res0b, $res0b, $acc_lb @ PRE 1
4235 rev64 $res2b, $res2b @ GHASH block 8k+2
4236 rev64 $res1b, $res1b @ GHASH block 8k+1
4237
4238 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
4239 ext $h5.16b, $h5.16b, $h5.16b, #8
4240 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
4241 ext $h6.16b, $h6.16b, $h6.16b, #8
4242 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
4243
4244 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
4245 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
4246 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
4247
4248 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
4249 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
4250 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
4251
4252 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
4253 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
4254 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
4255
4256 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
4257 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
4258 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
4259
4260 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
4261 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
4262 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
4263
4264 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
4265 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
4266 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
4267
4268 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
4269 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
4270 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
4271
4272 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
4273 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
4274 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
4275
4276 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
4277 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
4278 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
4279
4280 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
4281 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
4282 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
4283 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
4284
4285 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
4286 rev64 $res5b, $res5b @ GHASH block 8k+5
4287 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
4288
4289 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
4290 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
4291 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
4292
4293 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4294 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
4295 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
4296
4297 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
4298 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
4299 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
4300
4301 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
4302 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
4303 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
4304
4305 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
4306 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4307 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
4308
4309 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
4310 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
4311 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
4312
4313 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
4314 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
4315 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
4316
4317 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4318 ext $h3.16b, $h3.16b, $h3.16b, #8
4319 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4320 ext $h4.16b, $h4.16b, $h4.16b, #8
4321 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
4322 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
4323
4324 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4325 ext $h1.16b, $h1.16b, $h1.16b, #8
4326 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4327 ext $h2.16b, $h2.16b, $h2.16b, #8
4328 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
4329 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
4330
4331 rev64 $res7b, $res7b @ GHASH block 8k+7
4332
4333 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
4334 rev64 $res4b, $res4b @ GHASH block 8k+4
4335
4336 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
4337 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
4338 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
4339
4340 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
4341 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
4342 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
4343
4344 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
4345 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
4346 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
4347
4348 rev64 $res6b, $res6b @ GHASH block 8k+6
4349 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4350 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4351 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4352
4353 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
4354 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
4355 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
4356
4357 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
4358 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
4359 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
4360
4361 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
4362 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
4363 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
4364
4365 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
4366
4367 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
4368 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
4369 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
4370
4371 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
4372 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4373 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
4374
4375 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
4376 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
4377 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4378
4379 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
4380 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
4381
4382 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4383 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
4384 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
4385
4386 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
4387 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
4388 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
4389
4390 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
4391 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
4392 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
4393
4394 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
4395 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
4396 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
4397
4398 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
4399 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
4400 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
4401
4402 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
4403 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
4404 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
4405
4406 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
4407 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
4408 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
4409
4410 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
4411 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
4412 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
4413
4414 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
4415 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
4416 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
4417
4418 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
4419 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
4420
4421 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
4422 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4423 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
4424
4425 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
4426 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
4427 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
4428
4429 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
4430 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4431 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
4432
4433 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
4434 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
4435 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
4436
4437 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
4438 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
4439 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
4440
4441 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
4442 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
4443 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
4444
4445 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
4446 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
4447 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
4448
4449 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4450 ldr $rk12q, [$cc, #192] @ load rk12
4451 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4452
4453 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
4454 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
4455 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
4456
4457 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
4458 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
4459 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
4460
4461 aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
4462 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
4463 aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
4464
4465 aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
4466 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
4467 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
4468
4469 aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
4470 aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
4471 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
4472
4473 aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
4474 aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
4475 aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
4476
4477 .L192_dec_tail: @ TAIL
4478
4479 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4480
4481 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h, h6k | h5k
4482 ext $h5.16b, $h5.16b, $h5.16b, #8
4483 ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
4484
4485 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k, h8l | h8h
4486 ext $h8.16b, $h8.16b, $h8.16b, #8
4487
4488 mov $t1.16b, $rk12
4489
4490 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h, h7l | h7h
4491 ext $h6.16b, $h6.16b, $h6.16b, #8
4492 ext $h7.16b, $h7.16b, $h7.16b, #8
4493 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4494
4495 eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
4496 cmp $main_end_input_ptr, #112
4497 b.gt .L192_dec_blocks_more_than_7
4498
4499 mov $ctr7b, $ctr6b
4500 movi $acc_h.8b, #0
4501 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4502
4503 mov $ctr6b, $ctr5b
4504 mov $ctr5b, $ctr4b
4505 mov $ctr4b, $ctr3b
4506
4507 cmp $main_end_input_ptr, #96
4508 movi $acc_l.8b, #0
4509 mov $ctr3b, $ctr2b
4510
4511 mov $ctr2b, $ctr1b
4512 movi $acc_m.8b, #0
4513 b.gt .L192_dec_blocks_more_than_6
4514
4515 mov $ctr7b, $ctr6b
4516 mov $ctr6b, $ctr5b
4517 mov $ctr5b, $ctr4b
4518
4519 mov $ctr4b, $ctr3b
4520 mov $ctr3b, $ctr1b
4521
4522 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4523 cmp $main_end_input_ptr, #80
4524 b.gt .L192_dec_blocks_more_than_5
4525
4526 mov $ctr7b, $ctr6b
4527 mov $ctr6b, $ctr5b
4528
4529 mov $ctr5b, $ctr4b
4530 mov $ctr4b, $ctr1b
4531 cmp $main_end_input_ptr, #64
4532
4533 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4534 b.gt .L192_dec_blocks_more_than_4
4535
4536 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4537 mov $ctr7b, $ctr6b
4538 mov $ctr6b, $ctr5b
4539
4540 mov $ctr5b, $ctr1b
4541 cmp $main_end_input_ptr, #48
4542 b.gt .L192_dec_blocks_more_than_3
4543
4544 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4545 mov $ctr7b, $ctr6b
4546 cmp $main_end_input_ptr, #32
4547
4548 mov $ctr6b, $ctr1b
4549 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4550 b.gt .L192_dec_blocks_more_than_2
4551
4552 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4553
4554 mov $ctr7b, $ctr1b
4555 cmp $main_end_input_ptr, #16
4556 b.gt .L192_dec_blocks_more_than_1
4557
4558 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4559 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4560 b .L192_dec_blocks_less_than_1
4561 .L192_dec_blocks_more_than_7: @ blocks left > 7
4562 rev64 $res0b, $res1b @ GHASH final-7 block
4563
4564 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
4565 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4566
4567 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
4568 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
4569 ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
4570
4571 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
4572
4573 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
4574 st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
4575
4576 eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
4577
4578 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
4579 movi $t0.8b, #0 @ suppress further partial tag feed in
4580 .L192_dec_blocks_more_than_6: @ blocks left > 6
4581
4582 rev64 $res0b, $res1b @ GHASH final-6 block
4583
4584 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4585
4586 ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
4587 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
4588
4589 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
4590 movi $t0.8b, #0 @ suppress further partial tag feed in
4591 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
4592
4593 st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
4594 eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
4595
4596 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
4597 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
4598 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
4599
4600 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
4601 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
4602 .L192_dec_blocks_more_than_5: @ blocks left > 5
4603
4604 rev64 $res0b, $res1b @ GHASH final-5 block
4605
4606 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4607
4608 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
4609
4610 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
4611
4612 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
4613 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
4614
4615 ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
4616
4617 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
4618 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
4619
4620 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
4621
4622 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
4623 movi $t0.8b, #0 @ suppress further partial tag feed in
4624 st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
4625
4626 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
4627 eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
4628 .L192_dec_blocks_more_than_4: @ blocks left > 4
4629
4630 rev64 $res0b, $res1b @ GHASH final-4 block
4631
4632 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4633 movi $t0.8b, #0 @ suppress further partial tag feed in
4634
4635 ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
4636 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
4637 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
4638
4639 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
4640
4641 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
4642
4643 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
4644 st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
4645 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
4646
4647 eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
4648
4649 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
4650 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
4651 .L192_dec_blocks_more_than_3: @ blocks left > 3
4652
4653 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4654 ext $h4.16b, $h4.16b, $h4.16b, #8
4655 rev64 $res0b, $res1b @ GHASH final-3 block
4656 ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
4657
4658 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4659
4660 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
4661 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
4662
4663 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
4664 movi $t0.8b, #0 @ suppress further partial tag feed in
4665 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
4666
4667 st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
4668 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4669 eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
4670
4671 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
4672 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
4673
4674 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
4675
4676 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
4677
4678 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
4679 .L192_dec_blocks_more_than_2: @ blocks left > 2
4680
4681 rev64 $res0b, $res1b @ GHASH final-2 block
4682 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4683 ext $h3.16b, $h3.16b, $h3.16b, #8
4684
4685 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4686
4687 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
4688 ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
4689
4690 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4691
4692 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4693
4694 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4695 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4696
4697 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
4698 movi $t0.8b, #0 @ suppress further partial tag feed in
4699
4700 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4701 st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
4702
4703 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4704 eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
4705 .L192_dec_blocks_more_than_1: @ blocks left > 1
4706
4707 rev64 $res0b, $res1b @ GHASH final-1 block
4708 ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
4709 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4710 ext $h2.16b, $h2.16b, $h2.16b, #8
4711
4712 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4713 movi $t0.8b, #0 @ suppress further partial tag feed in
4714 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
4715
4716 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4717 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
4718 st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
4719
4720 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4721
4722 eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
4723
4724 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4725
4726 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4727
4728 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4729
4730 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4731
4732 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4733 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4734 .L192_dec_blocks_less_than_1: @ blocks left <= 1
4735
4736 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
4737 and $bit_length, $bit_length, #127 @ bit_length %= 128
4738
4739 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4740 str $rtmp_ctrq, [$counter] @ store the updated counter
4741
4742 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4743 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
4744
4745 and $bit_length, $bit_length, #127 @ bit_length %= 128
4746
4747 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
4748 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
4749 cmp $bit_length, #64
4750
4751 csel $temp2_x, $temp1_x, $temp0_x, lt
4752 csel $temp3_x, $temp0_x, xzr, lt
4753 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4754 ext $h1.16b, $h1.16b, $h1.16b, #8
4755
4756 mov $ctr0.d[1], $temp3_x
4757 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4758
4759 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
4760
4761 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4762 bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4763
4764 rev64 $res0b, $res1b @ GHASH final block
4765
4766 st1 { $res4b}, [$output_ptr] @ store all 16B
4767
4768 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4769
4770 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
4771 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4772
4773 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4774 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4775 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4776
4777 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4778 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4779
4780 eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
4781 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4782 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
4783
4784 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4785 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4786
4787 eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
4788
4789 eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
4790
4791 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4792 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4793
4794 eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
4795 ext $acc_lb, $acc_lb, $acc_lb, #8
4796 rev64 $acc_lb, $acc_lb
4797 st1 { $acc_l.16b }, [$current_tag]
4798
4799 ldp d10, d11, [sp, #16]
4800 ldp d12, d13, [sp, #32]
4801 ldp d14, d15, [sp, #48]
4802 ldp d8, d9, [sp], #80
4803 ret
4804
4805 .L192_dec_ret:
4806 mov w0, #0x0
4807 ret
4808 .size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
4809 ___
4810 }
4811
4812 {
4813
4814 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
4815 my ($temp2_x,$temp3_x)=map("x$_",(13..14));
4816 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
4817 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
4818 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
4819 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
4820 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
4821
4822 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
4823 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
4824 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
4825
4826 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
4827 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
4828
4829 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
4830 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
4831 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
4832 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
4833
4834 my $t0="v16";
4835 my $t0d="d16";
4836
4837 my $t1="v29";
4838 my $t2=$res1;
4839 my $t3=$t1;
4840
4841 my $t4=$res0;
4842 my $t5=$res2;
4843 my $t6=$t0;
4844
4845 my $t7=$res3;
4846 my $t8=$res4;
4847 my $t9=$res5;
4848
4849 my $t10=$res6;
4850 my $t11="v21";
4851 my $t12=$t1;
4852
4853 my $rtmp_ctr="v30";
4854 my $rtmp_ctrq="q30";
4855 my $rctr_inc="v31";
4856 my $rctr_incd="d31";
4857
4858 my $mod_constantd=$t0d;
4859 my $mod_constant=$t0;
4860
4861 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
4862 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
4863 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
4864 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
4865 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
4866 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
4867 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
4868 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
4869 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
4870 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
4871 my $rk2q1="v28.1q";
4872 my $rk3q1="v26.1q";
4873 my $rk4v="v27";
4874 #########################################################################################
4875 # size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
4876 # size_t len,
4877 # unsigned char *out,
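4878 # u64 *Xi,
4879 # unsigned char ivec[16],
4880 # const void *key);
4881 # Argument order follows the register use below: ivec is read from x4 and the round keys from x5 (Xi in x3 by elimination). len is presumably a bit count, since it is shifted right by 3 to obtain byte_len - confirm against the C prototype.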
4882 $code.=<<___;
4883 .global unroll8_eor3_aes_gcm_enc_256_kernel
4884 .type unroll8_eor3_aes_gcm_enc_256_kernel,%function
4885 .align 4
4886 unroll8_eor3_aes_gcm_enc_256_kernel:
4887 AARCH64_VALID_CALL_TARGET
4888 cbz x1, .L256_enc_ret
4889 stp d8, d9, [sp, #-80]!
4890 mov $counter, x4
4891 mov $cc, x5
4892 stp d10, d11, [sp, #16]
4893 stp d12, d13, [sp, #32]
4894 stp d14, d15, [sp, #48]
4895 mov x5, #0xc200000000000000
4896 stp x5, xzr, [sp, #64]
4897 add $modulo_constant, sp, #64
4898
4899 ld1 { $ctr0b}, [$counter] @ CTR block 0
4900
4901 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
4902
4903 mov $constant_temp, #0x100000000 @ set up counter increment
4904 movi $rctr_inc.16b, #0x0
4905 mov $rctr_inc.d[1], $constant_temp
4906 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4907
4908 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4909
4910 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4911
4912 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
4913
4914 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
4915
4916 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
4917 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
4918
4919 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
4920 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
4921
4922 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
4923 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
4924
4925 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
4926 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
4927
4928 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
4929 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
4930 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
4931
4932 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
4933 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
4934
4935 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
4936
4937 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4938 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
4939 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4940
4941 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4942 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4943 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
4944
4945 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
4946 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
4947 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
4948
4949 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
4950 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4951 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4952
4953 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
4954 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
4955
4956 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4957
4958 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
4959
4960 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4961 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4962 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4963
4964 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
4965 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
4966 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
4967
4968 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
4969 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4970 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4971
4972 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
4973 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4974 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
4975
4976 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
4977
4978 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4979 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
4980 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
4981
4982 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4983 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4984
4985 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
4986 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
4987 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4988
4989 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4990 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4991
4992 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
4993 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
4994 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
4995
4996 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
4997 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
4998 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
4999
5000 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
5001 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
5002 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
5003
5004 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
5005 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
5006 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
5007
5008 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
5009 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
5010 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
5011
5012 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
5013 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
5014 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
5015
5016 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
5017 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
5018 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
5019
5020 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
5021 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
5022
5023 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
5024 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
5025 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
5026
5027 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
5028 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
5029
5030 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
5031
5032 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
5033 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
5034
5035 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
5036 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
5037 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
5038
5039 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
5040 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
5041 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
5042
5043 ld1 { $acc_lb}, [$current_tag]
5044 ext $acc_lb, $acc_lb, $acc_lb, #8
5045 rev64 $acc_lb, $acc_lb
5046 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
5047
5048 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
5049 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
5050 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
5051
5052 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
5053 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
5054 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
5055
5056 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
5057
5058 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
5059 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
5060 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
5061
5062 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
5063 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
5064 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
5065
5066 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
5067 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
5068 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
5069
5070 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
5071 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
5072 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
5073
5074 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
5075 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
5076 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
5077
5078 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
5079 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
5080 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
5081
5082 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
5083 ldr $rk14q, [$cc, #224] @ load rk14
5084
5085 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
5086 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
5087 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
5088
5089 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
5090 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
5091 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
5092
5093 aese $ctr2b, $rk13 @ AES block 2 - round 13
5094 aese $ctr1b, $rk13 @ AES block 1 - round 13
5095 aese $ctr4b, $rk13 @ AES block 4 - round 13
5096
5097 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
5098 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
5099
5100 aese $ctr0b, $rk13 @ AES block 0 - round 13
5101 aese $ctr5b, $rk13 @ AES block 5 - round 13
5102
5103 aese $ctr6b, $rk13 @ AES block 6 - round 13
5104 aese $ctr7b, $rk13 @ AES block 7 - round 13
5105 aese $ctr3b, $rk13 @ AES block 3 - round 13
5106
5107 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
5108 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
5109 b.ge .L256_enc_tail @ handle tail
5110
5111 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
5112
5113 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
5114
5115 eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result
5116 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
5117 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
5118
5119 eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result
5120 eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result
5121
5122 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
5123 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
5124 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
5125
5126 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
5127 eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result
5128 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
5129
5130 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
5131 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
5132 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
5133
5134 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
5135
5136 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
5137 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
5138
5139 eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
5140
5141 eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
5142 eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
5143 eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
5144
5145 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
5146 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
5147
5148 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
5149 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
5150 b.ge .L256_enc_prepretail @ do prepretail
5151
5152 .L256_enc_main_loop: @ main loop start - per iteration: AES-CTR blocks 8k+8..8k+15, GHASH of previous result blocks 8k..8k+7, then MODULO reduction
5153 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
5154
5155 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
5156 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
5157 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
5158 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
5159
5160 rev64 $res3b, $res3b @ GHASH block 8k+3
5161 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
5162 ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap 64-bit halves of h5
5163 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
5164 ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap 64-bit halves of h6
5165 rev64 $res1b, $res1b @ GHASH block 8k+1
5166
5167 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
5168 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
5169 rev64 $res0b, $res0b @ GHASH block 8k
5170
5171 rev64 $res4b, $res4b @ GHASH block 8k+4
5172 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5173 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
5174 ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap 64-bit halves of h7
5175 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
5176 ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap 64-bit halves of h8
5177
5178 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
5179 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
5180 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
5181
5182 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
5183 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
5184 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
5185
5186 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
5187 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
5188 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
5189
5190 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
5191 eor $res0b, $res0b, $acc_lb @ PRE 1
5192 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
5193
5194 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
5195 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
5196 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
5197
5198 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
5199 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
5200 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
5201
5202 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
5203 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
5204 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
5205
5206 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5207 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5208 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
5209
5210 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
5211 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
5212 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
5213
5214 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
5215 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
5216 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
5217
5218 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
5219 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
5220 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
5221
5222 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
5223 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
5224 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
5225
5226 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
5227 rev64 $res6b, $res6b @ GHASH block 8k+6
5228 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
5229
5230 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
5231 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
5232 rev64 $res2b, $res2b @ GHASH block 8k+2
5233
5234 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
5235 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
5236 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
5237
5238 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
5239 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
5240 rev64 $res5b, $res5b @ GHASH block 8k+5
5241
5242 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
5243 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
5244 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5245 ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap 64-bit halves of h3
5246 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5247 ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap 64-bit halves of h4
5248
5249 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5250 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
5251 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
5252
5253 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
5254 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
5255 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
5256
5257 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
5258 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
5259 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
5260
5261 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5262 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
5263 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
5264
5265 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5266 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
5267 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
5268
5269 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
5270 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
5271 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
5272
5273 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5274 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
5275 rev64 $res7b, $res7b @ GHASH block 8k+7
5276
5277 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
5278 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
5279 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
5280
5281 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
5282 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
5283 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
5284
5285 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
5286 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
5287 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
5288
5289 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
5290 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
5291 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
5292
5293 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
5294 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
5295 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
5296
5297 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
5298 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
5299 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
5300
5301 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
5302 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
5303 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
5304
5305 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
5306 ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap 64-bit halves of h1
5307 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
5308 ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap 64-bit halves of h2
5309 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
5310 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5311
5312 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
5313 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5314 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
5315 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
5316
5317 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
5318 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
5319 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
5320
5321 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5322 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
5323 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
5324
5325 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
5326 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
5327 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
5328
5329 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
5330 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5331 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5332
5333 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
5334 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
5335 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
5336
5337 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
5338 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
5339 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
5340
5341 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
5342 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
5343 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
5344
5345 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
5346 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5347 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
5348
5349 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5350 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
5351 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
5352
5353 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5354 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
5355 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
5356
5357 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
5358 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
5359 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
5360
5361 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
5362 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
5363 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
5364
5365 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
5366 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
5367 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
5368
5369 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
5370
5371 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5372 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
5373 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
5374
5375 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
5376 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
5377 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
5378
5379 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
5380 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
5381 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
5382
5383 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
5384 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
5385 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
5386
5387 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
5388
5389 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
5390 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
5391
5392 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5393 ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
5394 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
5395
5396 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
5397 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
5398 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
5399
5400 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
5401 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
5402
5403 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5404 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
5405
5406 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
5407 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
5408
5409 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
5410 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
5411 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
5412
5413 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
5414 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
5415 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
5416
5417 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
5418 ldr $rk14q, [$cc, #224] @ load rk14
5419 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
5420
5421 ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
5422 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
5423 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
5424
5425 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
5426 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
5427 ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
5428
5429 ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
5430 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
5431 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
5432
5433 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
5434 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
5435 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
5436
5437 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
5438 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
5439 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5440
5441 eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result
5442 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
5443 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
5444
5445 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
5446 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
5447 eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 8k+13 - result
5448
5449 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5450 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5451 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
5452
5453 eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 8k+12 - result
5454 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
5455 eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result
5456
5457 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
5458 eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result
5459 eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result
5460
5461 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
5462 stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
5463 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
5464
5465 eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 8k+15 - result
5466 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
5467 stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
5468
5469 eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 8k+14 - result
5470 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
5471 stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
5472
5473 stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
5474 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
5475 b.lt .L256_enc_main_loop
5476
5477 .L256_enc_prepretail: @ PREPRETAIL
5478 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
5479 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
5480 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
5481
5482 rev64 $res2b, $res2b @ GHASH block 8k+2
5483
5484 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
5485 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
5486
5487 rev64 $res5b, $res5b @ GHASH block 8k+5
5488 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
5489 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
5490
5491 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
5492
5493 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
5494 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
5495 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
5496
5497 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
5498 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
5499
5500 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
5501 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
5502 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
5503
5504 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5505 rev64 $res0b, $res0b @ GHASH block 8k
5506 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
5507
5508 rev64 $res1b, $res1b @ GHASH block 8k+1
5509 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
5510 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
5511
5512 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
5513 ext $h7.16b, $h7.16b, $h7.16b, #8
5514 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
5515 ext $h8.16b, $h8.16b, $h8.16b, #8
5516 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
5517
5518 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
5519 ext $h5.16b, $h5.16b, $h5.16b, #8
5520 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
5521 ext $h6.16b, $h6.16b, $h6.16b, #8
5522 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
5523 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
5524
5525 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
5526 eor $res0b, $res0b, $acc_lb @ PRE 1
5527
5528 rev64 $res3b, $res3b @ GHASH block 8k+3
5529 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
5530
5531 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
5532 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
5533 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
5534
5535 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
5536 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
5537 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
5538
5539 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
5540 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
5541 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
5542
5543 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
5544 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5545 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
5546
5547 rev64 $res6b, $res6b @ GHASH block 8k+6
5548 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
5549 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
5550
5551 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
5552 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
5553 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
5554
5555 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
5556 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
5557
5558 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
5559 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
5560 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
5561
5562 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
5563 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
5564 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
5565
5566 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
5567 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
5568 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
5569
5570 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
5571 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
5572 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
5573
5574 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
5575 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
5576 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
5577
5578 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
5579 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
5580 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
5581
5582 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
5583 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5584 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
5585
5586 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
5587 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
5588 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
5589
5590 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
5591 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
5592 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5593
5594 rev64 $res4b, $res4b @ GHASH block 8k+4
5595 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
5596 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
5597
5598 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
5599 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
5600 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
5601
5602 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5603 ext $h3.16b, $h3.16b, $h3.16b, #8
5604 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5605 ext $h4.16b, $h4.16b, $h4.16b, #8
5606 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
5607 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
5608
5609 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
5610 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
5611
5612 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
5613 rev64 $res7b, $res7b @ GHASH block 8k+7
5614 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5615
5616 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
5617 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
5618 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
5619
5620 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
5621 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
5622 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
5623
5624 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
5625 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5626 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
5627 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
5628
5629 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
5630 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
5631 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
5632
5633 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
5634 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
5635 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
5636 ext $h1.16b, $h1.16b, $h1.16b, #8
5637 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
5638 ext $h2.16b, $h2.16b, $h2.16b, #8
5639
5640 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
5641 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
5642 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
5643
5644 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
5645 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
5646
5647 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
5648 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
5649 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
5650
5651 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
5652 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
5653 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5654
5655 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
5656 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
5657 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
5658
5659 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5660 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
5661 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
5662
5663 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
5664 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
5665 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
5666
5667 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
5668 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
5669 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
5670
5671 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
5672 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5673 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
5674
5675 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
5676 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
5677 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
5678
5679 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
5680 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
5681 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
5682
5683 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
5684 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
5685 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
5686
5687 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
5688 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
5689 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
5690
5691 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
5692 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
5693 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
5694
5695 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
5696
5697 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
5698 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
5699 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
5700
5701 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
5702 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
5703
5704 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
5705 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
5706 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
5707
5708 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
5709 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
5710 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
5711
5712 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
5713 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
5714 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
5715
5716 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5717 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
5718 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
5719
5720 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
5721 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5722 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
5723
5724 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
5725 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
5726 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
5727
5728 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
5729 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
5730 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
5731
5732 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5733 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
5734 ldr $rk14q, [$cc, #224] @ load rk14
5735
5736 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
5737 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
5738 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
5739
5740 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
5741 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
5742 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5743
5744 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
5745 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
5746
5747 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
5748 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
5749 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
5750
5751 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
5752 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
5753 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
5754
5755 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
5756 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
5757 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
5758
5759 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
5760 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
5761 .L256_enc_tail: @ TAIL
5762
5763 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
5764 ext $h8.16b, $h8.16b, $h8.16b, #8
5765 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5766
5767 ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
5768
5769 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
5770 ext $h5.16b, $h5.16b, $h5.16b, #8
5771
5772 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5773 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
5774 ext $h6.16b, $h6.16b, $h6.16b, #8
5775 ext $h7.16b, $h7.16b, $h7.16b, #8
5776 mov $t1.16b, $rk14
5777
5778 cmp $main_end_input_ptr, #112
5779 eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
5780 b.gt .L256_enc_blocks_more_than_7
5781
5782 movi $acc_l.8b, #0
5783 mov $ctr7b, $ctr6b
5784 movi $acc_h.8b, #0
5785
5786 mov $ctr6b, $ctr5b
5787 mov $ctr5b, $ctr4b
5788 mov $ctr4b, $ctr3b
5789
5790 mov $ctr3b, $ctr2b
5791 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5792 mov $ctr2b, $ctr1b
5793
5794 movi $acc_m.8b, #0
5795 cmp $main_end_input_ptr, #96
5796 b.gt .L256_enc_blocks_more_than_6
5797
5798 mov $ctr7b, $ctr6b
5799 mov $ctr6b, $ctr5b
5800 cmp $main_end_input_ptr, #80
5801
5802 mov $ctr5b, $ctr4b
5803 mov $ctr4b, $ctr3b
5804 mov $ctr3b, $ctr1b
5805
5806 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5807 b.gt .L256_enc_blocks_more_than_5
5808
5809 mov $ctr7b, $ctr6b
5810 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5811
5812 mov $ctr6b, $ctr5b
5813 mov $ctr5b, $ctr4b
5814
5815 cmp $main_end_input_ptr, #64
5816 mov $ctr4b, $ctr1b
5817 b.gt .L256_enc_blocks_more_than_4
5818
5819 cmp $main_end_input_ptr, #48
5820 mov $ctr7b, $ctr6b
5821 mov $ctr6b, $ctr5b
5822
5823 mov $ctr5b, $ctr1b
5824 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5825 b.gt .L256_enc_blocks_more_than_3
5826
5827 cmp $main_end_input_ptr, #32
5828 mov $ctr7b, $ctr6b
5829 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5830
5831 mov $ctr6b, $ctr1b
5832 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5833 b.gt .L256_enc_blocks_more_than_2
5834
5835 mov $ctr7b, $ctr1b
5836
5837 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5838 cmp $main_end_input_ptr, #16
5839 b.gt .L256_enc_blocks_more_than_1
5840
5841 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5842 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
5843 b .L256_enc_blocks_less_than_1
5844 .L256_enc_blocks_more_than_7: @ blocks left > 7
5845 st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
5846
5847 rev64 $res0b, $res1b @ GHASH final-7 block
5848
5849 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5850
5851 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
5852
5853 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
5854 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
5855 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
5856
5857 movi $t0.8b, #0 @ suppress further partial tag feed in
5858
5859 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
5860 eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
5861
5862 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
5863 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
5864 .L256_enc_blocks_more_than_6: @ blocks left > 6
5865
5866 st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
5867
5868 rev64 $res0b, $res1b @ GHASH final-6 block
5869
5870 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5871
5872 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
5873 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
5874 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
5875
5876 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
5877
5878 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
5879
5880 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
5881
5882 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
5883 eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
5884
5885 movi $t0.8b, #0 @ suppress further partial tag feed in
5886
5887 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
5888 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
5889 .L256_enc_blocks_more_than_5: @ blocks left > 5
5890
5891 st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
5892
5893 rev64 $res0b, $res1b @ GHASH final-5 block
5894
5895 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5896
5897 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
5898
5899 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
5900
5901 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
5902 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
5903
5904 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
5905
5906 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
5907 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
5908
5909 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
5910 movi $t0.8b, #0 @ suppress further partial tag feed in
5911 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
5912
5913 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
5914 eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
5915 .L256_enc_blocks_more_than_4: @ blocks left > 4
5916
5917 st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
5918
5919 rev64 $res0b, $res1b @ GHASH final-4 block
5920
5921 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
5922
5923 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5924
5925 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
5926 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
5927
5928 eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
5929 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
5930
5931 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
5932 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
5933
5934 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
5935
5936 movi $t0.8b, #0 @ suppress further partial tag feed in
5937
5938 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
5939 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
5940 .L256_enc_blocks_more_than_3: @ blocks left > 3
5941
5942 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
5943
5944 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5945 ext $h4.16b, $h4.16b, $h4.16b, #8
5946 rev64 $res0b, $res1b @ GHASH final-3 block
5947
5948 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5949
5950 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
5951 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
5952
5953 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
5954 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5955 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
5956
5957 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
5958 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
5959
5960 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
5961 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
5962
5963 eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
5964 movi $t0.8b, #0 @ suppress further partial tag feed in
5965
5966 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
5967 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
5968 .L256_enc_blocks_more_than_2: @ blocks left > 2
5969
5970 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5971 ext $h3.16b, $h3.16b, $h3.16b, #8
5972
5973 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
5974
5975 rev64 $res0b, $res1b @ GHASH final-2 block
5976 ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
5977
5978 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5979
5980 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
5981
5982 movi $t0.8b, #0 @ suppress further partial tag feed in
5983
5984 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5985 eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
5986
5987 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5988
5989 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5990
5991 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
5992 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
5993
5994 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
5995 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
5996 .L256_enc_blocks_more_than_1: @ blocks left > 1
5997
5998 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
5999
6000 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
6001 ext $h2.16b, $h2.16b, $h2.16b, #8
6002 rev64 $res0b, $res1b @ GHASH final-1 block
6003 ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
6004
6005 eor $res0b, $res0b, $t0.16b @ feed in partial tag
6006 movi $t0.8b, #0 @ suppress further partial tag feed in
6007
6008 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
6009 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
6010
6011 eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
6012 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
6013
6014 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
6015 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
6016
6017 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
6018
6019 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
6020 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
6021
6022 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
6023
6024 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
6025 .L256_enc_blocks_less_than_1: @ blocks left <= 1
6026
6027 and $bit_length, $bit_length, #127 @ bit_length %= 128
6028
6029 sub $bit_length, $bit_length, #128 @ bit_length -= 128
6030
6031 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
6032
6033 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
6034 and $bit_length, $bit_length, #127 @ bit_length %= 128
6035
6036 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
6037 cmp $bit_length, #64
6038 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
6039
6040 csel $temp3_x, $temp0_x, xzr, lt
6041 csel $temp2_x, $temp1_x, $temp0_x, lt
6042
6043 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
6044 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
6045 ext $h1.16b, $h1.16b, $h1.16b, #8
6046
6047 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
6048 mov $ctr0.d[1], $temp3_x
6049
6050 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
6051
6052 rev64 $res0b, $res1b @ GHASH final block
6053
6054 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
6055 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
6056 str $rtmp_ctrq, [$counter] @ store the updated counter
6057
6058 eor $res0b, $res0b, $t0.16b @ feed in partial tag
6059 st1 { $res1b}, [$output_ptr] @ store all 16B
6060
6061 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
6062 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
6063 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
6064
6065 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
6066 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
6067
6068 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
6069
6070 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
6071
6072 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
6073 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
6074
6075 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
6076
6077 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
6078 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
6079
6080 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
6081
6082 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6083 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6084
6085 eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
6086 ext $acc_lb, $acc_lb, $acc_lb, #8
6087 rev64 $acc_lb, $acc_lb
6088 st1 { $acc_l.16b }, [$current_tag]
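6089 lsr x0, $bit_length, #3 @ return value = bit_length / 8 (NOTE: bit_length was overwritten by the last-block mask computation above, so this is not the input size - confirm callers ignore it)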
6090
6091 ldp d10, d11, [sp, #16]
6092 ldp d12, d13, [sp, #32]
6093 ldp d14, d15, [sp, #48]
6094 ldp d8, d9, [sp], #80
6095 ret
6096
6097 .L256_enc_ret:
6098 mov w0, #0x0
6099 ret
6100 .size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6101 ___
6102
6103 {
6104 #########################################################################################
6105 # size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
6106 # size_t len, /* length in bits */
6107 # unsigned char *out,
6108 # u64 *Xi,
6109 # unsigned char ivec[16],
6110 # const void *key);
6111 #
6112 $code.=<<___;
6113 .global unroll8_eor3_aes_gcm_dec_256_kernel
6114 .type unroll8_eor3_aes_gcm_dec_256_kernel,%function
6115 .align 4
6116 unroll8_eor3_aes_gcm_dec_256_kernel:
6117 AARCH64_VALID_CALL_TARGET
6118 cbz x1, .L256_dec_ret
6119 stp d8, d9, [sp, #-80]!
6120 mov $counter, x4
6121 mov $cc, x5
6122 stp d10, d11, [sp, #16]
6123 stp d12, d13, [sp, #32]
6124 stp d14, d15, [sp, #48]
6125 mov x5, #0xc200000000000000
6126 stp x5, xzr, [sp, #64]
6127 add $modulo_constant, sp, #64
6128
6129 ld1 { $ctr0b}, [$counter] @ CTR block 0
6130
6131 mov $constant_temp, #0x100000000 @ set up counter increment
6132 movi $rctr_inc.16b, #0x0
6133 mov $rctr_inc.d[1], $constant_temp
6134 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
6135
6136 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
6137
6138 rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
6139
6140 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
6141
6142 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
6143 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
6144
6145 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
6146 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
6147 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
6148
6149 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
6150 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
6151
6152 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
6153 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
6154
6155 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
6156
6157 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
6158 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
6159
6160 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
6161 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
6162
6163 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
6164 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
6165
6166 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
6167 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
6168
6169 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
6170 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
6171
6172 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
6173 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
6174 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
6175
6176 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
6177 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
6178 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
6179
6180 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
6181 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
6182 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
6183
6184 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
6185 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
6186
6187 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
6188 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
6189 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
6190
6191 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
6192 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
6193 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
6194
6195 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
6196 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
6197 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
6198
6199 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
6200 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
6201
6202 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
6203 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
6204
6205 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
6206 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
6207 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
6208
6209 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
6210
6211 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
6212 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
6213
6214 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
6215 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
6216 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
6217
6218 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
6219 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
6220 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
6221
6222 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
6223 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
6224
6225 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
6226 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
6227 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
6228
6229 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
6230
6231 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
6232 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
6233
6234 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
6235
6236 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
6237 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
6238 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
6239
6240 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
6241 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
6242 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
6243
6244 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
6245 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
6246 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
6247
6248 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
6249 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
6250
6251 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
6252 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
6253 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
6254
6255 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
6256 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
6257 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
6258
6259 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
6260 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
6261 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
6262
6263 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
6264 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
6265 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
6266
6267 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
6268 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
6269 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
6270
6271 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
6272
6273 ld1 { $acc_lb}, [$current_tag]
6274 ext $acc_lb, $acc_lb, $acc_lb, #8
6275 rev64 $acc_lb, $acc_lb
6276 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
6277 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
6278 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
6279
6280 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
6281 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
6282
6283 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
6284 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
6285
6286 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
6287
6288 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
6289 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
6290
6291 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
6292 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
6293 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
6294
6295 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
6296 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
6297 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
6298
6299 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
6300 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
6301 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
6302
6303 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
6304 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
6305
6306 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
6307 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
6308 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
6309
6310 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
6311 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
6312 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
6313
6314 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
6315 ldr $rk14q, [$cc, #224] @ load rk14
6316
6317 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
6318 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
6319 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
6320
6321 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
6322 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
6323 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
6324
6325 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
6326 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
6327 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
6328
6329 aese $ctr5b, $rk13 @ AES block 5 - round 13
6330 aese $ctr1b, $rk13 @ AES block 1 - round 13
6331 aese $ctr2b, $rk13 @ AES block 2 - round 13
6332
6333 aese $ctr0b, $rk13 @ AES block 0 - round 13
6334 aese $ctr4b, $rk13 @ AES block 4 - round 13
6335 aese $ctr6b, $rk13 @ AES block 6 - round 13
6336
6337 aese $ctr3b, $rk13 @ AES block 3 - round 13
6338 aese $ctr7b, $rk13 @ AES block 7 - round 13
6339 b.ge .L256_dec_tail @ handle tail
6340
6341 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
6342
6343 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
6344
6345 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
6346
6347 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
6348 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
6349
6350 eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result
6351 eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result
6352 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
6353
6354 rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
6355 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
6356 eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result
6357
6358 eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result
6359
6360 eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result
6361 rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
6362 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
6363
6364 eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result
6365 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
6366
6367 rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
6368 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
6369
6370 eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result
6371
6372 rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
6373 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
6374 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
6375
6376 eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result
6377 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
6378
6379 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
6380 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
6381 b.ge .L256_dec_prepretail @ do prepretail
6382
6383 .L256_dec_main_loop: @ main loop start - per pass: GHASH the 8 ciphertext blocks held in res0-res7 (8k..8k+7), run AES rounds on ctr0-ctr7 (8k+8..8k+15), then load, decrypt and store the next 8 blocks
6384 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
6385 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
6386 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
6387
6388 rev64 $res1b, $res1b @ GHASH block 8k+1
6389 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
6390 ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
6391 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
6392 ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
6393
6394 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
6395 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
6396 rev64 $res0b, $res0b @ GHASH block 8k
6397
6398 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap the 64-bit halves of acc_l
6399 rev64 $res4b, $res4b @ GHASH block 8k+4
6400 rev64 $res3b, $res3b @ GHASH block 8k+3
6401
6402 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
6403 rev64 $res7b, $res7b @ GHASH block 8k+7
6404
6405 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
6406 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
6407 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
6408
6409 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
6410 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
6411 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
6412
6413 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
6414 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
6415 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
6416
6417 eor $res0b, $res0b, $acc_lb @ PRE 1 - XOR the value carried in acc_l into block 8k
6418 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
6419 ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
6420 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
6421 ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
6422 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
6423
6424 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
6425 rev64 $res2b, $res2b @ GHASH block 8k+2
6426 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
6427
6428 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
6429 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
6430 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
6431
6432 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6433 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
6434 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
6435
6436 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
6437 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
6438 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
6439
6440 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
6441 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
6442 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
6443
6444 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
6445 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
6446 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
6447
6448 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
6449 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
6450 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
6451
6452 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
6453 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
6454 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low (product overwrites h7)
6455
6456 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
6457 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
6458 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
6459
6460 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
6461 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
6462 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6463
6464 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
6465 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
6466 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
6467
6468 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
6469 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
6470 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
6471
6472 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
6473 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
6474 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
6475
6476 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
6477 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
6478 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
6479
6480 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
6481 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
6482 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
6483 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low (product overwrites h6)
6484
6485 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
6486 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
6487 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
6488
6489 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
6490 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
6491 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
6492
6493 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
6494 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
6495 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
6496
6497 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
6498 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6499 rev64 $res5b, $res5b @ GHASH block 8k+5
6500
6501 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
6502 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
6503 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6504
6505 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
6506 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
6507 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
6508
6509 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6510 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
6511 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
6512
6513 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6514 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low (product overwrites h5)
6515 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
6516
6517 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
6518 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
6519 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
6520
6521 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
6522 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
6523 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
6524
6525 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
6526 ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap the 64-bit halves of h3
6527 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
6528 ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap the 64-bit halves of h4
6529 rev64 $res6b, $res6b @ GHASH block 8k+6
6530 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
6531
6532 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
6533 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
6534 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
6535
6536 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
6537 ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap the 64-bit halves of h1
6538 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
6539 ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap the 64-bit halves of h2
6540 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6541 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
6542
6543 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
6544 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
6545 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
6546
6547 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
6548 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
6549 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
6550 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
6551
6552 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
6553 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
6554 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6555
6556 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
6557 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
6558 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
6559
6560 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
6561 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
6562 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
6563
6564 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
6565 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
6566 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
6567
6568 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6569 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
6570 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
6571
6572 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
6573 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
6574 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6575
6576 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
6577 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
6578 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
6579
6580 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
6581 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6582 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
6583
6584 ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
6585 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6586 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
6587
6588 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
6589 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
6590 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
6591
6592 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
6593 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
6594 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
6595
6596 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
6597 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
6598 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
6599
6600 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
6601 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
6602 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
6603
6604 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
6605 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6606 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
6607
6608 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
6609 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
6610 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
6611
6612 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
6613 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
6614 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
6615
6616 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
6617 rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 - staged in h1, copied to ctr0 after the store below
6618 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
6619
6620 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
6621 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
6622 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
6623
6624 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
6625 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
6626
6627 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6628 rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 - staged in h2, copied to ctr1 after the store below
6629 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
6630
6631 ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
6632 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
6633 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
6634
6635 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
6636 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
6637 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
6638
6639 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
6640 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
6641 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
6642
6643 rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 - staged in h3, copied to ctr2 after the store below
6644 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
6645 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
6646
6647 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
6648 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
6649 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
6650
6651 ldr $rk14q, [$cc, #224] @ load rk14
6652 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
6653 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
6654
6655 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
6656 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
6657 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
6658
6659 ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
6660 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
6661 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
6662
6663 ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
6664 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
6665 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
6666
6667 rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 - staged in h4, copied to ctr3 after the store below
6668 eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result
6669 eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result
6670
6671 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6672 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
6673
6674 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
6675 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6676 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
6677
6678 eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result
6679 eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result
6680 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
6681
6682 stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
6683 mov $ctr0.16b, $h1.16b @ CTR block 8k+16
6684 eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result
6685
6686 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
6687 eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result
6688 stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
6689
6690 mov $ctr3.16b, $h4.16b @ CTR block 8k+19
6691 mov $ctr2.16b, $h3.16b @ CTR block 8k+18
6692 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
6693
6694 mov $ctr1.16b, $h2.16b @ CTR block 8k+17
6695 stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
6696 eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result
6697
6698 eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result
6699 rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
6700 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
6701
6702 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL - flags consumed by the b.lt below
6703 stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
6704 b.lt .L256_dec_main_loop @ iterate while input_ptr < main_end_input_ptr, else fall through to PREPRETAIL
6705
6706 .L256_dec_prepretail: @ PREPRETAIL - GHASH the 8 ciphertext blocks held in res0-res7 and run AES rounds 0-13 on ctr0-ctr7; no input is loaded and no output is stored here, execution falls through to TAIL
6707 ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
6708 rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
6709 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
6710
6711 rev64 $res4b, $res4b @ GHASH block 8k+4
6712 ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
6713 ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
6714
6715 rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
6716 rev64 $res0b, $res0b @ GHASH block 8k
6717 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
6718
6719 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap the 64-bit halves of acc_l
6720 ldr $h7q, [$current_tag, #176] @ load h7l | h7h
6721 ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
6722 ldr $h8q, [$current_tag, #208] @ load h8l | h8h
6723 ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
6724 rev64 $res1b, $res1b @ GHASH block 8k+1
6725
6726 rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
6727 rev64 $res2b, $res2b @ GHASH block 8k+2
6728 ldr $h5q, [$current_tag, #128] @ load h5l | h5h
6729 ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
6730 ldr $h6q, [$current_tag, #160] @ load h6l | h6h
6731 ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
6732
6733 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
6734 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
6735 aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
6736
6737 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
6738 aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
6739 aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
6740
6741 aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
6742 aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
6743 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
6744
6745 ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
6746 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
6747 eor $res0b, $res0b, $acc_lb @ PRE 1 - XOR the value carried in acc_l into block 8k
6748
6749 aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
6750 aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
6751 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
6752
6753 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
6754 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
6755 aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
6756
6757 pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
6758 trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6759 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
6760
6761 rev64 $res3b, $res3b @ GHASH block 8k+3
6762 pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low (product overwrites h7)
6763
6764 aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
6765 aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
6766 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
6767
6768 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
6769 aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
6770 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
6771
6772 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
6773 aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
6774
6775 aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
6776 rev64 $res6b, $res6b @ GHASH block 8k+6
6777
6778 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
6779 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
6780 aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
6781
6782 pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
6783 trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
6784 aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
6785
6786 ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
6787 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
6788 pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
6789
6790 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
6791 eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
6792 eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
6793
6794 aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
6795 pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low (product overwrites h6)
6796 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
6797
6798 eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
6799 trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6800 trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
6801
6802 pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
6803 pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low (product overwrites h5)
6804 eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
6805
6806 pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
6807 aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
6808 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
6809
6810 eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
6811 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
6812 ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap the 64-bit halves of h1
6813 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
6814 ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap the 64-bit halves of h2
6815 aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
6816
6817 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
6818 aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
6819 eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
6820
6821 eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6822 aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
6823 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
6824
6825 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
6826 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
6827 aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
6828
6829 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
6830 pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
6831 aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
6832
6833 aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
6834 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
6835 pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
6836
6837 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
6838 aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
6839 ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
6840
6841 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
6842 ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap the 64-bit halves of h3
6843 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
6844 ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap the 64-bit halves of h4
6845 rev64 $res7b, $res7b @ GHASH block 8k+7
6846 rev64 $res5b, $res5b @ GHASH block 8k+5
6847
6848 eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
6849
6850 trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6851
6852 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
6853 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
6854 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
6855 aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
6856
6857 aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
6858 aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
6859
6860 pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
6861 pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
6862 pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
6863
6864 trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
6865 pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
6866 trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6867
6868 aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
6869 pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
6870 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
6871
6872 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
6873 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
6874 aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
6875
6876 ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
6877 pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
6878 aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
6879
6880 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
6881 aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
6882
6883 aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
6884 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
6885 eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
6886
6887 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
6888 trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
6889 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
6890
6891 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
6892 aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
6893 aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
6894
6895 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
6896 aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
6897 aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
6898
6899 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
6900 aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
6901 eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6902
6903 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
6904 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
6905 eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6906
6907 aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
6908 aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
6909 pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
6910
6911 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
6912 pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
6913 pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
6914
6915 pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
6916 pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
6917 pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
6918
6919 ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
6920 eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
6921 eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
6922
6923 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
6924 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
6925 aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
6926
6927 eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
6928 eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
6929 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
6930
6931 eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
6932
6933 aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
6934 aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
6935 aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
6936
6937 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
6938 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
6939 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
6940
6941 eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
6942
6943 aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
6944 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
6945 ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
6946
6947 ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
6948
6949 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
6950 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
6951 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
6952
6953 pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
6954 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
6955
6956 aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
6957 aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
6958 aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
6959
6960 aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
6961 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
6962
6963 eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
6964
6965 aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
6966 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
6967 aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
6968
6969 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6970 aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
6971 aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
6972
6973 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
6974 ldr $rk14q, [$cc, #224] @ load rk14 - not applied in PREPRETAIL, left loaded for the code that follows
6975 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
6976
6977 aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
6978 ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6979 aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
6980
6981 aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
6982 aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
6983 aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
6984
6985 aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
6986 eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
6987 add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
6988
6989 aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
6990 aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
6991 .L256_dec_tail: @ TAIL
6992
6993 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
6994 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
6995 cmp $main_end_input_ptr, #112
6996
6997 ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
6998
6999 ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
7000 ext $h8.16b, $h8.16b, $h8.16b, #8
7001 mov $t1.16b, $rk14
7002
7003 ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
7004 ext $h5.16b, $h5.16b, $h5.16b, #8
7005
7006 eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
7007 ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
7008 ext $h6.16b, $h6.16b, $h6.16b, #8
7009 ext $h7.16b, $h7.16b, $h7.16b, #8
7010 b.gt .L256_dec_blocks_more_than_7
7011
7012 mov $ctr7b, $ctr6b
7013 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7014 mov $ctr6b, $ctr5b
7015
7016 mov $ctr5b, $ctr4b
7017 mov $ctr4b, $ctr3b
7018 movi $acc_l.8b, #0
7019
7020 movi $acc_h.8b, #0
7021 movi $acc_m.8b, #0
7022 mov $ctr3b, $ctr2b
7023
7024 cmp $main_end_input_ptr, #96
7025 mov $ctr2b, $ctr1b
7026 b.gt .L256_dec_blocks_more_than_6
7027
7028 mov $ctr7b, $ctr6b
7029 mov $ctr6b, $ctr5b
7030
7031 mov $ctr5b, $ctr4b
7032 cmp $main_end_input_ptr, #80
7033 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7034
7035 mov $ctr4b, $ctr3b
7036 mov $ctr3b, $ctr1b
7037 b.gt .L256_dec_blocks_more_than_5
7038
7039 cmp $main_end_input_ptr, #64
7040 mov $ctr7b, $ctr6b
7041 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7042
7043 mov $ctr6b, $ctr5b
7044
7045 mov $ctr5b, $ctr4b
7046 mov $ctr4b, $ctr1b
7047 b.gt .L256_dec_blocks_more_than_4
7048
7049 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7050 mov $ctr7b, $ctr6b
7051 cmp $main_end_input_ptr, #48
7052
7053 mov $ctr6b, $ctr5b
7054 mov $ctr5b, $ctr1b
7055 b.gt .L256_dec_blocks_more_than_3
7056
7057 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
7058 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7059 mov $ctr7b, $ctr6b
7060
7061 cmp $main_end_input_ptr, #32
7062 mov $ctr6b, $ctr1b
7063 b.gt .L256_dec_blocks_more_than_2
7064
7065 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7066
7067 mov $ctr7b, $ctr1b
7068 cmp $main_end_input_ptr, #16
7069 b.gt .L256_dec_blocks_more_than_1
7070
7071 sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7072 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
7073 b .L256_dec_blocks_less_than_1
7074 .L256_dec_blocks_more_than_7: @ blocks left > 7
7075 rev64 $res0b, $res1b @ GHASH final-7 block
7076 ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
7077 st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
7078
7079 ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
7080
7081 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7082
7083 ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
7084 eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
7085
7086 pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
7087
7088 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
7089 movi $t0.8b, #0 @ supress further partial tag feed in
7090
7091 pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
7092 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
7093 .L256_dec_blocks_more_than_6: @ blocks left > 6
7094
7095 rev64 $res0b, $res1b @ GHASH final-6 block
7096
7097 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7098 ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
7099 movi $t0.8b, #0 @ supress further partial tag feed in
7100
7101 ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
7102 st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
7103 pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
7104
7105 pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
7106
7107 eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
7108 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
7109 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
7110
7111 pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
7112
7113 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
7114 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
7115 .L256_dec_blocks_more_than_5: @ blocks left > 5
7116
7117 rev64 $res0b, $res1b @ GHASH final-5 block
7118
7119 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7120
7121 pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
7122 ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
7123
7124 ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
7125
7126 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
7127 st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
7128
7129 pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
7130 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
7131
7132 pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
7133
7134 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
7135 eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
7136 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
7137
7138 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
7139 movi $t0.8b, #0 @ supress further partial tag feed in
7140 .L256_dec_blocks_more_than_4: @ blocks left > 4
7141
7142 rev64 $res0b, $res1b @ GHASH final-4 block
7143
7144 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7145
7146 ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
7147 ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
7148
7149 movi $t0.8b, #0 @ supress further partial tag feed in
7150
7151 pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
7152 pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
7153
7154 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
7155
7156 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
7157
7158 pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
7159
7160 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
7161 st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
7162
7163 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
7164 eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
7165 .L256_dec_blocks_more_than_3: @ blocks left > 3
7166
7167 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
7168 ext $h4.16b, $h4.16b, $h4.16b, #8
7169 rev64 $res0b, $res1b @ GHASH final-3 block
7170
7171 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7172 ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
7173 ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
7174
7175 ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
7176 st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
7177
7178 eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
7179
7180 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
7181
7182 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
7183 pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
7184 pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
7185
7186 movi $t0.8b, #0 @ supress further partial tag feed in
7187 pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
7188 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
7189
7190 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
7191
7192 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
7193 .L256_dec_blocks_more_than_2: @ blocks left > 2
7194
7195 rev64 $res0b, $res1b @ GHASH final-2 block
7196
7197 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
7198 ext $h3.16b, $h3.16b, $h3.16b, #8
7199 ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
7200
7201 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7202
7203 ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
7204
7205 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
7206 st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
7207 eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
7208
7209 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
7210 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
7211 movi $t0.8b, #0 @ supress further partial tag feed in
7212
7213 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
7214 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
7215
7216 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
7217 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
7218 .L256_dec_blocks_more_than_1: @ blocks left > 1
7219
7220 rev64 $res0b, $res1b @ GHASH final-1 block
7221
7222 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7223
7224 ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
7225 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
7226 ext $h2.16b, $h2.16b, $h2.16b, #8
7227
7228 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
7229 ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
7230 st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
7231
7232 ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
7233 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
7234
7235 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
7236
7237 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
7238
7239 eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
7240 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
7241
7242 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
7243
7244 movi $t0.8b, #0 @ supress further partial tag feed in
7245 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
7246
7247 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
7248 .L256_dec_blocks_less_than_1: @ blocks left <= 1
7249
7250 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
7251 mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
7252 and $bit_length, $bit_length, #127 @ bit_length %= 128
7253
7254 sub $bit_length, $bit_length, #128 @ bit_length -= 128
7255 rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
7256 str $rtmp_ctrq, [$counter] @ store the updated counter
7257
7258 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
7259
7260 and $bit_length, $bit_length, #127 @ bit_length %= 128
7261
7262 lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
7263 cmp $bit_length, #64
7264 mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
7265
7266 csel $temp3_x, $temp0_x, xzr, lt
7267 csel $temp2_x, $temp1_x, $temp0_x, lt
7268
7269 mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
7270 mov $ctr0.d[1], $temp3_x
7271
7272 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
7273 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
7274 ext $h1.16b, $h1.16b, $h1.16b, #8
7275 bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
7276
7277 rev64 $res0b, $res1b @ GHASH final block
7278
7279 eor $res0b, $res0b, $t0.16b @ feed in partial tag
7280
7281 ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
7282 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
7283
7284 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
7285
7286 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
7287 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
7288
7289 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
7290
7291 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
7292 ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
7293 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
7294
7295 pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
7296 eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
7297
7298 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
7299 st1 { $res4b}, [$output_ptr] @ store all 16B
7300
7301 eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
7302
7303 eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid
7304 eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid
7305
7306 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
7307
7308 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
7309 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
7310
7311 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
7312 ext $acc_lb, $acc_lb, $acc_lb, #8
7313 rev64 $acc_lb, $acc_lb
7314 st1 { $acc_l.16b }, [$current_tag]
7315 lsr x0, $bit_length, #3 @ return sizes
7316
7317 ldp d10, d11, [sp, #16]
7318 ldp d12, d13, [sp, #32]
7319 ldp d14, d15, [sp, #48]
7320 ldp d8, d9, [sp], #80
7321 ret
7322
7323 .L256_dec_ret:
7324 mov w0, #0x0
7325 ret
7326 .size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
7327 ___
7328 }
7329 }
7330
# Trailer of the generated assembly: module identification string, a final
# alignment directive and a closing #endif (the matching conditional is
# presumably emitted earlier in the generated output -- not visible here).
$code .= join("\n",
    ".asciz \"AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>\"",
    ".align 2",
    "#endif",
    "");    # empty last element supplies the terminating newline
7336
# Post-processing pass: walk the perlasm text accumulated in $code line by
# line, rewrite it into final assembler syntax and print it to STDOUT.
{
    # Base opcodes of the SHA3-extension instructions that unsha3() encodes
    # by hand; the register/immediate fields are OR-ed into these values.
    my %opcode = (
        "rax1" => 0xce608c00,    "eor3" => 0xce000000,
        "bcax" => 0xce200000,    "xar"  => 0xce800000 );

    # unsha3($mnemonic, $arg)
    #
    # Turn one SHA3-extension instruction into a raw ".inst" word, keeping
    # the original text as a trailing "//" comment (presumably so that
    # assemblers without support for these mnemonics can still build the
    # output).
    #
    #   $mnemonic - one of the keys of %opcode
    #   $arg      - operand text following the mnemonic (may include a
    #               trailing comment; it is echoed verbatim)
    #
    # Field layout used: first register in bits 0+, second shifted left by
    # 5, third by 16, and the optional fourth operand (register number or
    # "#imm" expression) by 10.
    #
    # Returns the ".inst" line.  If the operands cannot be parsed, the
    # instruction is returned unchanged ("$mnemonic\t$arg") instead of a
    # false value, so that the caller's s///e does not silently delete the
    # instruction from the generated assembly.
    sub unsha3 {
        my ($mnemonic, $arg) = @_;

        if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/) {
            # Copy the captures before doing anything else with them.
            my ($rd, $rn, $rm, $last) = ($1, $2, $3, $4);

            # Third and fourth operands are optional; absent fields encode as 0.
            $rm = 0 unless defined $rm;

            # The fourth operand may be a small arithmetic expression made of
            # digits and '-' only (the regex admits nothing else), hence the
            # string eval.
            my $field = defined($last) ? eval($last) : undef;
            $field = 0 unless defined $field;

            return sprintf ".inst\t0x%08x\t//%s %s",
                $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($field<<10),
                $mnemonic, $arg;
        }

        # Unparseable operands: leave the instruction for the assembler to
        # accept or reject, rather than dropping it.
        return "$mnemonic\t$arg";
    }

    # unvmov($arg)
    #
    # Translate a "qN#lo|hi, qM#lo|hi" operand pair into an "ins" between
    # 64-bit lanes of v registers: "lo" selects d[0], "hi" selects d[1], and
    # register numbers 8 and above are shifted up by 8.  Yields a false
    # value when $arg does not have that shape.  Not called from the loop
    # below.
    sub unvmov {
        my $arg = shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins v%d.d[%d],v%d.d[%d]", $1<8?$1:$1+8, ($2 eq "lo")?0:1,
                                           $3<8?$3:$3+8, ($4 eq "lo")?0:1;
    }

    foreach (split("\n", $code)) {
        s{@\s}{//};                        # old->new style commentary
        s/\`([^\`]*)\`/eval($1)/ge;        # expand `...` embedded Perl expressions

        # On ld1r lines the .16b arrangement is rewritten to .2d.  Every
        # line on which that rewrite did not happen is scanned for
        # SHA3-extension instructions with a v-register first operand,
        # which are replaced by their hand-encoded form.
        unless (m/\bld1r\b/ and s/\.16b/.2d/g) {
            s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
        }
        print $_, "\n";
    }
}
7368
# Close STDOUT explicitly to enforce a flush of the generated output, and
# abort with the system error if that fails.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}