2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright 2021- IBM Inc. All rights reserved
5 # Licensed under the Apache License 2.0 (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
10 #===================================================================================
11 # Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
13 # GHASH is based on the Karatsuba multiplication method.
17 # X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
18 # (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
19 # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
20 # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
21 # (X4.h * H.h + X4.l * H.l + X4 * H)
25 # Hash keys = v3 - v14
27 # ( H^2.l, H^2, H^2.h)
28 # ( H^3.l, H^3, H^3.h)
29 # ( H^4.l, H^4, H^4.h)
35 # vs0 - vs14 for round keys
36 # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
38 # This implementation uses stitched AES-GCM approach to improve overall performance.
39 # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
41 # Current large block (16384 bytes) performance per second with 128 bit key --
44 # Power10[le] (3.5GHz) 5.32G 5.26G
46 # ===================================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
53 if ($flavour =~ /64/) {
61 } elsif ($flavour =~ /32/) {
69 } else { die "nonsense $flavour"; }
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

# Locate the ppc-xlate.pl translator: first next to this script, then in the
# shared perlasm directory; pipe our generated code through it so the assembly
# is rendered for the requested flavour (linux64le, aix64, ...).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
88 # v15 - v18 - input states
89 # vs1 - vs9 - round keys
91 .macro Loop_aes_middle4x
150 # v15 - v22 - input states
151 # vs1 - vs9 - round keys
153 .macro Loop_aes_middle8x
248 # Compute 4x hash values based on Karatsuba method.
255 vpmsumd
23, 12, 15 # H4.L * X.L
264 vpmsumd
24, 13, 15 # H4.L * X.H + H4.H * X.L
265 vpmsumd
25, 10, 16 # H3.L * X1.H + H3.H * X1.L
273 # sum hash and reduction with H Poly
274 vpmsumd
28, 23, 2 # reduction
277 vsldoi
26, 24, 29, 8 # mL
278 vsldoi
29, 29, 24, 8 # mH
279 vxor
23, 23, 26 # mL + L
281 vsldoi
23, 23, 23, 8 # swap
284 vpmsumd
24, 14, 15 # H4.H * X.H
295 # sum hash and reduction with H Poly
296 vsldoi
27, 23, 23, 8 # swap
301 xxlor
32, 23+32, 23+32 # update hash
306 # Combine two 4x ghash
307 # v15 - v22 - input blocks
309 .macro ppc_aes_gcm_ghash2_4x
311 vxor
15, 15, 0 # Xi + X
315 vpmsumd
23, 12, 15 # H4.L * X.L
324 vpmsumd
24, 13, 15 # H4.L * X.H + H4.H * X.L
325 vpmsumd
25, 10, 16 # H3.L * X1.H + H3.H * X1.L
332 # sum hash and reduction with H Poly
333 vpmsumd
28, 23, 2 # reduction
338 vsldoi
26, 24, 29, 8 # mL
339 vsldoi
29, 29, 24, 8 # mH
340 vxor
23, 23, 26 # mL + L
342 vsldoi
23, 23, 23, 8 # swap
345 vpmsumd
24, 14, 15 # H4.H * X.H
354 vxor
24, 24, 29 # H + mH
356 # sum hash and reduction with H Poly
357 vsldoi
27, 23, 23, 8 # swap
360 vxor
27, 23, 27 # 1st Xi
366 vxor
19, 19, 27 # Xi + X
367 vpmsumd
23, 12, 19 # H4.L * X.L
373 vpmsumd
24, 13, 19 # H4.L * X.H + H4.H * X.L
374 vpmsumd
25, 10, 20 # H3.L * X1.H + H3.H * X1.L
381 # sum hash and reduction with H Poly
382 vpmsumd
28, 23, 2 # reduction
387 vsldoi
26, 24, 29, 8 # mL
388 vsldoi
29, 29, 24, 8 # mH
389 vxor
23, 23, 26 # mL + L
391 vsldoi
23, 23, 23, 8 # swap
394 vpmsumd
24, 14, 19 # H4.H * X.H
403 vxor
24, 24, 29 # H + mH
405 # sum hash and reduction with H Poly
406 vsldoi
27, 23, 23, 8 # swap
411 xxlor
32, 23+32, 23+32 # update hash
416 # Compute update single hash
418 .macro ppc_update_hash_1x
423 vpmsumd
22, 3, 28 # L
424 vpmsumd
23, 4, 28 # M
425 vpmsumd
24, 5, 28 # H
427 vpmsumd
27, 22, 2 # reduction
429 vsldoi
25, 23, 19, 8 # mL
430 vsldoi
26, 19, 23, 8 # mH
431 vxor
22, 22, 25 # LL + LL
432 vxor
24, 24, 26 # HH + HH
434 vsldoi
22, 22, 22, 8 # swap
437 vsldoi
20, 22, 22, 8 # swap
438 vpmsumd
22, 22, 2 # reduction
442 vmr
0, 22 # update hash
447 # ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
448 # const AES_KEY *key, unsigned char iv[16],
454 # r6 - AES round keys
456 # r8 - Xi, HPoli, hash keys
458 .global ppc_aes_gcm_encrypt
461 _ppc_aes_gcm_encrypt
:
501 lxvb16x
32, 0, 8 # load Xi
503 # load Hash - h^4, h^3, h^2, h
505 lxvd2x
2+32, 10, 8 # H Poli
507 lxvd2x
3+32, 10, 8 # Hl
509 lxvd2x
4+32, 10, 8 # H
511 lxvd2x
5+32, 10, 8 # Hh
514 lxvd2x
6+32, 10, 8 # H^2l
516 lxvd2x
7+32, 10, 8 # H^2
518 lxvd2x
8+32, 10, 8 # H^2h
521 lxvd2x
9+32, 10, 8 # H^3l
523 lxvd2x
10+32, 10, 8 # H^3
525 lxvd2x
11+32, 10, 8 # H^3h
528 lxvd2x
12+32, 10, 8 # H^4l
530 lxvd2x
13+32, 10, 8 # H^4
532 lxvd2x
14+32, 10, 8 # H^4h
534 # initialize ICB: GHASH( IV ), IV - r7
535 lxvb16x
30+32, 0, 7 # load IV - v30
538 li
11, 0 # block index
543 vsldoi
31, 31, 22,1 # counter 1
545 # load round key to VSR
558 # load rounds - 10 (128), 12 (192), 14 (256)
562 # vxor state, state, w # addroundkey
564 vxor
15, 30, 29 # IV + round key - add round key 0
569 # load 2 more round keys (v11, v12)
576 # load 2 more round keys (v11, v12, v13, v14)
591 divdu
10, 5, 10 # n 128 bytes-blocks
595 vaddudm
30, 30, 31 # IV + counter
624 lxvb16x
15, 0, 14 # load block
625 lxvb16x
16, 15, 14 # load block
626 lxvb16x
17, 16, 14 # load block
627 lxvb16x
18, 17, 14 # load block
628 lxvb16x
19, 18, 14 # load block
629 lxvb16x
20, 19, 14 # load block
630 lxvb16x
21, 20, 14 # load block
631 lxvb16x
22, 21, 14 # load block
698 vcipherlast
15, 15, 23
699 vcipherlast
16, 16, 23
702 stxvb16x
47, 0, 9 # store output
704 stxvb16x
48, 15, 9 # store output
706 vcipherlast
17, 17, 23
707 vcipherlast
18, 18, 23
710 stxvb16x
49, 16, 9 # store output
712 stxvb16x
50, 17, 9 # store output
714 vcipherlast
19, 19, 23
715 vcipherlast
20, 20, 23
718 stxvb16x
51, 18, 9 # store output
720 stxvb16x
52, 19, 9 # store output
722 vcipherlast
21, 21, 23
723 vcipherlast
22, 22, 23
726 stxvb16x
53, 20, 9 # store output
728 stxvb16x
54, 21, 9 # store output
733 ppc_aes_gcm_ghash2_4x
736 vaddudm
30, 30, 31 # IV + counter
738 vxor
15, 30, 27 # add round key
765 # loop last few blocks
776 .macro Loop_aes_middle_1x
802 lxvb16x
15, 0, 14 # load block
834 vcipherlast
15, 15, 23
837 stxvb16x
47, 0, 9 # store output
847 vaddudm
30, 30, 31 # IV + counter
848 vxor
15, 30, 19 # add round key
886 vcipherlast
15, 15, 23
888 lxvb16x
15, 0, 14 # load last block
891 # create partial block mask
893 sub 15, 15, 12 # index to the mask
895 vspltisb
16, -1 # first 16 bytes - 0xffff...ff
896 vspltisb
17, 0 # second 16 bytes - 0x0000...00
903 lxvb16x
16, 15, 10 # load partial block mask
909 # * should store only the remaining bytes.
910 bl Write_partial_block
915 # Write partial block
917 # r12 - remaining bytes
918 # v15 - partial input data
922 stxvb16x
15+32, 10, 1 # last block
924 #add 10, 9, 11 # Output
928 mtctr
12 # remaining bytes
939 stxvb16x
32, 0, 8 # write out Xi
940 add
3, 11, 12 # return count
984 .global ppc_aes_gcm_decrypt
987 _ppc_aes_gcm_decrypt
:
1027 lxvb16x
32, 0, 8 # load Xi
1029 # load Hash - h^4, h^3, h^2, h
1031 lxvd2x
2+32, 10, 8 # H Poli
1033 lxvd2x
3+32, 10, 8 # Hl
1035 lxvd2x
4+32, 10, 8 # H
1037 lxvd2x
5+32, 10, 8 # Hh
1040 lxvd2x
6+32, 10, 8 # H^2l
1042 lxvd2x
7+32, 10, 8 # H^2
1044 lxvd2x
8+32, 10, 8 # H^2h
1047 lxvd2x
9+32, 10, 8 # H^3l
1049 lxvd2x
10+32, 10, 8 # H^3
1051 lxvd2x
11+32, 10, 8 # H^3h
1054 lxvd2x
12+32, 10, 8 # H^4l
1056 lxvd2x
13+32, 10, 8 # H^4
1058 lxvd2x
14+32, 10, 8 # H^4h
1060 # initialize ICB: GHASH( IV ), IV - r7
1061 lxvb16x
30+32, 0, 7 # load IV - v30
1064 li
11, 0 # block index
1069 vsldoi
31, 31, 22,1 # counter 1
1071 # load round key to VSR
1084 # load rounds - 10 (128), 12 (192), 14 (256)
1088 # vxor state, state, w # addroundkey
1090 vxor
15, 30, 29 # IV + round key - add round key 0
1093 beq Loop_aes_gcm_8x_dec
1095 # load 2 more round keys (v11, v12)
1100 beq Loop_aes_gcm_8x_dec
1102 # load 2 more round keys (v11, v12, v13, v14)
1106 beq Loop_aes_gcm_8x_dec
1111 Loop_aes_gcm_8x_dec
:
1117 divdu
10, 5, 10 # n 128 bytes-blocks
1119 beq Loop_last_block_dec
1121 vaddudm
30, 30, 31 # IV + counter
1150 lxvb16x
15, 0, 14 # load block
1151 lxvb16x
16, 15, 14 # load block
1152 lxvb16x
17, 16, 14 # load block
1153 lxvb16x
18, 17, 14 # load block
1154 lxvb16x
19, 18, 14 # load block
1155 lxvb16x
20, 19, 14 # load block
1156 lxvb16x
21, 20, 14 # load block
1157 lxvb16x
22, 21, 14 # load block
1224 vcipherlast
15, 15, 23
1225 vcipherlast
16, 16, 23
1228 stxvb16x
47, 0, 9 # store output
1230 stxvb16x
48, 15, 9 # store output
1232 vcipherlast
17, 17, 23
1233 vcipherlast
18, 18, 23
1236 stxvb16x
49, 16, 9 # store output
1238 stxvb16x
50, 17, 9 # store output
1240 vcipherlast
19, 19, 23
1241 vcipherlast
20, 20, 23
1244 stxvb16x
51, 18, 9 # store output
1246 stxvb16x
52, 19, 9 # store output
1248 vcipherlast
21, 21, 23
1249 vcipherlast
22, 22, 23
1252 stxvb16x
53, 20, 9 # store output
1254 stxvb16x
54, 21, 9 # store output
1268 ppc_aes_gcm_ghash2_4x
1271 vaddudm
30, 30, 31 # IV + counter
1273 vxor
15, 30, 27 # add round key
1291 bdnz Loop_8x_block_dec
1295 Loop_last_block_dec
:
1299 # loop last few blocks
1311 lxvb16x
15, 0, 14 # load block
1343 vcipherlast
15, 15, 23
1346 stxvb16x
47, 0, 9 # store output
1356 vaddudm
30, 30, 31 # IV + counter
1357 vxor
15, 30, 19 # add round key
1359 bdnz Next_rem_block_dec
1395 vcipherlast
15, 15, 23
1397 lxvb16x
15, 0, 14 # load block
1400 # create partial block mask
1402 sub 15, 15, 12 # index to the mask
1404 vspltisb
16, -1 # first 16 bytes - 0xffff...ff
1405 vspltisb
17, 0 # second 16 bytes - 0x0000...00
1412 lxvb16x
16, 15, 10 # load block mask
1418 # * should store only the remaining bytes.
1419 bl Write_partial_block
1426 foreach (split("\n",$code)) {
1427 s/\`([^\`]*)\`/eval $1/geo;
1429 if ($flavour =~ /le$/o) { # little-endian
close STDOUT or die "error closing STDOUT: $!";	# enforce flush