1#! /usr/bin/env perl
2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10######################################################################
11## Constant-time SSSE3 AES core implementation.
12## version 0.1
13##
14## By Mike Hamburg (Stanford University), 2009
15## Public domain.
16##
17## For details see http://shiftleft.org/papers/vector_aes/ and
18## http://crypto.stanford.edu/vpaes/.
19
20# CBC encrypt/decrypt performance in cycles per byte processed with
21# 128-bit key.
22#
23# aes-ppc.pl this
24# G4e 35.5/52.1/(23.8) 11.9(*)/15.4
25# POWER6 42.7/54.3/(28.2) 63.0/92.8(**)
26# POWER7 32.3/42.9/(18.4) 18.5/23.3
27#
28# (*) This is ~10% worse than reported in the paper. The reason is
29# twofold. First, this module makes no assumptions about key
30# schedule (or data, for that matter) alignment and handles it
31# in-line. Second, being transliterated from vpaes-x86_64.pl,
32# it relies on "nested inversion", which is better suited to
33# Intel CPUs.
34# (**) Inadequate POWER6 performance is due to astronomic AltiVec
35# latency, 9 cycles per simple logical operation.
36
37$flavour = shift;
38
39if ($flavour =~ /64/) {
40 $SIZE_T =8;
41 $LRSAVE =2*$SIZE_T;
42 $STU ="stdu";
43 $POP ="ld";
44 $PUSH ="std";
45 $UCMP ="cmpld";
46} elsif ($flavour =~ /32/) {
47 $SIZE_T =4;
48 $LRSAVE =$SIZE_T;
49 $STU ="stwu";
50 $POP ="lwz";
51 $PUSH ="stw";
52 $UCMP ="cmplw";
53} else { die "nonsense $flavour"; }
54
55$sp="r1";
56$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
57
58$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
60( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
61die "can't locate ppc-xlate.pl";
62
63open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
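# Example invocation (an assumption, mirroring how the other perlasm
# scripts in this tree are driven): the first argument selects the
# flavour matched above and the second names the output file handed to
# ppc-xlate.pl, e.g.
#
#   perl vpaes-ppc.pl linux64le vpaes-ppc.s
#
# with linux32/linux64 being the other obvious flavours.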
64
65$code.=<<___;
66.machine "any"
67
68.text
69
70.align 7 # totally strategic alignment
71_vpaes_consts:
72Lk_mc_forward: # mc_forward
73 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
74 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
75 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
76 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
77Lk_mc_backward: # mc_backward
78 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
79 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
80 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
81 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
82Lk_sr: # sr
83 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
84 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
85 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
86 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
87
88##
89## "Hot" constants
90##
91Lk_inv: # inv, inva
92 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
93 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
94Lk_ipt: # input transform (lo, hi)
95 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
96 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
97Lk_sbo: # sbou, sbot
98 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
99 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
100Lk_sb1: # sb1u, sb1t
101 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
102 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
103Lk_sb2: # sb2u, sb2t
104 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
105 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
106
107##
108## Decryption stuff
109##
110Lk_dipt: # decryption input transform
111 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
112 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
113Lk_dsbo: # decryption sbox final output
114 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
115 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
116Lk_dsb9: # decryption sbox output *9*u, *9*t
117 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
118 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
119Lk_dsbd: # decryption sbox output *D*u, *D*t
120 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
121 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
122Lk_dsbb: # decryption sbox output *B*u, *B*t
123 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
124 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
125Lk_dsbe: # decryption sbox output *E*u, *E*t
126 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
127 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
128
129##
130## Key schedule constants
131##
132Lk_dksd: # decryption key schedule: invskew x*D
133 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
134 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
135Lk_dksb: # decryption key schedule: invskew x*B
136 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
137 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
138Lk_dkse: # decryption key schedule: invskew x*E + 0x63
139 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
140 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
141Lk_dks9: # decryption key schedule: invskew x*9
142 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
143 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
144
145Lk_rcon: # rcon
146 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
147Lk_s63:
148 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
149
150Lk_opt: # output transform
151 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
152 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
153Lk_deskew: # deskew tables: inverts the sbox's "skew"
154 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
155 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
156.align 5
157Lconsts:
158 mflr r0
159 bcl 20,31,\$+4
160 mflr r12 #vvvvv "distance between . and _vpaes_consts
161 addi r12,r12,-0x308
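# (Why -0x308: the table at _vpaes_consts is 0x300 bytes long, Lconsts
# starts right after it, and bcl sets the link register to the address
# of the instruction following it, i.e. 8 bytes into Lconsts, so r12
# holds _vpaes_consts+0x308 at this point.)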
162 mtlr r0
163 blr
164 .long 0
165 .byte 0,12,0x14,0,0,0,0,0
166.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
167.align 6
168___
169\f
170my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
171{
172my ($inp,$out,$key) = map("r$_",(3..5));
173
174my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
175my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
176my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
177
178$code.=<<___;
179##
180## _aes_preheat
181##
182## Fills register %r10 -> .aes_consts (so you can -fPIC)
183## and %xmm9-%xmm15 as specified below.
184##
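## (In this PPC transliteration the constants base is returned in r12 by
## Lconsts rather than %r10, v7/v8/v9 hold the 0x00/0x04/0x0f splats and
## the lookup tables are preloaded into v10-v19; the decrypt preheat
## below uses v10-v23.)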
185.align 4
186_vpaes_encrypt_preheat:
187 mflr r8
188 bl Lconsts
189 mtlr r8
190 li r11, 0xc0 # Lk_inv
191 li r10, 0xd0
192 li r9, 0xe0 # Lk_ipt
193 li r8, 0xf0
194 vxor v7, v7, v7 # 0x00..00
195 vspltisb v8,4 # 0x04..04
196 vspltisb v9,0x0f # 0x0f..0f
197 lvx $invlo, r12, r11
198 li r11, 0x100
199 lvx $invhi, r12, r10
200 li r10, 0x110
201 lvx $iptlo, r12, r9
202 li r9, 0x120
203 lvx $ipthi, r12, r8
204 li r8, 0x130
205 lvx $sbou, r12, r11
206 li r11, 0x140
207 lvx $sbot, r12, r10
208 li r10, 0x150
209 lvx $sb1u, r12, r9
210 lvx $sb1t, r12, r8
211 lvx $sb2u, r12, r11
212 lvx $sb2t, r12, r10
213 blr
214 .long 0
215 .byte 0,12,0x14,0,0,0,0,0
216
217##
218## _aes_encrypt_core
219##
220## AES-encrypt %xmm0.
221##
222## Inputs:
223## %xmm0 = input
224## %xmm9-%xmm15 as in _vpaes_preheat
225## (%rdx) = scheduled keys
226##
227## Output in %xmm0
228## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
229##
230##
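## (PPC mapping used here: the block travels in v0, the scheduled keys
## are streamed through v5/v6 with $keyperm fixing up their alignment,
## and r12 is assumed to point at _vpaes_consts as set up by Lconsts.)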
231.align 5
232_vpaes_encrypt_core:
233 lwz r8, 240($key) # pull rounds
234 li r9, 16
235 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
236 li r11, 0x10
237 lvx v6, r9, $key
238 addi r9, r9, 16
239 ?vperm v5, v5, v6, $keyperm # align round key
240 addi r10, r11, 0x40
241 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
242 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
243 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
244 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
245 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
246 mtctr r8
247 b Lenc_entry
248
249.align 4
250Lenc_loop:
251 # middle of middle round
252 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
253 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
254 addi r11, r11, 16
255 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
256 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
257 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
258 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
259 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
260 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
261 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
262 addi r10, r11, 0x40
263 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
264 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
265 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
266 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
267 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
268 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
269 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
270
271Lenc_entry:
272 # top of round
273 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
274 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
275 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
276 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
277 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
278 vand v0, v0, v9
279 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
280 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
281 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
282 vmr v5, v6
283 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
284 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
285 addi r9, r9, 16
286 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
287 ?vperm v5, v5, v6, $keyperm # align round key
288 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
289 bdnz Lenc_loop
290
291 # middle of last round
292 addi r10, r11, 0x80
293 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
294 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
295 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
296 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
297 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
298 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
299 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
300 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
301 blr
302 .long 0
303 .byte 0,12,0x14,0,0,0,0,0
304
305.globl .vpaes_encrypt
306.align 5
307.vpaes_encrypt:
308 $STU $sp,-$FRAME($sp)
309 li r10,`15+6*$SIZE_T`
310 li r11,`31+6*$SIZE_T`
311 mflr r6
312 mfspr r7, 256 # save vrsave
313 stvx v20,r10,$sp
314 addi r10,r10,32
315 stvx v21,r11,$sp
316 addi r11,r11,32
317 stvx v22,r10,$sp
318 addi r10,r10,32
319 stvx v23,r11,$sp
320 addi r11,r11,32
321 stvx v24,r10,$sp
322 addi r10,r10,32
323 stvx v25,r11,$sp
324 addi r11,r11,32
325 stvx v26,r10,$sp
326 addi r10,r10,32
327 stvx v27,r11,$sp
328 addi r11,r11,32
329 stvx v28,r10,$sp
330 addi r10,r10,32
331 stvx v29,r11,$sp
332 addi r11,r11,32
333 stvx v30,r10,$sp
334 stvx v31,r11,$sp
335 stw r7,`$FRAME-4`($sp) # save vrsave
336 li r0, -1
337 $PUSH r6,`$FRAME+$LRSAVE`($sp)
338 mtspr 256, r0 # preserve all AltiVec registers
339
340 bl _vpaes_encrypt_preheat
341
342 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
343 lvx v0, 0, $inp
344 addi $inp, $inp, 15 # 15 is not a typo
345 ?lvsr $outperm, 0, $out
346 ?lvsl $keyperm, 0, $key # prepare for unaligned access
347 lvx $inptail, 0, $inp # redundant in aligned case
348 ?vperm v0, v0, $inptail, $inpperm
349
350 bl _vpaes_encrypt_core
351
352 andi. r8, $out, 15
353 li r9, 16
354 beq Lenc_out_aligned
355
356 vperm v0, v0, v0, $outperm # rotate right/left
357 mtctr r9
358Lenc_out_unaligned:
359 stvebx v0, 0, $out
360 addi $out, $out, 1
361 bdnz Lenc_out_unaligned
362 b Lenc_done
363
364.align 4
365Lenc_out_aligned:
366 stvx v0, 0, $out
367Lenc_done:
368
369 li r10,`15+6*$SIZE_T`
370 li r11,`31+6*$SIZE_T`
371 mtlr r6
372 mtspr 256, r7 # restore vrsave
373 lvx v20,r10,$sp
374 addi r10,r10,32
375 lvx v21,r11,$sp
376 addi r11,r11,32
377 lvx v22,r10,$sp
378 addi r10,r10,32
379 lvx v23,r11,$sp
380 addi r11,r11,32
381 lvx v24,r10,$sp
382 addi r10,r10,32
383 lvx v25,r11,$sp
384 addi r11,r11,32
385 lvx v26,r10,$sp
386 addi r10,r10,32
387 lvx v27,r11,$sp
388 addi r11,r11,32
389 lvx v28,r10,$sp
390 addi r10,r10,32
391 lvx v29,r11,$sp
392 addi r11,r11,32
393 lvx v30,r10,$sp
394 lvx v31,r11,$sp
395 addi $sp,$sp,$FRAME
396 blr
397 .long 0
398 .byte 0,12,0x04,1,0x80,0,3,0
399 .long 0
400.size .vpaes_encrypt,.-.vpaes_encrypt
401
402.align 4
403_vpaes_decrypt_preheat:
404 mflr r8
405 bl Lconsts
406 mtlr r8
407 li r11, 0xc0 # Lk_inv
408 li r10, 0xd0
409 li r9, 0x160 # Ldipt
410 li r8, 0x170
411 vxor v7, v7, v7 # 0x00..00
412 vspltisb v8,4 # 0x04..04
413 vspltisb v9,0x0f # 0x0f..0f
414 lvx $invlo, r12, r11
415 li r11, 0x180
416 lvx $invhi, r12, r10
417 li r10, 0x190
418 lvx $iptlo, r12, r9
419 li r9, 0x1a0
420 lvx $ipthi, r12, r8
421 li r8, 0x1b0
422 lvx $sbou, r12, r11
423 li r11, 0x1c0
424 lvx $sbot, r12, r10
425 li r10, 0x1d0
426 lvx $sb9u, r12, r9
427 li r9, 0x1e0
428 lvx $sb9t, r12, r8
429 li r8, 0x1f0
430 lvx $sbdu, r12, r11
431 li r11, 0x200
432 lvx $sbdt, r12, r10
433 li r10, 0x210
434 lvx $sbbu, r12, r9
435 lvx $sbbt, r12, r8
436 lvx $sbeu, r12, r11
437 lvx $sbet, r12, r10
438 blr
439 .long 0
440 .byte 0,12,0x14,0,0,0,0,0
441
442##
443## Decryption core
444##
445## Same API as encryption core.
446##
447.align 4
448_vpaes_decrypt_core:
449 lwz r8, 240($key) # pull rounds
450 li r9, 16
451 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
452 li r11, 0x30
453 lvx v6, r9, $key
454 addi r9, r9, 16
455 ?vperm v5, v5, v6, $keyperm # align round key
456 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
457 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
458 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
459 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
460 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
461 mtctr r8
462 b Ldec_entry
463
464.align 4
465Ldec_loop:
466#
467# Inverse mix columns
468#
469 lvx v0, r12, r11 # v5 and v0 are flipped
470 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
471 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
472 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
473 subi r11, r11, 16
474 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
475 andi. r11, r11, 0x30
476 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
477 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
478 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
479 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
480
481 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
482 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
483 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
484 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
485 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
486 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
487 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
488
489 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
490 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
491 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
492 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
493 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
494 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
495 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
496
497 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
498 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
499 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
500 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
501 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
502
503Ldec_entry:
504 # top of round
505 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
506 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
507 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
508 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
509 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
510 vand v0, v0, v9
511 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
512 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
513 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
514 vmr v5, v6
515 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
516 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
517 addi r9, r9, 16
518 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
519 ?vperm v5, v5, v6, $keyperm # align round key
520 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
521 bdnz Ldec_loop
522
523 # middle of last round
524 addi r10, r11, 0x80
525 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
526 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
527 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
528 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
529 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
530 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
531 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
532 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
533 blr
534 .long 0
535 .byte 0,12,0x14,0,0,0,0,0
536
537.globl .vpaes_decrypt
538.align 5
539.vpaes_decrypt:
540 $STU $sp,-$FRAME($sp)
541 li r10,`15+6*$SIZE_T`
542 li r11,`31+6*$SIZE_T`
543 mflr r6
544 mfspr r7, 256 # save vrsave
545 stvx v20,r10,$sp
546 addi r10,r10,32
547 stvx v21,r11,$sp
548 addi r11,r11,32
549 stvx v22,r10,$sp
550 addi r10,r10,32
551 stvx v23,r11,$sp
552 addi r11,r11,32
553 stvx v24,r10,$sp
554 addi r10,r10,32
555 stvx v25,r11,$sp
556 addi r11,r11,32
557 stvx v26,r10,$sp
558 addi r10,r10,32
559 stvx v27,r11,$sp
560 addi r11,r11,32
561 stvx v28,r10,$sp
562 addi r10,r10,32
563 stvx v29,r11,$sp
564 addi r11,r11,32
565 stvx v30,r10,$sp
566 stvx v31,r11,$sp
567 stw r7,`$FRAME-4`($sp) # save vrsave
568 li r0, -1
569 $PUSH r6,`$FRAME+$LRSAVE`($sp)
570 mtspr 256, r0 # preserve all AltiVec registers
571
572 bl _vpaes_decrypt_preheat
573
574 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
575 lvx v0, 0, $inp
576 addi $inp, $inp, 15 # 15 is not a typo
577 ?lvsr $outperm, 0, $out
578 ?lvsl $keyperm, 0, $key
579 lvx $inptail, 0, $inp # redundant in aligned case
580 ?vperm v0, v0, $inptail, $inpperm
581
582 bl _vpaes_decrypt_core
583
584 andi. r8, $out, 15
585 li r9, 16
586 beq Ldec_out_aligned
587
588 vperm v0, v0, v0, $outperm # rotate right/left
589 mtctr r9
590Ldec_out_unaligned:
591 stvebx v0, 0, $out
592 addi $out, $out, 1
593 bdnz Ldec_out_unaligned
594 b Ldec_done
595
596.align 4
597Ldec_out_aligned:
598 stvx v0, 0, $out
599Ldec_done:
600
601 li r10,`15+6*$SIZE_T`
602 li r11,`31+6*$SIZE_T`
603 mtlr r6
604 mtspr 256, r7 # restore vrsave
605 lvx v20,r10,$sp
606 addi r10,r10,32
607 lvx v21,r11,$sp
608 addi r11,r11,32
609 lvx v22,r10,$sp
610 addi r10,r10,32
611 lvx v23,r11,$sp
612 addi r11,r11,32
613 lvx v24,r10,$sp
614 addi r10,r10,32
615 lvx v25,r11,$sp
616 addi r11,r11,32
617 lvx v26,r10,$sp
618 addi r10,r10,32
619 lvx v27,r11,$sp
620 addi r11,r11,32
621 lvx v28,r10,$sp
622 addi r10,r10,32
623 lvx v29,r11,$sp
624 addi r11,r11,32
625 lvx v30,r10,$sp
626 lvx v31,r11,$sp
627 addi $sp,$sp,$FRAME
628 blr
629 .long 0
630 .byte 0,12,0x04,1,0x80,0,3,0
631 .long 0
632.size .vpaes_decrypt,.-.vpaes_decrypt
633
634.globl .vpaes_cbc_encrypt
635.align 5
636.vpaes_cbc_encrypt:
637 ${UCMP}i r5,16
638 bltlr-
639
640 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
641 mflr r0
642 li r10,`15+6*$SIZE_T`
643 li r11,`31+6*$SIZE_T`
644 mfspr r12, 256
645 stvx v20,r10,$sp
646 addi r10,r10,32
647 stvx v21,r11,$sp
648 addi r11,r11,32
649 stvx v22,r10,$sp
650 addi r10,r10,32
651 stvx v23,r11,$sp
652 addi r11,r11,32
653 stvx v24,r10,$sp
654 addi r10,r10,32
655 stvx v25,r11,$sp
656 addi r11,r11,32
657 stvx v26,r10,$sp
658 addi r10,r10,32
659 stvx v27,r11,$sp
660 addi r11,r11,32
661 stvx v28,r10,$sp
662 addi r10,r10,32
663 stvx v29,r11,$sp
664 addi r11,r11,32
665 stvx v30,r10,$sp
666 stvx v31,r11,$sp
667 stw r12,`$FRAME-4`($sp) # save vrsave
668 $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
669 $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
670 li r9, -16
671 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
672
673 and r30, r5, r9 # copy length&-16
674 andi. r9, $out, 15 # is $out aligned?
675 mr r5, r6 # copy pointer to key
676 mr r31, r7 # copy pointer to iv
677 li r6, -1
678 mcrf cr1, cr0 # put aside $out alignment flag
679 mr r7, r12 # copy vrsave
680 mtspr 256, r6 # preserve all AltiVec registers
681
682 lvx v24, 0, r31 # load [potentially unaligned] iv
683 li r9, 15
684 ?lvsl $inpperm, 0, r31
685 lvx v25, r9, r31
686 ?vperm v24, v24, v25, $inpperm
687
688 cmpwi r8, 0 # test direction
689 neg r8, $inp # prepare for unaligned access
690 vxor v7, v7, v7
691 ?lvsl $keyperm, 0, $key
692 ?lvsr $outperm, 0, $out
693 ?lvsr $inpperm, 0, r8 # -$inp
694 vnor $outmask, v7, v7 # 0xff..ff
695 lvx $inptail, 0, $inp
696 ?vperm $outmask, v7, $outmask, $outperm
697 addi $inp, $inp, 15 # 15 is not a typo
698
699 beq Lcbc_decrypt
700
701 bl _vpaes_encrypt_preheat
702 li r0, 16
703
704 beq cr1, Lcbc_enc_loop # $out is aligned
705
706 vmr v0, $inptail
707 lvx $inptail, 0, $inp
708 addi $inp, $inp, 16
709 ?vperm v0, v0, $inptail, $inpperm
710 vxor v0, v0, v24 # ^= iv
711
712 bl _vpaes_encrypt_core
713
714 andi. r8, $out, 15
715 vmr v24, v0 # put aside iv
716 sub r9, $out, r8
717 vperm $outhead, v0, v0, $outperm # rotate right/left
718
719Lcbc_enc_head:
720 stvebx $outhead, r8, r9
721 cmpwi r8, 15
722 addi r8, r8, 1
723 bne Lcbc_enc_head
724
725 sub. r30, r30, r0 # len -= 16
726 addi $out, $out, 16
727 beq Lcbc_unaligned_done
728
729Lcbc_enc_loop:
730 vmr v0, $inptail
731 lvx $inptail, 0, $inp
732 addi $inp, $inp, 16
733 ?vperm v0, v0, $inptail, $inpperm
734 vxor v0, v0, v24 # ^= iv
735
736 bl _vpaes_encrypt_core
737
738 vmr v24, v0 # put aside iv
739 sub. r30, r30, r0 # len -= 16
740 vperm v0, v0, v0, $outperm # rotate right/left
741 vsel v1, $outhead, v0, $outmask
742 vmr $outhead, v0
743 stvx v1, 0, $out
744 addi $out, $out, 16
745 bne Lcbc_enc_loop
746
747 b Lcbc_done
748
749.align 5
750Lcbc_decrypt:
751 bl _vpaes_decrypt_preheat
752 li r0, 16
753
754 beq cr1, Lcbc_dec_loop # $out is aligned
755
756 vmr v0, $inptail
757 lvx $inptail, 0, $inp
758 addi $inp, $inp, 16
759 ?vperm v0, v0, $inptail, $inpperm
760 vmr v25, v0 # put aside input
761
762 bl _vpaes_decrypt_core
763
764 andi. r8, $out, 15
765 vxor v0, v0, v24 # ^= iv
766 vmr v24, v25
767 sub r9, $out, r8
768 vperm $outhead, v0, v0, $outperm # rotate right/left
769
770Lcbc_dec_head:
771 stvebx $outhead, r8, r9
772 cmpwi r8, 15
773 addi r8, r8, 1
774 bne Lcbc_dec_head
775
776 sub. r30, r30, r0 # len -= 16
777 addi $out, $out, 16
778 beq Lcbc_unaligned_done
779
780Lcbc_dec_loop:
781 vmr v0, $inptail
782 lvx $inptail, 0, $inp
783 addi $inp, $inp, 16
784 ?vperm v0, v0, $inptail, $inpperm
785 vmr v25, v0 # put aside input
786
787 bl _vpaes_decrypt_core
788
789 vxor v0, v0, v24 # ^= iv
790 vmr v24, v25
791 sub. r30, r30, r0 # len -= 16
792 vperm v0, v0, v0, $outperm # rotate right/left
793 vsel v1, $outhead, v0, $outmask
794 vmr $outhead, v0
795 stvx v1, 0, $out
796 addi $out, $out, 16
797 bne Lcbc_dec_loop
798
799Lcbc_done:
800 beq cr1, Lcbc_write_iv # $out is aligned
801
802Lcbc_unaligned_done:
803 andi. r8, $out, 15
804 sub $out, $out, r8
805 li r9, 0
806Lcbc_tail:
807 stvebx $outhead, r9, $out
808 addi r9, r9, 1
809 cmpw r9, r8
810 bne Lcbc_tail
811
812Lcbc_write_iv:
813 neg r8, r31 # write [potentially unaligned] iv
814 li r10, 4
815 ?lvsl $outperm, 0, r8
816 li r11, 8
817 li r12, 12
818 vperm v24, v24, v24, $outperm # rotate right/left
819 stvewx v24, 0, r31 # ivp is at least 32-bit aligned
820 stvewx v24, r10, r31
821 stvewx v24, r11, r31
822 stvewx v24, r12, r31
823
824 mtspr 256, r7 # restore vrsave
825 li r10,`15+6*$SIZE_T`
826 li r11,`31+6*$SIZE_T`
827 lvx v20,r10,$sp
828 addi r10,r10,32
829 lvx v21,r11,$sp
830 addi r11,r11,32
831 lvx v22,r10,$sp
832 addi r10,r10,32
833 lvx v23,r11,$sp
834 addi r11,r11,32
835 lvx v24,r10,$sp
836 addi r10,r10,32
837 lvx v25,r11,$sp
838 addi r11,r11,32
839 lvx v26,r10,$sp
840 addi r10,r10,32
841 lvx v27,r11,$sp
842 addi r11,r11,32
843 lvx v28,r10,$sp
844 addi r10,r10,32
845 lvx v29,r11,$sp
846 addi r11,r11,32
847 lvx v30,r10,$sp
848 lvx v31,r11,$sp
849Lcbc_abort:
850 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
851 $POP r30,`$FRAME+$SIZE_T*0`($sp)
852 $POP r31,`$FRAME+$SIZE_T*1`($sp)
853 mtlr r0
854 addi $sp,$sp,`$FRAME+$SIZE_T*2`
855 blr
856 .long 0
857 .byte 0,12,0x04,1,0x80,2,6,0
858 .long 0
859.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
860___
861}\f
862{
863my ($inp,$bits,$out)=map("r$_",(3..5));
864my $dir="cr1";
865my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
866
867$code.=<<___;
868########################################################
869## ##
870## AES key schedule ##
871## ##
872########################################################
873.align 4
874_vpaes_key_preheat:
875 mflr r8
876 bl Lconsts
877 mtlr r8
878 li r11, 0xc0 # Lk_inv
879 li r10, 0xd0
880 li r9, 0xe0 # L_ipt
881 li r8, 0xf0
882
883 vspltisb v8,4 # 0x04..04
884 vxor v9,v9,v9 # 0x00..00
885 lvx $invlo, r12, r11 # Lk_inv
886 li r11, 0x120
887 lvx $invhi, r12, r10
888 li r10, 0x130
889 lvx $iptlo, r12, r9 # Lk_ipt
890 li r9, 0x220
891 lvx $ipthi, r12, r8
892 li r8, 0x230
893
894 lvx v14, r12, r11 # Lk_sb1
895 li r11, 0x240
896 lvx v15, r12, r10
897 li r10, 0x250
898
899 lvx v16, r12, r9 # Lk_dksd
900 li r9, 0x260
901 lvx v17, r12, r8
902 li r8, 0x270
903 lvx v18, r12, r11 # Lk_dksb
904 li r11, 0x280
905 lvx v19, r12, r10
906 li r10, 0x290
907 lvx v20, r12, r9 # Lk_dkse
908 li r9, 0x2a0
909 lvx v21, r12, r8
910 li r8, 0x2b0
911 lvx v22, r12, r11 # Lk_dks9
912 lvx v23, r12, r10
913
914 lvx v24, r12, r9 # Lk_rcon
915 lvx v25, 0, r12 # Lk_mc_forward[0]
916 lvx v26, r12, r8 # Lks63
917 blr
918 .long 0
919 .byte 0,12,0x14,0,0,0,0,0
920
921.align 4
922_vpaes_schedule_core:
923 mflr r7
924
925 bl _vpaes_key_preheat # load the tables
926
927 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
928 neg r8, $inp # prepare for unaligned access
929 lvx v0, 0, $inp
930 addi $inp, $inp, 15 # 15 is not typo
931 ?lvsr $inpperm, 0, r8 # -$inp
932 lvx v6, 0, $inp # v6 serves as inptail
933 addi $inp, $inp, 8
934 ?vperm v0, v0, v6, $inpperm
935
936 # input transform
937 vmr v3, v0 # vmovdqa %xmm0, %xmm3
938 bl _vpaes_schedule_transform
939 vmr v7, v0 # vmovdqa %xmm0, %xmm7
940
941 bne $dir, Lschedule_am_decrypting
942
943 # encrypting, output zeroth round key after transform
944 li r8, 0x30 # mov \$0x30,%r8d
945 li r9, 4
946 li r10, 8
947 li r11, 12
948
949 ?lvsr $outperm, 0, $out # prepare for unaligned access
950 vnor $outmask, v9, v9 # 0xff..ff
951 ?vperm $outmask, v9, $outmask, $outperm
952
953 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
954 vperm $outhead, v0, v0, $outperm # rotate right/left
955 stvewx $outhead, 0, $out # some are superfluous
956 stvewx $outhead, r9, $out
957 stvewx $outhead, r10, $out
958 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
959 stvewx $outhead, r11, $out
960 b Lschedule_go
961
962Lschedule_am_decrypting:
963 srwi r8, $bits, 1 # shr \$1,%r8d
964 andi. r8, r8, 32 # and \$32,%r8d
965 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
966 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
967 # decrypting, output zeroth round key after shiftrows
968 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
969 li r9, 4
970 li r10, 8
971 li r11, 12
972 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
973
974 neg r0, $out # prepare for unaligned access
975 ?lvsl $outperm, 0, r0
976 vnor $outmask, v9, v9 # 0xff..ff
977 ?vperm $outmask, $outmask, v9, $outperm
978
979 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
980 vperm $outhead, v4, v4, $outperm # rotate right/left
981 stvewx $outhead, 0, $out # some are superfluous
982 stvewx $outhead, r9, $out
983 stvewx $outhead, r10, $out
984 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
985 stvewx $outhead, r11, $out
986 addi $out, $out, 15 # 15 is not typo
987 xori r8, r8, 0x30 # xor \$0x30, %r8
988
989Lschedule_go:
990 cmplwi $bits, 192 # cmp \$192, %esi
991 bgt Lschedule_256
992 beq Lschedule_192
993 # 128: fall through
994
995##
996## .schedule_128
997##
998## 128-bit specific part of key schedule.
999##
1000## This schedule is really simple, because all its parts
1001## are accomplished by the subroutines.
1002##
1003Lschedule_128:
1004 li r0, 10 # mov \$10, %esi
1005 mtctr r0
1006
1007Loop_schedule_128:
1008 bl _vpaes_schedule_round
1009 bdz Lschedule_mangle_last # dec %esi
1010 bl _vpaes_schedule_mangle # write output
1011 b Loop_schedule_128
1012
1013##
1014## .aes_schedule_192
1015##
1016## 192-bit specific part of key schedule.
1017##
1018## The main body of this schedule is the same as the 128-bit
1019## schedule, but with more smearing. The long, high side is
1020## stored in %xmm7 as before, and the short, low side is in
1021## the high bits of %xmm6.
1022##
1023## This schedule is somewhat nastier, however, because each
1024## round produces 192 bits of key material, or 1.5 round keys.
1025## Therefore, on each cycle we do 2 rounds and produce 3 round
1026## keys.
1027##
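## (Arithmetic check: AES-192 uses 12 rounds, i.e. 13 round keys; the
## zeroth key is written before Lschedule_go, and four passes through
## Loop_schedule_192 plus the final Lschedule_mangle_last supply the
## remaining twelve.)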
1028.align 4
1029Lschedule_192:
1030 li r0, 4 # mov \$4, %esi
1031 lvx v0, 0, $inp
1032 ?vperm v0, v6, v0, $inpperm
1033 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
1034 bl _vpaes_schedule_transform # input transform
1035 ?vsldoi v6, v0, v9, 8
1036 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
1037 mtctr r0
1038
1039Loop_schedule_192:
1040 bl _vpaes_schedule_round
1041 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
1042 bl _vpaes_schedule_mangle # save key n
1043 bl _vpaes_schedule_192_smear
1044 bl _vpaes_schedule_mangle # save key n+1
1045 bl _vpaes_schedule_round
1046 bdz Lschedule_mangle_last # dec %esi
1047 bl _vpaes_schedule_mangle # save key n+2
1048 bl _vpaes_schedule_192_smear
1049 b Loop_schedule_192
1050
1051##
1052## .aes_schedule_256
1053##
1054## 256-bit specific part of key schedule.
1055##
1056## The structure here is very similar to the 128-bit
1057## schedule, but with an additional "low side" in
1058## %xmm6. The low side's rounds are the same as the
1059## high side's, except no rcon and no rotation.
1060##
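## (Arithmetic check: AES-256 uses 14 rounds, i.e. 15 round keys; the
## zeroth key plus fourteen more produced by seven trips through
## Loop_schedule_256 and the final Lschedule_mangle_last.)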
1061.align 4
1062Lschedule_256:
1063 li r0, 7 # mov \$7, %esi
1064 addi $inp, $inp, 8
1065 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
1066 ?vperm v0, v6, v0, $inpperm
1067 bl _vpaes_schedule_transform # input transform
1068 mtctr r0
1069
1070Loop_schedule_256:
1071 bl _vpaes_schedule_mangle # output low result
1072 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
1073
1074 # high round
1075 bl _vpaes_schedule_round
1076 bdz Lschedule_mangle_last # dec %esi
1077 bl _vpaes_schedule_mangle
1078
1079 # low round. swap xmm7 and xmm6
1080 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1081 vmr v5, v7 # vmovdqa %xmm7, %xmm5
1082 vmr v7, v6 # vmovdqa %xmm6, %xmm7
1083 bl _vpaes_schedule_low_round
1084 vmr v7, v5 # vmovdqa %xmm5, %xmm7
1085
1086 b Loop_schedule_256
1087##
1088## .aes_schedule_mangle_last
1089##
1090## Mangler for last round of key schedule
1091## Mangles %xmm0
1092## when encrypting, outputs out(%xmm0) ^ 63
1093## when decrypting, outputs unskew(%xmm0)
1094##
1095## Always called right before return... jumps to cleanup and exits
1096##
1097.align 4
1098Lschedule_mangle_last:
1099 # schedule last round key from xmm0
1100 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
1101 li r9, 0x2f0
1102 bne $dir, Lschedule_mangle_last_dec
1103
1104 # encrypting
1105 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
1106 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
1107 li r9, 0x2d0 # prepare to output transform
1108 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
1109
1110 lvx $iptlo, r11, r12 # reload $ipt
1111 lvx $ipthi, r9, r12
1112 addi $out, $out, 16 # add \$16, %rdx
1113 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1114 bl _vpaes_schedule_transform # output transform
1115
1116 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1117 vperm v0, v0, v0, $outperm # rotate right/left
1118 li r10, 4
1119 vsel v2, $outhead, v0, $outmask
1120 li r11, 8
1121 stvx v2, 0, $out
1122 li r12, 12
1123 stvewx v0, 0, $out # some (or all) are redundant
1124 stvewx v0, r10, $out
1125 stvewx v0, r11, $out
1126 stvewx v0, r12, $out
1127 b Lschedule_mangle_done
1128
1129.align 4
1130Lschedule_mangle_last_dec:
1131 lvx $iptlo, r11, r12 # reload $ipt
1132 lvx $ipthi, r9, r12
1133 addi $out, $out, -16 # add \$-16, %rdx
1134 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1135 bl _vpaes_schedule_transform # output transform
1136
1137 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1138 addi r9, $out, -15 # -15 is not typo
1139 vperm v0, v0, v0, $outperm # rotate right/left
1140 li r10, 4
1141 vsel v2, $outhead, v0, $outmask
1142 li r11, 8
1143 stvx v2, 0, $out
1144 li r12, 12
1145 stvewx v0, 0, r9 # some (or all) are redundant
1146 stvewx v0, r10, r9
1147 stvewx v0, r11, r9
1148 stvewx v0, r12, r9
1149
1150
1151Lschedule_mangle_done:
1152 mtlr r7
1153 # cleanup
1154 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
1155 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
1156 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
1157 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
1158 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1159 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
1160 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
1161 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
1162
1163 blr
1164 .long 0
1165 .byte 0,12,0x14,0,0,0,0,0
1166
1167##
1168## .aes_schedule_192_smear
1169##
1170## Smear the short, low side in the 192-bit key schedule.
1171##
1172## Inputs:
1173## %xmm7: high side, b a x y
1174## %xmm6: low side, d c 0 0
1175## %xmm13: 0
1176##
1177## Outputs:
1178## %xmm6: b+c+d b+c 0 0
1179## %xmm0: b+c+d b+c b a
1180##
1181.align 4
1182_vpaes_schedule_192_smear:
1183 ?vspltw v0, v7, 3
1184 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
1185 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
1186 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
1187 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
1188 vmr v0, v6
1189 ?vsldoi v6, v6, v9, 8
1190 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
1191 blr
1192 .long 0
1193 .byte 0,12,0x14,0,0,0,0,0
1194
1195##
1196## .aes_schedule_round
1197##
1198## Runs one main round of the key schedule on %xmm0, %xmm7
1199##
1200## Specifically, runs subbytes on the high dword of %xmm0
1201## then rotates it by one byte and xors into the low dword of
1202## %xmm7.
1203##
1204## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
1205## next rcon.
1206##
1207## Smears the dwords of %xmm7 by xoring the low into the
1208## second low, result into third, result into highest.
1209##
1210## Returns results in %xmm7 = %xmm0.
1211## Clobbers %xmm1-%xmm4, %r11.
1212##
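## (In this file the rcon is kept in $rcon=v24, Lk_s63 is preloaded in
## v26, v9 is the all-zero register and v8 the 0x04 splat, as arranged
## by _vpaes_key_preheat.)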
1213.align 4
1214_vpaes_schedule_round:
1215 # extract rcon from xmm8
1216 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1217 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
1218 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
1219 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1220
1221 # rotate
1222 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1223 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
1224
1225 # fall through...
1226
1227 # low round: same as high round, but no rotation and no rcon.
1228_vpaes_schedule_low_round:
1229 # smear xmm7
1230 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1231 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1232 vspltisb v1, 0x0f # 0x0f..0f
1233 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1234
1235 # subbytes
1236 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1237 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1238 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1239 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1240 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1241 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1242 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1243 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1244 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1245 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1246 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1247 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1248 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1249 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1250 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1251 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1252 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1253
1254 # add in smeared stuff
1255 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
1256 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
1257 blr
1258 .long 0
1259 .byte 0,12,0x14,0,0,0,0,0
1260
1261##
1262## .aes_schedule_transform
1263##
1264## Linear-transform %xmm0 according to tables at (%r11)
1265##
1266## Requires that %xmm9 = 0x0F0F... as in preheat
1267## Output in %xmm0
1268## Clobbers %xmm2
1269##
1270.align 4
1271_vpaes_schedule_transform:
1272 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1273 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1274 # vmovdqa (%r11), %xmm2 # lo
1275 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1276 # vmovdqa 16(%r11), %xmm1 # hi
1277 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
1278 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
1279 blr
1280 .long 0
1281 .byte 0,12,0x14,0,0,0,0,0
1282
1283##
1284## .aes_schedule_mangle
1285##
1286## Mangle xmm0 from (basis-transformed) standard version
1287## to our version.
1288##
1289## On encrypt,
1290## xor with 0x63
1291## multiply by circulant 0,1,1,1
1292## apply shiftrows transform
1293##
1294## On decrypt,
1295## xor with 0x63
1296## multiply by "inverse mixcolumns" circulant E,B,D,9
1297## deskew
1298## apply shiftrows transform
1299##
1300##
1301## Writes out to (%rdx), and increments or decrements it
1302## Keeps track of round number mod 4 in %r8
1303## Preserves xmm0
1304## Clobbers xmm1-xmm5
1305##
1306.align 4
1307_vpaes_schedule_mangle:
1308 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1309 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1310 bne $dir, Lschedule_mangle_dec
1311
1312 # encrypting
1313 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1314 addi $out, $out, 16 # add \$16, %rdx
1315 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1316 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1317 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1318 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1319 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1320 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1321
1322 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1323 addi r8, r8, -16 # add \$-16, %r8
1324 andi. r8, r8, 0x30 # and \$0x30, %r8
1325
1326 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1327 vperm v1, v3, v3, $outperm # rotate right/left
1328 vsel v2, $outhead, v1, $outmask
1329 vmr $outhead, v1
1330 stvx v2, 0, $out
1331 blr
1332
1333.align 4
1334Lschedule_mangle_dec:
1335 # inverse mix columns
1336 # lea .Lk_dksd(%rip),%r11
1337 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1338 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1339
1340 # vmovdqa 0x00(%r11), %xmm2
1341 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1342 # vmovdqa 0x10(%r11), %xmm3
1343 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1344 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1345 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1346
1347 # vmovdqa 0x20(%r11), %xmm2
1348 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1349 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1350 # vmovdqa 0x30(%r11), %xmm3
1351 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1352 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1353 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1354
1355 # vmovdqa 0x40(%r11), %xmm2
1356 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1357 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1358 # vmovdqa 0x50(%r11), %xmm3
1359 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1360 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1361
1362 # vmovdqa 0x60(%r11), %xmm2
1363 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1364 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1365 # vmovdqa 0x70(%r11), %xmm4
1366 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1367 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1368 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1369 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1370
1371 addi $out, $out, -16 # add \$-16, %rdx
1372
1373 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1374 addi r8, r8, -16 # add \$-16, %r8
1375 andi. r8, r8, 0x30 # and \$0x30, %r8
1376
1377 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1378 vperm v1, v3, v3, $outperm # rotate right/left
1379 vsel v2, $outhead, v1, $outmask
1380 vmr $outhead, v1
1381 stvx v2, 0, $out
1382 blr
1383 .long 0
1384 .byte 0,12,0x14,0,0,0,0,0
1385
1386.globl .vpaes_set_encrypt_key
1387.align 5
1388.vpaes_set_encrypt_key:
1389 $STU $sp,-$FRAME($sp)
1390 li r10,`15+6*$SIZE_T`
1391 li r11,`31+6*$SIZE_T`
1392 mflr r0
1393 mfspr r6, 256 # save vrsave
1394 stvx v20,r10,$sp
1395 addi r10,r10,32
1396 stvx v21,r11,$sp
1397 addi r11,r11,32
1398 stvx v22,r10,$sp
1399 addi r10,r10,32
1400 stvx v23,r11,$sp
1401 addi r11,r11,32
1402 stvx v24,r10,$sp
1403 addi r10,r10,32
1404 stvx v25,r11,$sp
1405 addi r11,r11,32
1406 stvx v26,r10,$sp
1407 addi r10,r10,32
1408 stvx v27,r11,$sp
1409 addi r11,r11,32
1410 stvx v28,r10,$sp
1411 addi r10,r10,32
1412 stvx v29,r11,$sp
1413 addi r11,r11,32
1414 stvx v30,r10,$sp
1415 stvx v31,r11,$sp
1416 stw r6,`$FRAME-4`($sp) # save vrsave
1417 li r7, -1
1418 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1419 mtspr 256, r7 # preserve all AltiVec registers
1420
1421 srwi r9, $bits, 5 # shr \$5,%eax
1422 addi r9, r9, 6 # add \$5,%eax
1423 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
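# (Note: r9 ends up as nbits/32+6, i.e. 10/12/14, which is what the
# mtctr-driven loops in the cores above consume; the "+5" in the
# trailing comment is carried over from the x86_64 original.)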
1424
1425 cmplw $dir, $bits, $bits # set encrypt direction
1426 li r8, 0x30 # mov \$0x30,%r8d
1427 bl _vpaes_schedule_core
1428
1429 $POP r0, `$FRAME+$LRSAVE`($sp)
1430 li r10,`15+6*$SIZE_T`
1431 li r11,`31+6*$SIZE_T`
1432 mtspr 256, r6 # restore vrsave
1433 mtlr r0
1434 xor r3, r3, r3
1435 lvx v20,r10,$sp
1436 addi r10,r10,32
1437 lvx v21,r11,$sp
1438 addi r11,r11,32
1439 lvx v22,r10,$sp
1440 addi r10,r10,32
1441 lvx v23,r11,$sp
1442 addi r11,r11,32
1443 lvx v24,r10,$sp
1444 addi r10,r10,32
1445 lvx v25,r11,$sp
1446 addi r11,r11,32
1447 lvx v26,r10,$sp
1448 addi r10,r10,32
1449 lvx v27,r11,$sp
1450 addi r11,r11,32
1451 lvx v28,r10,$sp
1452 addi r10,r10,32
1453 lvx v29,r11,$sp
1454 addi r11,r11,32
1455 lvx v30,r10,$sp
1456 lvx v31,r11,$sp
1457 addi $sp,$sp,$FRAME
1458 blr
1459 .long 0
1460 .byte 0,12,0x04,1,0x80,0,3,0
1461 .long 0
1462.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1463
1464.globl .vpaes_set_decrypt_key
1465.align 4
1466.vpaes_set_decrypt_key:
1467 $STU $sp,-$FRAME($sp)
1468 li r10,`15+6*$SIZE_T`
1469 li r11,`31+6*$SIZE_T`
1470 mflr r0
1471 mfspr r6, 256 # save vrsave
1472 stvx v20,r10,$sp
1473 addi r10,r10,32
1474 stvx v21,r11,$sp
1475 addi r11,r11,32
1476 stvx v22,r10,$sp
1477 addi r10,r10,32
1478 stvx v23,r11,$sp
1479 addi r11,r11,32
1480 stvx v24,r10,$sp
1481 addi r10,r10,32
1482 stvx v25,r11,$sp
1483 addi r11,r11,32
1484 stvx v26,r10,$sp
1485 addi r10,r10,32
1486 stvx v27,r11,$sp
1487 addi r11,r11,32
1488 stvx v28,r10,$sp
1489 addi r10,r10,32
1490 stvx v29,r11,$sp
1491 addi r11,r11,32
1492 stvx v30,r10,$sp
1493 stvx v31,r11,$sp
1494 stw r6,`$FRAME-4`($sp) # save vrsave
1495 li r7, -1
1496 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1497 mtspr 256, r7 # preserve all AltiVec registers
1498
1499 srwi r9, $bits, 5 # shr \$5,%eax
1500 addi r9, r9, 6 # add \$5,%eax
1501 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1502
1503 slwi r9, r9, 4 # shl \$4,%eax
1504 add $out, $out, r9 # lea (%rdx,%rax),%rdx
1505
1506 cmplwi $dir, $bits, 0 # set decrypt direction
1507 srwi r8, $bits, 1 # shr \$1,%r8d
1508 andi. r8, r8, 32 # and \$32,%r8d
1509 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1510 bl _vpaes_schedule_core
1511
1512 $POP r0, `$FRAME+$LRSAVE`($sp)
1513 li r10,`15+6*$SIZE_T`
1514 li r11,`31+6*$SIZE_T`
1515 mtspr 256, r6 # restore vrsave
1516 mtlr r0
1517 xor r3, r3, r3
1518 lvx v20,r10,$sp
1519 addi r10,r10,32
1520 lvx v21,r11,$sp
1521 addi r11,r11,32
1522 lvx v22,r10,$sp
1523 addi r10,r10,32
1524 lvx v23,r11,$sp
1525 addi r11,r11,32
1526 lvx v24,r10,$sp
1527 addi r10,r10,32
1528 lvx v25,r11,$sp
1529 addi r11,r11,32
1530 lvx v26,r10,$sp
1531 addi r10,r10,32
1532 lvx v27,r11,$sp
1533 addi r11,r11,32
1534 lvx v28,r10,$sp
1535 addi r10,r10,32
1536 lvx v29,r11,$sp
1537 addi r11,r11,32
1538 lvx v30,r10,$sp
1539 lvx v31,r11,$sp
1540 addi $sp,$sp,$FRAME
1541 blr
1542 .long 0
1543 .byte 0,12,0x04,1,0x80,0,3,0
1544 .long 0
1545.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1546___
1547}
1548
1549my $consts=1;
1550foreach (split("\n",$code)) {
1551 s/\`([^\`]*)\`/eval $1/geo;
1552
1553 # constants table endian-specific conversion
1554 if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1555 my $conv=$2;
1556 my @bytes=();
1557
1558 # convert to endian-agnostic format
1559 foreach (split(/,\s+/,$1)) {
1560 my $l = /^0/?oct:int;
1561 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1562 }
1563
1564 # little-endian conversion
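# (?inv entries are 4-bit shuffle indices, so each byte is
# complemented, i^0xf == 15-i, to match the reversed lane numbering
# on little-endian; ?rev entries are whole 128-bit constants and are
# byte-reversed; ?asis entries are left untouched.)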
1565 if ($flavour =~ /le$/o) {
1566 SWITCH: for($conv) {
1567 /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
1568 /\?rev/ && do { @bytes=reverse(@bytes); last; };
1569 }
1570 }
1571
1572 #emit
1573 print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1574 next;
1575 }
1576 $consts=0 if (m/Lconsts:/o); # end of table
1577
1578 # instructions prefixed with '?' are endian-specific and need
1579 # to be adjusted accordingly...
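# For example, on little-endian "?lvsl" is emitted as lvsr (and vice
# versa), "?vperm vD,vA,vB,vC" gets its two leading source operands
# swapped to "vperm vD,vB,vA,vC", and "?vsldoi ...,N" becomes a
# vsldoi by 16-N bytes.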
1580 if ($flavour =~ /le$/o) { # little-endian
1581 s/\?lvsr/lvsl/o or
1582 s/\?lvsl/lvsr/o or
1583 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1584 s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1585 s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1586 } else { # big-endian
1587 s/\?([a-z]+)/$1/o;
1588 }
1589
1590 print $_,"\n";
1591}
1592
1593close STDOUT;