1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
12 ## version 0.1
13 ##
14 ## By Mike Hamburg (Stanford University), 2009
15 ## Public domain.
16 ##
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
19
20 # CBC encrypt/decrypt performance in cycles per byte processed with
21 # 128-bit key.
22 #
23 #                            aes-ppc.pl              this
24 # PPC74x0/G4e                35.5/52.1/(23.8)        11.9(*)/15.4
25 # PPC970/G5                  37.9/55.0/(28.5)        22.2/28.5
26 # POWER6                     42.7/54.3/(28.2)        63.0/92.8(**)
27 # POWER7                     32.3/42.9/(18.4)        18.5/23.3
28 #
29 # (*)  This is ~10% worse than reported in the paper. The reason is
30 #      twofold. First, this module doesn't make any assumptions about
31 #      key schedule (or data, for that matter) alignment and handles
32 #      it in-line. Second, being transliterated from vpaes-x86_64.pl,
33 #      it relies on "nested inversion", which is better suited to
34 #      Intel CPUs.
35 # (**) Inadequate POWER6 performance is due to astronomical AltiVec
36 #      latency: 9 cycles per simple logical operation.
37
38 # $output is the last argument if it looks like a file (it has an extension)
39 # $flavour is the first argument if it doesn't look like a file
40 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
41 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
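# e.g. "perl vpaes-ppc.pl linux64le vpaes-ppc.s" (flavour first, output
# file last, per the comments above; other ppc-xlate flavours work too).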
42
43 if ($flavour =~ /64/) {
44 $SIZE_T =8;
45 $LRSAVE =2*$SIZE_T;
46 $STU ="stdu";
47 $POP ="ld";
48 $PUSH ="std";
49 $UCMP ="cmpld";
50 } elsif ($flavour =~ /32/) {
51 $SIZE_T =4;
52 $LRSAVE =$SIZE_T;
53 $STU ="stwu";
54 $POP ="lwz";
55 $PUSH ="stw";
56 $UCMP ="cmplw";
57 } else { die "nonsense $flavour"; }
58
59 $sp="r1";
60 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
61
62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
64 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
65 die "can't locate ppc-xlate.pl";
66
67 open STDOUT,"| $^X $xlate $flavour \"$output\""
68 || die "can't call $xlate: $!";
69
70 $code.=<<___;
71 .machine "any"
72
73 .text
74
75 .align 7 # totally strategic alignment
76 _vpaes_consts:
77 Lk_mc_forward: # mc_forward
78 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
79 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
80 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
81 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
82 Lk_mc_backward: # mc_backward
83 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
84 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
85 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
86 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
87 Lk_sr: # sr
88 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
89 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
90 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
91 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
92
93 ##
94 ## "Hot" constants
95 ##
96 Lk_inv: # inv, inva
97 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
98 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
99 Lk_ipt: # input transform (lo, hi)
100 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
101 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
102 Lk_sbo: # sbou, sbot
103 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
104 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
105 Lk_sb1: # sb1u, sb1t
106 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
107 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
108 Lk_sb2: # sb2u, sb2t
109 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
110 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
111
112 ##
113 ## Decryption stuff
114 ##
115 Lk_dipt: # decryption input transform
116 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
117 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
118 Lk_dsbo: # decryption sbox final output
119 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
120 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
121 Lk_dsb9: # decryption sbox output *9*u, *9*t
122 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
123 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
124 Lk_dsbd: # decryption sbox output *D*u, *D*t
125 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
126 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
127 Lk_dsbb: # decryption sbox output *B*u, *B*t
128 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
129 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
130 Lk_dsbe: # decryption sbox output *E*u, *E*t
131 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
132 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
133
134 ##
135 ## Key schedule constants
136 ##
137 Lk_dksd: # decryption key schedule: invskew x*D
138 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
139 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
140 Lk_dksb: # decryption key schedule: invskew x*B
141 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
142 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
143 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
144 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
145 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
146 Lk_dks9: # decryption key schedule: invskew x*9
147 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
148 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
149
150 Lk_rcon: # rcon
151 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
152 Lk_s63:
153 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
154
155 Lk_opt: # output transform
156 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
157 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
158 Lk_deskew: # deskew tables: inverts the sbox's "skew"
159 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
160 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
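##
## Lconsts
##
## Returns the address of _vpaes_consts in r12 without relocations:
## "bcl 20,31,\$+4" is a branch-and-link to the very next instruction,
## so LR picks up the address of the mflr below, which lies 0x308 bytes
## past _vpaes_consts (0x300 bytes of tables plus the two preceding
## instructions); the addi then biases r12 back to the table base.
##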
161 .align 5
162 Lconsts:
163 mflr r0
164 bcl 20,31,\$+4
165 	 mflr	r12			# "." (address of this instruction)
166 	 addi	r12,r12,-0x308		# distance between "." and _vpaes_consts
167 mtlr r0
168 blr
169 .long 0
170 .byte 0,12,0x14,0,0,0,0,0
171 .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
172 .align 6
173 ___
174 \f
175 my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
176 {
177 my ($inp,$out,$key) = map("r$_",(3..5));
178
179 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
180 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
181 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
182
183 $code.=<<___;
184 ##
185 ## _aes_preheat
186 ##
187 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
188 ## and %xmm9-%xmm15 as specified below.
189 ##
190 .align 4
191 _vpaes_encrypt_preheat:
192 mflr r8
193 bl Lconsts
194 mtlr r8
195 li r11, 0xc0 # Lk_inv
196 li r10, 0xd0
197 li r9, 0xe0 # Lk_ipt
198 li r8, 0xf0
199 vxor v7, v7, v7 # 0x00..00
200 vspltisb v8,4 # 0x04..04
201 vspltisb v9,0x0f # 0x0f..0f
202 lvx $invlo, r12, r11
203 li r11, 0x100
204 lvx $invhi, r12, r10
205 li r10, 0x110
206 lvx $iptlo, r12, r9
207 li r9, 0x120
208 lvx $ipthi, r12, r8
209 li r8, 0x130
210 lvx $sbou, r12, r11
211 li r11, 0x140
212 lvx $sbot, r12, r10
213 li r10, 0x150
214 lvx $sb1u, r12, r9
215 lvx $sb1t, r12, r8
216 lvx $sb2u, r12, r11
217 lvx $sb2t, r12, r10
218 blr
219 .long 0
220 .byte 0,12,0x14,0,0,0,0,0
221
222 ##
223 ## _aes_encrypt_core
224 ##
225 ## AES-encrypt %xmm0.
226 ##
227 ## Inputs:
228 ## %xmm0 = input
229 ## %xmm9-%xmm15 as in _vpaes_preheat
230 ## (%rdx) = scheduled keys
231 ##
232 ## Output in %xmm0
233 ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
234 ##
235 ##
236 .align 5
237 _vpaes_encrypt_core:
238 lwz r8, 240($key) # pull rounds
239 li r9, 16
240 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
241 li r11, 0x10
242 lvx v6, r9, $key
243 addi r9, r9, 16
244 ?vperm v5, v5, v6, $keyperm # align round key
245 addi r10, r11, 0x40
246 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
247 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
248 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
249 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
250 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
251 mtctr r8
252 b Lenc_entry
253
254 .align 4
255 Lenc_loop:
256 # middle of middle round
257 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
258 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
259 addi r11, r11, 16
260 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
261 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
262 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
263 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
264 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
265 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
266 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
267 addi r10, r11, 0x40
268 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
269 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
270 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
271 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
272 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
273 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
274 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
275
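	# Note: the round structure below is the vector-permute S-box from
	# the paper referenced in the header.  Each byte is split into 4-bit
	# halves (vsrb by 4, vand with the 0x0f mask in v9), the inversion is
	# assembled from the Lk_inv tables via vperm lookups, and the
	# output/MixColumns tables sb1/sb2 are applied in Lenc_loop above.
	# All table accesses are register-to-register vperm operations, which
	# is what keeps the implementation constant-time.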
276 Lenc_entry:
277 # top of round
278 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
279 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
280 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
281 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
282 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
283 vand v0, v0, v9
284 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
285 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
286 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
287 vmr v5, v6
288 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
289 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
290 addi r9, r9, 16
291 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
292 ?vperm v5, v5, v6, $keyperm # align round key
293 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
294 bdnz Lenc_loop
295
296 # middle of last round
297 addi r10, r11, 0x80
298 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
299 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
300 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
301 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
302 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
303 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
304 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
305 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
306 blr
307 .long 0
308 .byte 0,12,0x14,0,0,0,0,0
309
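##
## .vpaes_encrypt
##
## C-level signature, for reference (as used elsewhere in OpenSSL):
##	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
##			   const AES_KEY *key);
## i.e. in arrives in r3 (\$inp), out in r4 (\$out), key in r5 (\$key).
##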
310 .globl .vpaes_encrypt
311 .align 5
312 .vpaes_encrypt:
313 $STU $sp,-$FRAME($sp)
314 li r10,`15+6*$SIZE_T`
315 li r11,`31+6*$SIZE_T`
316 mflr r6
317 mfspr r7, 256 # save vrsave
318 stvx v20,r10,$sp
319 addi r10,r10,32
320 stvx v21,r11,$sp
321 addi r11,r11,32
322 stvx v22,r10,$sp
323 addi r10,r10,32
324 stvx v23,r11,$sp
325 addi r11,r11,32
326 stvx v24,r10,$sp
327 addi r10,r10,32
328 stvx v25,r11,$sp
329 addi r11,r11,32
330 stvx v26,r10,$sp
331 addi r10,r10,32
332 stvx v27,r11,$sp
333 addi r11,r11,32
334 stvx v28,r10,$sp
335 addi r10,r10,32
336 stvx v29,r11,$sp
337 addi r11,r11,32
338 stvx v30,r10,$sp
339 stvx v31,r11,$sp
340 stw r7,`$FRAME-4`($sp) # save vrsave
341 li r0, -1
342 $PUSH r6,`$FRAME+$LRSAVE`($sp)
343 mtspr 256, r0 # preserve all AltiVec registers
344
345 bl _vpaes_encrypt_preheat
346
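	# Unaligned input handling: load the two quadwords that straddle the
	# block and merge them with vperm using the permute vector from lvsl.
	# Advancing \$inp by 15 rather than 16 makes the second lvx land in
	# the same quadword when the input is already aligned, so the code
	# never touches a quadword beyond the one holding the last input byte.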
347 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
348 lvx v0, 0, $inp
349 addi $inp, $inp, 15 # 15 is not a typo
350 ?lvsr $outperm, 0, $out
351 ?lvsl $keyperm, 0, $key # prepare for unaligned access
352 lvx $inptail, 0, $inp # redundant in aligned case
353 ?vperm v0, v0, $inptail, $inpperm
354
355 bl _vpaes_encrypt_core
356
357 andi. r8, $out, 15
358 li r9, 16
359 beq Lenc_out_aligned
360
361 vperm v0, v0, v0, $outperm # rotate right/left
362 mtctr r9
363 Lenc_out_unaligned:
364 stvebx v0, 0, $out
365 addi $out, $out, 1
366 bdnz Lenc_out_unaligned
367 b Lenc_done
368
369 .align 4
370 Lenc_out_aligned:
371 stvx v0, 0, $out
372 Lenc_done:
373
374 li r10,`15+6*$SIZE_T`
375 li r11,`31+6*$SIZE_T`
376 mtlr r6
377 mtspr 256, r7 # restore vrsave
378 lvx v20,r10,$sp
379 addi r10,r10,32
380 lvx v21,r11,$sp
381 addi r11,r11,32
382 lvx v22,r10,$sp
383 addi r10,r10,32
384 lvx v23,r11,$sp
385 addi r11,r11,32
386 lvx v24,r10,$sp
387 addi r10,r10,32
388 lvx v25,r11,$sp
389 addi r11,r11,32
390 lvx v26,r10,$sp
391 addi r10,r10,32
392 lvx v27,r11,$sp
393 addi r11,r11,32
394 lvx v28,r10,$sp
395 addi r10,r10,32
396 lvx v29,r11,$sp
397 addi r11,r11,32
398 lvx v30,r10,$sp
399 lvx v31,r11,$sp
400 addi $sp,$sp,$FRAME
401 blr
402 .long 0
403 .byte 0,12,0x04,1,0x80,0,3,0
404 .long 0
405 .size .vpaes_encrypt,.-.vpaes_encrypt
406
407 .align 4
408 _vpaes_decrypt_preheat:
409 mflr r8
410 bl Lconsts
411 mtlr r8
412 li r11, 0xc0 # Lk_inv
413 li r10, 0xd0
414 li r9, 0x160 # Ldipt
415 li r8, 0x170
416 vxor v7, v7, v7 # 0x00..00
417 vspltisb v8,4 # 0x04..04
418 vspltisb v9,0x0f # 0x0f..0f
419 lvx $invlo, r12, r11
420 li r11, 0x180
421 lvx $invhi, r12, r10
422 li r10, 0x190
423 lvx $iptlo, r12, r9
424 li r9, 0x1a0
425 lvx $ipthi, r12, r8
426 li r8, 0x1b0
427 lvx $sbou, r12, r11
428 li r11, 0x1c0
429 lvx $sbot, r12, r10
430 li r10, 0x1d0
431 lvx $sb9u, r12, r9
432 li r9, 0x1e0
433 lvx $sb9t, r12, r8
434 li r8, 0x1f0
435 lvx $sbdu, r12, r11
436 li r11, 0x200
437 lvx $sbdt, r12, r10
438 li r10, 0x210
439 lvx $sbbu, r12, r9
440 lvx $sbbt, r12, r8
441 lvx $sbeu, r12, r11
442 lvx $sbet, r12, r10
443 blr
444 .long 0
445 .byte 0,12,0x14,0,0,0,0,0
446
447 ##
448 ## Decryption core
449 ##
450 ## Same API as encryption core.
451 ##
452 .align 4
453 _vpaes_decrypt_core:
454 lwz r8, 240($key) # pull rounds
455 li r9, 16
456 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
457 li r11, 0x30
458 lvx v6, r9, $key
459 addi r9, r9, 16
460 ?vperm v5, v5, v6, $keyperm # align round key
461 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
462 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
463 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
464 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
465 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
466 mtctr r8
467 b Ldec_entry
468
469 .align 4
470 Ldec_loop:
471 #
472 # Inverse mix columns
473 #
474 lvx v0, r12, r11 # v5 and v0 are flipped
475 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
476 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
477 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
478 subi r11, r11, 16
479 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
480 andi. r11, r11, 0x30
481 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
482 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
483 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
484 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
485
486 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
487 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
488 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
489 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
490 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
491 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
492 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
493
494 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
495 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
496 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
497 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
498 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
499 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
500 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
501
502 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
503 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
504 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
505 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
506 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
507
508 Ldec_entry:
509 # top of round
510 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
511 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
512 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
513 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
514 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
515 vand v0, v0, v9
516 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
517 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
518 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
519 vmr v5, v6
520 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
521 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
522 addi r9, r9, 16
523 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
524 ?vperm v5, v5, v6, $keyperm # align round key
525 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
526 bdnz Ldec_loop
527
528 # middle of last round
529 addi r10, r11, 0x80
530 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
531 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
532 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
533 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
534 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
535 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
536 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
537 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
538 blr
539 .long 0
540 .byte 0,12,0x14,0,0,0,0,0
541
542 .globl .vpaes_decrypt
543 .align 5
544 .vpaes_decrypt:
545 $STU $sp,-$FRAME($sp)
546 li r10,`15+6*$SIZE_T`
547 li r11,`31+6*$SIZE_T`
548 mflr r6
549 mfspr r7, 256 # save vrsave
550 stvx v20,r10,$sp
551 addi r10,r10,32
552 stvx v21,r11,$sp
553 addi r11,r11,32
554 stvx v22,r10,$sp
555 addi r10,r10,32
556 stvx v23,r11,$sp
557 addi r11,r11,32
558 stvx v24,r10,$sp
559 addi r10,r10,32
560 stvx v25,r11,$sp
561 addi r11,r11,32
562 stvx v26,r10,$sp
563 addi r10,r10,32
564 stvx v27,r11,$sp
565 addi r11,r11,32
566 stvx v28,r10,$sp
567 addi r10,r10,32
568 stvx v29,r11,$sp
569 addi r11,r11,32
570 stvx v30,r10,$sp
571 stvx v31,r11,$sp
572 stw r7,`$FRAME-4`($sp) # save vrsave
573 li r0, -1
574 $PUSH r6,`$FRAME+$LRSAVE`($sp)
575 mtspr 256, r0 # preserve all AltiVec registers
576
577 bl _vpaes_decrypt_preheat
578
579 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
580 lvx v0, 0, $inp
581 addi $inp, $inp, 15 # 15 is not a typo
582 ?lvsr $outperm, 0, $out
583 ?lvsl $keyperm, 0, $key
584 lvx $inptail, 0, $inp # redundant in aligned case
585 ?vperm v0, v0, $inptail, $inpperm
586
587 bl _vpaes_decrypt_core
588
589 andi. r8, $out, 15
590 li r9, 16
591 beq Ldec_out_aligned
592
593 vperm v0, v0, v0, $outperm # rotate right/left
594 mtctr r9
595 Ldec_out_unaligned:
596 stvebx v0, 0, $out
597 addi $out, $out, 1
598 bdnz Ldec_out_unaligned
599 b Ldec_done
600
601 .align 4
602 Ldec_out_aligned:
603 stvx v0, 0, $out
604 Ldec_done:
605
606 li r10,`15+6*$SIZE_T`
607 li r11,`31+6*$SIZE_T`
608 mtlr r6
609 mtspr 256, r7 # restore vrsave
610 lvx v20,r10,$sp
611 addi r10,r10,32
612 lvx v21,r11,$sp
613 addi r11,r11,32
614 lvx v22,r10,$sp
615 addi r10,r10,32
616 lvx v23,r11,$sp
617 addi r11,r11,32
618 lvx v24,r10,$sp
619 addi r10,r10,32
620 lvx v25,r11,$sp
621 addi r11,r11,32
622 lvx v26,r10,$sp
623 addi r10,r10,32
624 lvx v27,r11,$sp
625 addi r11,r11,32
626 lvx v28,r10,$sp
627 addi r10,r10,32
628 lvx v29,r11,$sp
629 addi r11,r11,32
630 lvx v30,r10,$sp
631 lvx v31,r11,$sp
632 addi $sp,$sp,$FRAME
633 blr
634 .long 0
635 .byte 0,12,0x04,1,0x80,0,3,0
636 .long 0
637 .size .vpaes_decrypt,.-.vpaes_decrypt
638
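##
## .vpaes_cbc_encrypt
##
## C-level signature, for reference (as used elsewhere in OpenSSL):
##	void vpaes_cbc_encrypt(const unsigned char *inp, unsigned char *out,
##			       size_t length, const AES_KEY *key,
##			       unsigned char *ivp, int enc);
## i.e. r3=inp, r4=out, r5=length, r6=key, r7=ivp, r8=enc (0 = decrypt),
## matching how r5..r8 are shuffled below.  Lengths below 16 return
## immediately and only whole 16-byte blocks are processed.
##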
639 .globl .vpaes_cbc_encrypt
640 .align 5
641 .vpaes_cbc_encrypt:
642 ${UCMP}i r5,16
643 bltlr-
644
645 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
646 mflr r0
647 li r10,`15+6*$SIZE_T`
648 li r11,`31+6*$SIZE_T`
649 mfspr r12, 256
650 stvx v20,r10,$sp
651 addi r10,r10,32
652 stvx v21,r11,$sp
653 addi r11,r11,32
654 stvx v22,r10,$sp
655 addi r10,r10,32
656 stvx v23,r11,$sp
657 addi r11,r11,32
658 stvx v24,r10,$sp
659 addi r10,r10,32
660 stvx v25,r11,$sp
661 addi r11,r11,32
662 stvx v26,r10,$sp
663 addi r10,r10,32
664 stvx v27,r11,$sp
665 addi r11,r11,32
666 stvx v28,r10,$sp
667 addi r10,r10,32
668 stvx v29,r11,$sp
669 addi r11,r11,32
670 stvx v30,r10,$sp
671 stvx v31,r11,$sp
672 stw r12,`$FRAME-4`($sp) # save vrsave
673 $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
674 $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
675 li r9, -16
676 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
677
678 and r30, r5, r9 # copy length&-16
679 andi. r9, $out, 15 # is $out aligned?
680 mr r5, r6 # copy pointer to key
681 mr r31, r7 # copy pointer to iv
682 li r6, -1
683 mcrf cr1, cr0 # put aside $out alignment flag
684 mr r7, r12 # copy vrsave
685 mtspr 256, r6 # preserve all AltiVec registers
686
687 lvx v24, 0, r31 # load [potentially unaligned] iv
688 li r9, 15
689 ?lvsl $inpperm, 0, r31
690 lvx v25, r9, r31
691 ?vperm v24, v24, v25, $inpperm
692
693 cmpwi r8, 0 # test direction
694 neg r8, $inp # prepare for unaligned access
695 vxor v7, v7, v7
696 ?lvsl $keyperm, 0, $key
697 ?lvsr $outperm, 0, $out
698 ?lvsr $inpperm, 0, r8 # -$inp
699 vnor $outmask, v7, v7 # 0xff..ff
700 lvx $inptail, 0, $inp
701 ?vperm $outmask, v7, $outmask, $outperm
702 addi $inp, $inp, 15 # 15 is not a typo
703
704 beq Lcbc_decrypt
705
706 bl _vpaes_encrypt_preheat
707 li r0, 16
708
709 beq cr1, Lcbc_enc_loop # $out is aligned
710
711 vmr v0, $inptail
712 lvx $inptail, 0, $inp
713 addi $inp, $inp, 16
714 ?vperm v0, v0, $inptail, $inpperm
715 vxor v0, v0, v24 # ^= iv
716
717 bl _vpaes_encrypt_core
718
719 andi. r8, $out, 15
720 vmr v24, v0 # put aside iv
721 sub r9, $out, r8
722 vperm $outhead, v0, v0, $outperm # rotate right/left
723
724 Lcbc_enc_head:
725 stvebx $outhead, r8, r9
726 cmpwi r8, 15
727 addi r8, r8, 1
728 bne Lcbc_enc_head
729
730 sub. r30, r30, r0 # len -= 16
731 addi $out, $out, 16
732 beq Lcbc_unaligned_done
733
734 Lcbc_enc_loop:
735 vmr v0, $inptail
736 lvx $inptail, 0, $inp
737 addi $inp, $inp, 16
738 ?vperm v0, v0, $inptail, $inpperm
739 vxor v0, v0, v24 # ^= iv
740
741 bl _vpaes_encrypt_core
742
743 vmr v24, v0 # put aside iv
744 sub. r30, r30, r0 # len -= 16
745 vperm v0, v0, v0, $outperm # rotate right/left
746 vsel v1, $outhead, v0, $outmask
747 vmr $outhead, v0
748 stvx v1, 0, $out
749 addi $out, $out, 16
750 bne Lcbc_enc_loop
751
752 b Lcbc_done
753
754 .align 5
755 Lcbc_decrypt:
756 bl _vpaes_decrypt_preheat
757 li r0, 16
758
759 beq cr1, Lcbc_dec_loop # $out is aligned
760
761 vmr v0, $inptail
762 lvx $inptail, 0, $inp
763 addi $inp, $inp, 16
764 ?vperm v0, v0, $inptail, $inpperm
765 vmr v25, v0 # put aside input
766
767 bl _vpaes_decrypt_core
768
769 andi. r8, $out, 15
770 vxor v0, v0, v24 # ^= iv
771 vmr v24, v25
772 sub r9, $out, r8
773 vperm $outhead, v0, v0, $outperm # rotate right/left
774
775 Lcbc_dec_head:
776 stvebx $outhead, r8, r9
777 cmpwi r8, 15
778 addi r8, r8, 1
779 bne Lcbc_dec_head
780
781 sub. r30, r30, r0 # len -= 16
782 addi $out, $out, 16
783 beq Lcbc_unaligned_done
784
785 Lcbc_dec_loop:
786 vmr v0, $inptail
787 lvx $inptail, 0, $inp
788 addi $inp, $inp, 16
789 ?vperm v0, v0, $inptail, $inpperm
790 vmr v25, v0 # put aside input
791
792 bl _vpaes_decrypt_core
793
794 vxor v0, v0, v24 # ^= iv
795 vmr v24, v25
796 sub. r30, r30, r0 # len -= 16
797 vperm v0, v0, v0, $outperm # rotate right/left
798 vsel v1, $outhead, v0, $outmask
799 vmr $outhead, v0
800 stvx v1, 0, $out
801 addi $out, $out, 16
802 bne Lcbc_dec_loop
803
804 Lcbc_done:
805 beq cr1, Lcbc_write_iv # $out is aligned
806
807 Lcbc_unaligned_done:
808 andi. r8, $out, 15
809 sub $out, $out, r8
810 li r9, 0
811 Lcbc_tail:
812 stvebx $outhead, r9, $out
813 addi r9, r9, 1
814 cmpw r9, r8
815 bne Lcbc_tail
816
817 Lcbc_write_iv:
818 neg r8, r31 # write [potentially unaligned] iv
819 li r10, 4
820 ?lvsl $outperm, 0, r8
821 li r11, 8
822 li r12, 12
823 vperm v24, v24, v24, $outperm # rotate right/left
824 stvewx v24, 0, r31 # ivp is at least 32-bit aligned
825 stvewx v24, r10, r31
826 stvewx v24, r11, r31
827 stvewx v24, r12, r31
828
829 mtspr 256, r7 # restore vrsave
830 li r10,`15+6*$SIZE_T`
831 li r11,`31+6*$SIZE_T`
832 lvx v20,r10,$sp
833 addi r10,r10,32
834 lvx v21,r11,$sp
835 addi r11,r11,32
836 lvx v22,r10,$sp
837 addi r10,r10,32
838 lvx v23,r11,$sp
839 addi r11,r11,32
840 lvx v24,r10,$sp
841 addi r10,r10,32
842 lvx v25,r11,$sp
843 addi r11,r11,32
844 lvx v26,r10,$sp
845 addi r10,r10,32
846 lvx v27,r11,$sp
847 addi r11,r11,32
848 lvx v28,r10,$sp
849 addi r10,r10,32
850 lvx v29,r11,$sp
851 addi r11,r11,32
852 lvx v30,r10,$sp
853 lvx v31,r11,$sp
854 Lcbc_abort:
855 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
856 $POP r30,`$FRAME+$SIZE_T*0`($sp)
857 $POP r31,`$FRAME+$SIZE_T*1`($sp)
858 mtlr r0
859 addi $sp,$sp,`$FRAME+$SIZE_T*2`
860 blr
861 .long 0
862 .byte 0,12,0x04,1,0x80,2,6,0
863 .long 0
864 .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
865 ___
866 }\f
867 {
868 my ($inp,$bits,$out)=map("r$_",(3..5));
869 my $dir="cr1";
870 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
871
872 $code.=<<___;
873 ########################################################
874 ## ##
875 ## AES key schedule ##
876 ## ##
877 ########################################################
878 .align 4
879 _vpaes_key_preheat:
880 mflr r8
881 bl Lconsts
882 mtlr r8
883 li r11, 0xc0 # Lk_inv
884 li r10, 0xd0
885 li r9, 0xe0 # L_ipt
886 li r8, 0xf0
887
888 vspltisb v8,4 # 0x04..04
889 vxor v9,v9,v9 # 0x00..00
890 lvx $invlo, r12, r11 # Lk_inv
891 li r11, 0x120
892 lvx $invhi, r12, r10
893 li r10, 0x130
894 lvx $iptlo, r12, r9 # Lk_ipt
895 li r9, 0x220
896 lvx $ipthi, r12, r8
897 li r8, 0x230
898
899 lvx v14, r12, r11 # Lk_sb1
900 li r11, 0x240
901 lvx v15, r12, r10
902 li r10, 0x250
903
904 lvx v16, r12, r9 # Lk_dksd
905 li r9, 0x260
906 lvx v17, r12, r8
907 li r8, 0x270
908 lvx v18, r12, r11 # Lk_dksb
909 li r11, 0x280
910 lvx v19, r12, r10
911 li r10, 0x290
912 lvx v20, r12, r9 # Lk_dkse
913 li r9, 0x2a0
914 lvx v21, r12, r8
915 li r8, 0x2b0
916 lvx v22, r12, r11 # Lk_dks9
917 lvx v23, r12, r10
918
919 lvx v24, r12, r9 # Lk_rcon
920 lvx v25, 0, r12 # Lk_mc_forward[0]
921 lvx v26, r12, r8 # Lks63
922 blr
923 .long 0
924 .byte 0,12,0x14,0,0,0,0,0
925
926 .align 4
927 _vpaes_schedule_core:
928 mflr r7
929
930 bl _vpaes_key_preheat # load the tables
931
932 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
933 neg r8, $inp # prepare for unaligned access
934 lvx v0, 0, $inp
935 addi $inp, $inp, 15 # 15 is not a typo
936 ?lvsr $inpperm, 0, r8 # -$inp
937 lvx v6, 0, $inp # v6 serves as inptail
938 addi $inp, $inp, 8
939 ?vperm v0, v0, v6, $inpperm
940
941 # input transform
942 vmr v3, v0 # vmovdqa %xmm0, %xmm3
943 bl _vpaes_schedule_transform
944 vmr v7, v0 # vmovdqa %xmm0, %xmm7
945
946 bne $dir, Lschedule_am_decrypting
947
948 # encrypting, output zeroth round key after transform
949 li r8, 0x30 # mov \$0x30,%r8d
950 li r9, 4
951 li r10, 8
952 li r11, 12
953
954 ?lvsr $outperm, 0, $out # prepare for unaligned access
955 vnor $outmask, v9, v9 # 0xff..ff
956 ?vperm $outmask, v9, $outmask, $outperm
957
958 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
959 vperm $outhead, v0, v0, $outperm # rotate right/left
960 stvewx $outhead, 0, $out # some are superfluous
961 stvewx $outhead, r9, $out
962 stvewx $outhead, r10, $out
963 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
964 stvewx $outhead, r11, $out
965 b Lschedule_go
966
967 Lschedule_am_decrypting:
968 srwi r8, $bits, 1 # shr \$1,%r8d
969 andi. r8, r8, 32 # and \$32,%r8d
970 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
971 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
972 # decrypting, output zeroth round key after shiftrows
973 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
974 li r9, 4
975 li r10, 8
976 li r11, 12
977 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
978
979 neg r0, $out # prepare for unaligned access
980 ?lvsl $outperm, 0, r0
981 vnor $outmask, v9, v9 # 0xff..ff
982 ?vperm $outmask, $outmask, v9, $outperm
983
984 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
985 vperm $outhead, v4, v4, $outperm # rotate right/left
986 stvewx $outhead, 0, $out # some are superfluous
987 stvewx $outhead, r9, $out
988 stvewx $outhead, r10, $out
989 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
990 stvewx $outhead, r11, $out
991 addi $out, $out, 15 # 15 is not a typo
992 xori r8, r8, 0x30 # xor \$0x30, %r8
993
994 Lschedule_go:
995 cmplwi $bits, 192 # cmp \$192, %esi
996 bgt Lschedule_256
997 beq Lschedule_192
998 # 128: fall through
999
1000 ##
1001 ## .schedule_128
1002 ##
1003 ## 128-bit specific part of key schedule.
1004 ##
1005 ## This schedule is really simple, because all its parts
1006 ## are accomplished by the subroutines.
1007 ##
1008 Lschedule_128:
1009 li r0, 10 # mov \$10, %esi
1010 mtctr r0
1011
1012 Loop_schedule_128:
1013 bl _vpaes_schedule_round
1014 bdz Lschedule_mangle_last # dec %esi
1015 bl _vpaes_schedule_mangle # write output
1016 b Loop_schedule_128
1017
1018 ##
1019 ## .aes_schedule_192
1020 ##
1021 ## 192-bit specific part of key schedule.
1022 ##
1023 ## The main body of this schedule is the same as the 128-bit
1024 ## schedule, but with more smearing. The long, high side is
1025 ## stored in %xmm7 as before, and the short, low side is in
1026 ## the high bits of %xmm6.
1027 ##
1028 ## This schedule is somewhat nastier, however, because each
1029 ## round produces 192 bits of key material, or 1.5 round keys.
1030 ## Therefore, on each cycle we do 2 rounds and produce 3 round
1031 ## keys.
1032 ##
1033 .align 4
1034 Lschedule_192:
1035 li r0, 4 # mov \$4, %esi
1036 lvx v0, 0, $inp
1037 ?vperm v0, v6, v0, $inpperm
1038 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
1039 bl _vpaes_schedule_transform # input transform
1040 ?vsldoi v6, v0, v9, 8
1041 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
1042 mtctr r0
1043
1044 Loop_schedule_192:
1045 bl _vpaes_schedule_round
1046 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
1047 bl _vpaes_schedule_mangle # save key n
1048 bl _vpaes_schedule_192_smear
1049 bl _vpaes_schedule_mangle # save key n+1
1050 bl _vpaes_schedule_round
1051 bdz Lschedule_mangle_last # dec %esi
1052 bl _vpaes_schedule_mangle # save key n+2
1053 bl _vpaes_schedule_192_smear
1054 b Loop_schedule_192
1055
1056 ##
1057 ## .aes_schedule_256
1058 ##
1059 ## 256-bit specific part of key schedule.
1060 ##
1061 ## The structure here is very similar to the 128-bit
1062 ## schedule, but with an additional "low side" in
1063 ## %xmm6. The low side's rounds are the same as the
1064 ## high side's, except no rcon and no rotation.
1065 ##
1066 .align 4
1067 Lschedule_256:
1068 li r0, 7 # mov \$7, %esi
1069 addi $inp, $inp, 8
1070 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
1071 ?vperm v0, v6, v0, $inpperm
1072 bl _vpaes_schedule_transform # input transform
1073 mtctr r0
1074
1075 Loop_schedule_256:
1076 bl _vpaes_schedule_mangle # output low result
1077 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
1078
1079 # high round
1080 bl _vpaes_schedule_round
1081 bdz Lschedule_mangle_last # dec %esi
1082 bl _vpaes_schedule_mangle
1083
1084 # low round. swap xmm7 and xmm6
1085 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1086 vmr v5, v7 # vmovdqa %xmm7, %xmm5
1087 vmr v7, v6 # vmovdqa %xmm6, %xmm7
1088 bl _vpaes_schedule_low_round
1089 vmr v7, v5 # vmovdqa %xmm5, %xmm7
1090
1091 b Loop_schedule_256
1092 ##
1093 ## .aes_schedule_mangle_last
1094 ##
1095 ## Mangler for last round of key schedule
1096 ## Mangles %xmm0
1097 ## when encrypting, outputs out(%xmm0) ^ 63
1098 ## when decrypting, outputs unskew(%xmm0)
1099 ##
1100 ## Always called right before return... jumps to cleanup and exits
1101 ##
1102 .align 4
1103 Lschedule_mangle_last:
1104 # schedule last round key from xmm0
1105 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
1106 li r9, 0x2f0
1107 bne $dir, Lschedule_mangle_last_dec
1108
1109 # encrypting
1110 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
1111 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
1112 li r9, 0x2d0 # prepare to output transform
1113 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
1114
1115 lvx $iptlo, r11, r12 # reload $ipt
1116 lvx $ipthi, r9, r12
1117 addi $out, $out, 16 # add \$16, %rdx
1118 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1119 bl _vpaes_schedule_transform # output transform
1120
1121 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1122 vperm v0, v0, v0, $outperm # rotate right/left
1123 li r10, 4
1124 vsel v2, $outhead, v0, $outmask
1125 li r11, 8
1126 stvx v2, 0, $out
1127 li r12, 12
1128 stvewx v0, 0, $out # some (or all) are redundant
1129 stvewx v0, r10, $out
1130 stvewx v0, r11, $out
1131 stvewx v0, r12, $out
1132 b Lschedule_mangle_done
1133
1134 .align 4
1135 Lschedule_mangle_last_dec:
1136 lvx $iptlo, r11, r12 # reload $ipt
1137 lvx $ipthi, r9, r12
1138 addi $out, $out, -16 # add \$-16, %rdx
1139 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1140 bl _vpaes_schedule_transform # output transform
1141
1142 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1143 addi r9, $out, -15 # -15 is not a typo
1144 vperm v0, v0, v0, $outperm # rotate right/left
1145 li r10, 4
1146 vsel v2, $outhead, v0, $outmask
1147 li r11, 8
1148 stvx v2, 0, $out
1149 li r12, 12
1150 stvewx v0, 0, r9 # some (or all) are redundant
1151 stvewx v0, r10, r9
1152 stvewx v0, r11, r9
1153 stvewx v0, r12, r9
1154
1155
1156 Lschedule_mangle_done:
1157 mtlr r7
1158 # cleanup
1159 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
1160 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
1161 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
1162 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
1163 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1164 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
1165 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
1166 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
1167
1168 blr
1169 .long 0
1170 .byte 0,12,0x14,0,0,0,0,0
1171
1172 ##
1173 ## .aes_schedule_192_smear
1174 ##
1175 ## Smear the short, low side in the 192-bit key schedule.
1176 ##
1177 ## Inputs:
1178 ## %xmm7: high side, b a x y
1179 ## %xmm6: low side, d c 0 0
1180 ## %xmm13: 0
1181 ##
1182 ## Outputs:
1183 ## %xmm6: b+c+d b+c 0 0
1184 ## %xmm0: b+c+d b+c b a
1185 ##
1186 .align 4
1187 _vpaes_schedule_192_smear:
1188 ?vspltw v0, v7, 3
1189 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
1190 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
1191 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
1192 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
1193 vmr v0, v6
1194 ?vsldoi v6, v6, v9, 8
1195 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
1196 blr
1197 .long 0
1198 .byte 0,12,0x14,0,0,0,0,0
1199
1200 ##
1201 ## .aes_schedule_round
1202 ##
1203 ## Runs one main round of the key schedule on %xmm0, %xmm7
1204 ##
1205 ## Specifically, runs subbytes on the high dword of %xmm0
1206 ## then rotates it by one byte and xors into the low dword of
1207 ## %xmm7.
1208 ##
1209 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
1210 ## next rcon.
1211 ##
1212 ## Smears the dwords of %xmm7 by xoring the low into the
1213 ## second low, result into third, result into highest.
1214 ##
1215 ## Returns results in %xmm7 = %xmm0.
1216 ## Clobbers %xmm1-%xmm4, %r11.
1217 ##
1218 .align 4
1219 _vpaes_schedule_round:
1220 # extract rcon from xmm8
1221 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1222 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
1223 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
1224 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1225
1226 # rotate
1227 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1228 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
1229
1230 # fall through...
1231
1232 # low round: same as high round, but no rotation and no rcon.
1233 _vpaes_schedule_low_round:
1234 # smear xmm7
1235 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1236 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1237 vspltisb v1, 0x0f # 0x0f..0f
1238 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1239
1240 # subbytes
1241 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1242 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1243 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1244 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1245 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1246 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1247 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1248 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1249 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1250 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1251 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1252 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1253 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1254 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1255 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1256 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1257 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1258
1259 # add in smeared stuff
1260 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
1261 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
1262 blr
1263 .long 0
1264 .byte 0,12,0x14,0,0,0,0,0
1265
1266 ##
1267 ## .aes_schedule_transform
1268 ##
1269 ## Linear-transform %xmm0 according to tables at (%r11)
1270 ##
1271 ## Requires that %xmm9 = 0x0F0F... as in preheat
1272 ## Output in %xmm0
1273 ## Clobbers %xmm2
1274 ##
1275 .align 4
1276 _vpaes_schedule_transform:
1277 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1278 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1279 # vmovdqa (%r11), %xmm2 # lo
1280 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1281 # vmovdqa 16(%r11), %xmm1 # hi
1282 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
1283 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
1284 blr
1285 .long 0
1286 .byte 0,12,0x14,0,0,0,0,0
1287
1288 ##
1289 ## .aes_schedule_mangle
1290 ##
1291 ## Mangle xmm0 from (basis-transformed) standard version
1292 ## to our version.
1293 ##
1294 ## On encrypt,
1295 ## xor with 0x63
1296 ## multiply by circulant 0,1,1,1
1297 ## apply shiftrows transform
1298 ##
1299 ## On decrypt,
1300 ## xor with 0x63
1301 ## multiply by "inverse mixcolumns" circulant E,B,D,9
1302 ## deskew
1303 ## apply shiftrows transform
1304 ##
1305 ##
1306 ## Writes out to (%rdx), and increments or decrements it
1307 ## Keeps track of round number mod 4 in %r8
1308 ## Preserves xmm0
1309 ## Clobbers xmm1-xmm5
1310 ##
1311 .align 4
1312 _vpaes_schedule_mangle:
1313 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1314 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1315 bne $dir, Lschedule_mangle_dec
1316
1317 # encrypting
1318 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1319 addi $out, $out, 16 # add \$16, %rdx
1320 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1321 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1322 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1323 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1324 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1325 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1326
1327 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1328 addi r8, r8, -16 # add \$-16, %r8
1329 andi. r8, r8, 0x30 # and \$0x30, %r8
1330
1331 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1332 vperm v1, v3, v3, $outperm # rotate right/left
1333 vsel v2, $outhead, v1, $outmask
1334 vmr $outhead, v1
1335 stvx v2, 0, $out
1336 blr
1337
1338 .align 4
1339 Lschedule_mangle_dec:
1340 # inverse mix columns
1341 # lea .Lk_dksd(%rip),%r11
1342 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1343 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1344
1345 # vmovdqa 0x00(%r11), %xmm2
1346 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1347 # vmovdqa 0x10(%r11), %xmm3
1348 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1349 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1350 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1351
1352 # vmovdqa 0x20(%r11), %xmm2
1353 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1354 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1355 # vmovdqa 0x30(%r11), %xmm3
1356 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1357 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1358 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1359
1360 # vmovdqa 0x40(%r11), %xmm2
1361 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1362 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1363 # vmovdqa 0x50(%r11), %xmm3
1364 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1365 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1366
1367 # vmovdqa 0x60(%r11), %xmm2
1368 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1369 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1370 # vmovdqa 0x70(%r11), %xmm4
1371 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1372 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1373 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1374 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1375
1376 addi $out, $out, -16 # add \$-16, %rdx
1377
1378 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1379 addi r8, r8, -16 # add \$-16, %r8
1380 andi. r8, r8, 0x30 # and \$0x30, %r8
1381
1382 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1383 vperm v1, v3, v3, $outperm # rotate right/left
1384 vsel v2, $outhead, v1, $outmask
1385 vmr $outhead, v1
1386 stvx v2, 0, $out
1387 blr
1388 .long 0
1389 .byte 0,12,0x14,0,0,0,0,0
1390
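##
## .vpaes_set_encrypt_key / .vpaes_set_decrypt_key
##
## C-level signature, for reference (as used elsewhere in OpenSSL):
##	int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##				  AES_KEY *key);
## i.e. the user key arrives in r3 (\$inp), the key size in bits in r4
## (\$bits) and the AES_KEY to fill in r5 (\$out); 0 is returned in r3.
##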
1391 .globl .vpaes_set_encrypt_key
1392 .align 5
1393 .vpaes_set_encrypt_key:
1394 $STU $sp,-$FRAME($sp)
1395 li r10,`15+6*$SIZE_T`
1396 li r11,`31+6*$SIZE_T`
1397 mflr r0
1398 mfspr r6, 256 # save vrsave
1399 stvx v20,r10,$sp
1400 addi r10,r10,32
1401 stvx v21,r11,$sp
1402 addi r11,r11,32
1403 stvx v22,r10,$sp
1404 addi r10,r10,32
1405 stvx v23,r11,$sp
1406 addi r11,r11,32
1407 stvx v24,r10,$sp
1408 addi r10,r10,32
1409 stvx v25,r11,$sp
1410 addi r11,r11,32
1411 stvx v26,r10,$sp
1412 addi r10,r10,32
1413 stvx v27,r11,$sp
1414 addi r11,r11,32
1415 stvx v28,r10,$sp
1416 addi r10,r10,32
1417 stvx v29,r11,$sp
1418 addi r11,r11,32
1419 stvx v30,r10,$sp
1420 stvx v31,r11,$sp
1421 stw r6,`$FRAME-4`($sp) # save vrsave
1422 li r7, -1
1423 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1424 mtspr 256, r7 # preserve all AltiVec registers
1425
1426 srwi r9, $bits, 5 # shr \$5,%eax
1427 addi r9, r9, 6 # add \$5,%eax
1428 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
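	# (The stored round count is nbits/32+6, i.e. 10/12/14 rounds for
	# 128/192/256-bit keys; the x86 comments on these three lines are
	# carried over verbatim from vpaes-x86_64.pl.)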
1429
1430 cmplw $dir, $bits, $bits # set encrypt direction
1431 li r8, 0x30 # mov \$0x30,%r8d
1432 bl _vpaes_schedule_core
1433
1434 $POP r0, `$FRAME+$LRSAVE`($sp)
1435 li r10,`15+6*$SIZE_T`
1436 li r11,`31+6*$SIZE_T`
1437 mtspr 256, r6 # restore vrsave
1438 mtlr r0
1439 xor r3, r3, r3
1440 lvx v20,r10,$sp
1441 addi r10,r10,32
1442 lvx v21,r11,$sp
1443 addi r11,r11,32
1444 lvx v22,r10,$sp
1445 addi r10,r10,32
1446 lvx v23,r11,$sp
1447 addi r11,r11,32
1448 lvx v24,r10,$sp
1449 addi r10,r10,32
1450 lvx v25,r11,$sp
1451 addi r11,r11,32
1452 lvx v26,r10,$sp
1453 addi r10,r10,32
1454 lvx v27,r11,$sp
1455 addi r11,r11,32
1456 lvx v28,r10,$sp
1457 addi r10,r10,32
1458 lvx v29,r11,$sp
1459 addi r11,r11,32
1460 lvx v30,r10,$sp
1461 lvx v31,r11,$sp
1462 addi $sp,$sp,$FRAME
1463 blr
1464 .long 0
1465 .byte 0,12,0x04,1,0x80,0,3,0
1466 .long 0
1467 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1468
1469 .globl .vpaes_set_decrypt_key
1470 .align 4
1471 .vpaes_set_decrypt_key:
1472 $STU $sp,-$FRAME($sp)
1473 li r10,`15+6*$SIZE_T`
1474 li r11,`31+6*$SIZE_T`
1475 mflr r0
1476 mfspr r6, 256 # save vrsave
1477 stvx v20,r10,$sp
1478 addi r10,r10,32
1479 stvx v21,r11,$sp
1480 addi r11,r11,32
1481 stvx v22,r10,$sp
1482 addi r10,r10,32
1483 stvx v23,r11,$sp
1484 addi r11,r11,32
1485 stvx v24,r10,$sp
1486 addi r10,r10,32
1487 stvx v25,r11,$sp
1488 addi r11,r11,32
1489 stvx v26,r10,$sp
1490 addi r10,r10,32
1491 stvx v27,r11,$sp
1492 addi r11,r11,32
1493 stvx v28,r10,$sp
1494 addi r10,r10,32
1495 stvx v29,r11,$sp
1496 addi r11,r11,32
1497 stvx v30,r10,$sp
1498 stvx v31,r11,$sp
1499 stw r6,`$FRAME-4`($sp) # save vrsave
1500 li r7, -1
1501 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1502 mtspr 256, r7 # preserve all AltiVec registers
1503
1504 srwi r9, $bits, 5 # shr \$5,%eax
1505 addi r9, r9, 6 # add \$5,%eax
1506 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1507
1508 slwi r9, r9, 4 # shl \$4,%eax
1509 add $out, $out, r9 # lea (%rdx,%rax),%rdx
1510
1511 cmplwi $dir, $bits, 0 # set decrypt direction
1512 srwi r8, $bits, 1 # shr \$1,%r8d
1513 andi. r8, r8, 32 # and \$32,%r8d
1514 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1515 bl _vpaes_schedule_core
1516
1517 $POP r0, `$FRAME+$LRSAVE`($sp)
1518 li r10,`15+6*$SIZE_T`
1519 li r11,`31+6*$SIZE_T`
1520 mtspr 256, r6 # restore vrsave
1521 mtlr r0
1522 xor r3, r3, r3
1523 lvx v20,r10,$sp
1524 addi r10,r10,32
1525 lvx v21,r11,$sp
1526 addi r11,r11,32
1527 lvx v22,r10,$sp
1528 addi r10,r10,32
1529 lvx v23,r11,$sp
1530 addi r11,r11,32
1531 lvx v24,r10,$sp
1532 addi r10,r10,32
1533 lvx v25,r11,$sp
1534 addi r11,r11,32
1535 lvx v26,r10,$sp
1536 addi r10,r10,32
1537 lvx v27,r11,$sp
1538 addi r11,r11,32
1539 lvx v28,r10,$sp
1540 addi r10,r10,32
1541 lvx v29,r11,$sp
1542 addi r11,r11,32
1543 lvx v30,r10,$sp
1544 lvx v31,r11,$sp
1545 addi $sp,$sp,$FRAME
1546 blr
1547 .long 0
1548 .byte 0,12,0x04,1,0x80,0,3,0
1549 .long 0
1550 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1551 ___
1552 }
1553
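# Post-process the generated code.  Constants tagged '?inv'/'?rev'/'?asis'
# above are emitted as explicit byte strings so the tables come out right
# on either endianness: on little-endian flavours '?inv' xors each permute
# index with 0xf, '?rev' byte-reverses the data and '?asis' is left alone.
# Instructions prefixed with '?' are endian-sensitive as well: on
# little-endian targets lvsl/lvsr are swapped and vperm/vsldoi/vspltw
# operands are adjusted, while on big-endian the '?' is simply stripped.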
1554 my $consts=1;
1555 foreach (split("\n",$code)) {
1556 s/\`([^\`]*)\`/eval $1/geo;
1557
1558 # constants table endian-specific conversion
1559 if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1560 my $conv=$2;
1561 my @bytes=();
1562
1563 # convert to endian-agnostic format
1564 foreach (split(/,\s+/,$1)) {
1565 my $l = /^0/?oct:int;
1566 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1567 }
1568
1569 # little-endian conversion
1570 if ($flavour =~ /le$/o) {
1571 SWITCH: for($conv) {
1572 /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
1573 /\?rev/ && do { @bytes=reverse(@bytes); last; };
1574 }
1575 }
1576
1577 #emit
1578 print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1579 next;
1580 }
1581 $consts=0 if (m/Lconsts:/o); # end of table
1582
1583 # instructions prefixed with '?' are endian-specific and need
1584 # to be adjusted accordingly...
1585 if ($flavour =~ /le$/o) { # little-endian
1586 s/\?lvsr/lvsl/o or
1587 s/\?lvsl/lvsr/o or
1588 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1589 s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1590 s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1591 } else { # big-endian
1592 s/\?([a-z]+)/$1/o;
1593 }
1594
1595 print $_,"\n";
1596 }
1597
1598 close STDOUT;