1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
12 ## version 0.1
13 ##
14 ## By Mike Hamburg (Stanford University), 2009
15 ## Public domain.
16 ##
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
19
20 # CBC encrypt/decrypt performance in cycles per byte processed with
21 # 128-bit key.
22 #
23 # aes-ppc.pl this
24 # PPC74x0/G4e 35.5/52.1/(23.8) 11.9(*)/15.4
25 # PPC970/G5 37.9/55.0/(28.5) 22.2/28.5
26 # POWER6 42.7/54.3/(28.2) 63.0/92.8(**)
27 # POWER7 32.3/42.9/(18.4) 18.5/23.3
28 #
29 # (*)  This is ~10% worse than reported in the paper. The reason is
30 #      twofold. First, this module makes no assumptions about key
31 #      schedule (or, for that matter, data) alignment and handles it
32 #      in-line. Second, being transliterated from vpaes-x86_64.pl,
33 #      it relies on "nested inversion", which is better suited to
34 #      Intel CPUs.
35 # (**) Inadequate POWER6 performance is due to astronomical AltiVec
36 #      latency: 9 cycles per simple logical operation.
37
38 $flavour = shift;
39
40 if ($flavour =~ /64/) {
41 $SIZE_T =8;
42 $LRSAVE =2*$SIZE_T;
43 $STU ="stdu";
44 $POP ="ld";
45 $PUSH ="std";
46 $UCMP ="cmpld";
47 } elsif ($flavour =~ /32/) {
48 $SIZE_T =4;
49 $LRSAVE =$SIZE_T;
50 $STU ="stwu";
51 $POP ="lwz";
52 $PUSH ="stw";
53 $UCMP ="cmplw";
54 } else { die "nonsense $flavour"; }
55
56 $sp="r1";
57 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
58
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62 die "can't locate ppc-xlate.pl";
63
64 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
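# The generated code is streamed through the ppc-xlate.pl translator located
# above.  For reference, a typical invocation would look like the line below;
# the flavour and output file name are examples only, not anything this
# script mandates:
#
#	perl vpaes-ppc.pl linux64le vpaes-ppc.s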
65
66 $code.=<<___;
67 .machine "any"
68
69 .text
70
71 .align 7 # totally strategic alignment
72 _vpaes_consts:
73 Lk_mc_forward: # mc_forward
74 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
75 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
76 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
77 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
78 Lk_mc_backward: # mc_backward
79 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
80 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
81 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
82 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
83 Lk_sr: # sr
84 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
85 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
86 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
87 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
88
89 ##
90 ## "Hot" constants
91 ##
92 Lk_inv: # inv, inva
93 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
94 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
95 Lk_ipt: # input transform (lo, hi)
96 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
97 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
98 Lk_sbo: # sbou, sbot
99 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
100 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
101 Lk_sb1: # sb1u, sb1t
102 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
103 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
104 Lk_sb2: # sb2u, sb2t
105 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
106 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
107
108 ##
109 ## Decryption stuff
110 ##
111 Lk_dipt: # decryption input transform
112 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
113 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
114 Lk_dsbo: # decryption sbox final output
115 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
116 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
117 Lk_dsb9: # decryption sbox output *9*u, *9*t
118 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
119 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
120 Lk_dsbd: # decryption sbox output *D*u, *D*t
121 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
122 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
123 Lk_dsbb: # decryption sbox output *B*u, *B*t
124 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
125 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
126 Lk_dsbe: # decryption sbox output *E*u, *E*t
127 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
128 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
129
130 ##
131 ## Key schedule constants
132 ##
133 Lk_dksd: # decryption key schedule: invskew x*D
134 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
135 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
136 Lk_dksb: # decryption key schedule: invskew x*B
137 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
138 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
139 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
140 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
141 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
142 Lk_dks9: # decryption key schedule: invskew x*9
143 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
144 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
145
146 Lk_rcon: # rcon
147 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
148 Lk_s63:
149 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
150
151 Lk_opt: # output transform
152 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
153 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
154 Lk_deskew: # deskew tables: inverts the sbox's "skew"
155 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
156 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
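##
## Layout of _vpaes_consts as referenced by the preheat routines below.
## The byte offsets are implied by the table sizes above and are noted
## here only for the reader's convenience (they are not used symbolically):
##
##   0x000 Lk_mc_forward   0x0c0 Lk_inv    0x160 Lk_dipt   0x220 Lk_dksd
##   0x040 Lk_mc_backward  0x0e0 Lk_ipt    0x180 Lk_dsbo   0x240 Lk_dksb
##   0x080 Lk_sr           0x100 Lk_sbo    0x1a0 Lk_dsb9   0x260 Lk_dkse
##                         0x120 Lk_sb1    0x1c0 Lk_dsbd   0x280 Lk_dks9
##                         0x140 Lk_sb2    0x1e0 Lk_dsbb   0x2a0 Lk_rcon
##                                         0x200 Lk_dsbe   0x2b0 Lk_s63
##                                                         0x2c0 Lk_opt
##                                                         0x2e0 Lk_deskew
##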
157 .align 5
158 Lconsts:
159 mflr r0
160 bcl 20,31,\$+4
161 mflr r12 #vvvvv distance between . and _vpaes_consts
162 addi r12,r12,-0x308
163 mtlr r0
164 blr
165 .long 0
166 .byte 0,12,0x14,0,0,0,0,0
167 .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
168 .align 6
169 ___
170 \f
171 my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
172 {
173 my ($inp,$out,$key) = map("r$_",(3..5));
174
175 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
176 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
177 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
178
179 $code.=<<___;
180 ##
181 ## _aes_preheat
182 ##
183 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
184 ## and %xmm9-%xmm15 as specified below.
185 ##
186 .align 4
187 _vpaes_encrypt_preheat:
188 mflr r8
189 bl Lconsts
190 mtlr r8
191 li r11, 0xc0 # Lk_inv
192 li r10, 0xd0
193 li r9, 0xe0 # Lk_ipt
194 li r8, 0xf0
195 vxor v7, v7, v7 # 0x00..00
196 vspltisb v8,4 # 0x04..04
197 vspltisb v9,0x0f # 0x0f..0f
198 lvx $invlo, r12, r11
199 li r11, 0x100
200 lvx $invhi, r12, r10
201 li r10, 0x110
202 lvx $iptlo, r12, r9
203 li r9, 0x120
204 lvx $ipthi, r12, r8
205 li r8, 0x130
206 lvx $sbou, r12, r11
207 li r11, 0x140
208 lvx $sbot, r12, r10
209 li r10, 0x150
210 lvx $sb1u, r12, r9
211 lvx $sb1t, r12, r8
212 lvx $sb2u, r12, r11
213 lvx $sb2t, r12, r10
214 blr
215 .long 0
216 .byte 0,12,0x14,0,0,0,0,0
217
218 ##
219 ## _aes_encrypt_core
220 ##
221 ## AES-encrypt %xmm0.
222 ##
223 ## Inputs:
224 ## %xmm0 = input
225 ## %xmm9-%xmm15 as in _vpaes_preheat
226 ## (%rdx) = scheduled keys
227 ##
228 ## Output in %xmm0
229 ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
230 ##
231 ##
232 .align 5
233 _vpaes_encrypt_core:
234 lwz r8, 240($key) # pull rounds
235 li r9, 16
236 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
237 li r11, 0x10
238 lvx v6, r9, $key
239 addi r9, r9, 16
240 ?vperm v5, v5, v6, $keyperm # align round key
241 addi r10, r11, 0x40
242 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
243 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
244 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
245 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
246 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
247 mtctr r8
248 b Lenc_entry
249
250 .align 4
251 Lenc_loop:
252 # middle of middle round
253 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
254 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
255 addi r11, r11, 16
256 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
257 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
258 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
259 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
260 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
261 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
262 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
263 addi r10, r11, 0x40
264 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
265 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
266 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
267 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
268 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
269 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
270 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
271
272 Lenc_entry:
273 # top of round
274 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
275 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
276 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
277 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
278 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
279 vand v0, v0, v9
280 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
281 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
282 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
283 vmr v5, v6
284 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
285 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
286 addi r9, r9, 16
287 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
288 ?vperm v5, v5, v6, $keyperm # align round key
289 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
290 bdnz Lenc_loop
291
292 # middle of last round
293 addi r10, r11, 0x80
294 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
295 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
296 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
297 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
298 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
299 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
300 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
301 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
302 blr
303 .long 0
304 .byte 0,12,0x14,0,0,0,0,0
305
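##
## vpaes_encrypt(in, out, key) is the intended C-level interface (the
## prototype is assumed from how the other vpaes ports are used); as
## consumed below the arguments arrive in r3 (input block), r4 (output
## block) and r5 (expanded AES_KEY), per the standard PPC calling
## convention.
##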
306 .globl .vpaes_encrypt
307 .align 5
308 .vpaes_encrypt:
309 $STU $sp,-$FRAME($sp)
310 li r10,`15+6*$SIZE_T`
311 li r11,`31+6*$SIZE_T`
312 mflr r6
313 mfspr r7, 256 # save vrsave
314 stvx v20,r10,$sp
315 addi r10,r10,32
316 stvx v21,r11,$sp
317 addi r11,r11,32
318 stvx v22,r10,$sp
319 addi r10,r10,32
320 stvx v23,r11,$sp
321 addi r11,r11,32
322 stvx v24,r10,$sp
323 addi r10,r10,32
324 stvx v25,r11,$sp
325 addi r11,r11,32
326 stvx v26,r10,$sp
327 addi r10,r10,32
328 stvx v27,r11,$sp
329 addi r11,r11,32
330 stvx v28,r10,$sp
331 addi r10,r10,32
332 stvx v29,r11,$sp
333 addi r11,r11,32
334 stvx v30,r10,$sp
335 stvx v31,r11,$sp
336 stw r7,`$FRAME-4`($sp) # save vrsave
337 li r0, -1
338 $PUSH r6,`$FRAME+$LRSAVE`($sp)
339 mtspr 256, r0 # preserve all AltiVec registers
340
341 bl _vpaes_encrypt_preheat
342
343 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
344 lvx v0, 0, $inp
345 addi $inp, $inp, 15 # 15 is not a typo
346 ?lvsr $outperm, 0, $out
347 ?lvsl $keyperm, 0, $key # prepare for unaligned access
348 lvx $inptail, 0, $inp # redundant in aligned case
349 ?vperm v0, v0, $inptail, $inpperm
350
351 bl _vpaes_encrypt_core
352
353 andi. r8, $out, 15
354 li r9, 16
355 beq Lenc_out_aligned
356
357 vperm v0, v0, v0, $outperm # rotate right/left
358 mtctr r9
359 Lenc_out_unaligned:
360 stvebx v0, 0, $out
361 addi $out, $out, 1
362 bdnz Lenc_out_unaligned
363 b Lenc_done
364
365 .align 4
366 Lenc_out_aligned:
367 stvx v0, 0, $out
368 Lenc_done:
369
370 li r10,`15+6*$SIZE_T`
371 li r11,`31+6*$SIZE_T`
372 mtlr r6
373 mtspr 256, r7 # restore vrsave
374 lvx v20,r10,$sp
375 addi r10,r10,32
376 lvx v21,r11,$sp
377 addi r11,r11,32
378 lvx v22,r10,$sp
379 addi r10,r10,32
380 lvx v23,r11,$sp
381 addi r11,r11,32
382 lvx v24,r10,$sp
383 addi r10,r10,32
384 lvx v25,r11,$sp
385 addi r11,r11,32
386 lvx v26,r10,$sp
387 addi r10,r10,32
388 lvx v27,r11,$sp
389 addi r11,r11,32
390 lvx v28,r10,$sp
391 addi r10,r10,32
392 lvx v29,r11,$sp
393 addi r11,r11,32
394 lvx v30,r10,$sp
395 lvx v31,r11,$sp
396 addi $sp,$sp,$FRAME
397 blr
398 .long 0
399 .byte 0,12,0x04,1,0x80,0,3,0
400 .long 0
401 .size .vpaes_encrypt,.-.vpaes_encrypt
402
403 .align 4
404 _vpaes_decrypt_preheat:
405 mflr r8
406 bl Lconsts
407 mtlr r8
408 li r11, 0xc0 # Lk_inv
409 li r10, 0xd0
410 li r9, 0x160 # Lk_dipt
411 li r8, 0x170
412 vxor v7, v7, v7 # 0x00..00
413 vspltisb v8,4 # 0x04..04
414 vspltisb v9,0x0f # 0x0f..0f
415 lvx $invlo, r12, r11
416 li r11, 0x180
417 lvx $invhi, r12, r10
418 li r10, 0x190
419 lvx $iptlo, r12, r9
420 li r9, 0x1a0
421 lvx $ipthi, r12, r8
422 li r8, 0x1b0
423 lvx $sbou, r12, r11
424 li r11, 0x1c0
425 lvx $sbot, r12, r10
426 li r10, 0x1d0
427 lvx $sb9u, r12, r9
428 li r9, 0x1e0
429 lvx $sb9t, r12, r8
430 li r8, 0x1f0
431 lvx $sbdu, r12, r11
432 li r11, 0x200
433 lvx $sbdt, r12, r10
434 li r10, 0x210
435 lvx $sbbu, r12, r9
436 lvx $sbbt, r12, r8
437 lvx $sbeu, r12, r11
438 lvx $sbet, r12, r10
439 blr
440 .long 0
441 .byte 0,12,0x14,0,0,0,0,0
442
443 ##
444 ## Decryption core
445 ##
446 ## Same API as encryption core.
447 ##
448 .align 4
449 _vpaes_decrypt_core:
450 lwz r8, 240($key) # pull rounds
451 li r9, 16
452 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
453 li r11, 0x30
454 lvx v6, r9, $key
455 addi r9, r9, 16
456 ?vperm v5, v5, v6, $keyperm # align round key
457 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
458 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
459 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
460 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
461 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
462 mtctr r8
463 b Ldec_entry
464
465 .align 4
466 Ldec_loop:
467 #
468 # Inverse mix columns
469 #
470 lvx v0, r12, r11 # v5 and v0 are flipped
471 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
472 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
473 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
474 subi r11, r11, 16
475 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
476 andi. r11, r11, 0x30
477 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
478 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
479 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
480 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
481
482 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
483 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
484 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
485 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
486 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
487 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
488 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
489
490 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
491 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
492 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
493 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
494 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
495 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
496 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
497
498 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
499 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
500 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
501 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
502 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
503
504 Ldec_entry:
505 # top of round
506 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
507 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
508 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
509 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
510 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
511 vand v0, v0, v9
512 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
513 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
514 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
515 vmr v5, v6
516 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
517 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
518 addi r9, r9, 16
519 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
520 ?vperm v5, v5, v6, $keyperm # align round key
521 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
522 bdnz Ldec_loop
523
524 # middle of last round
525 addi r10, r11, 0x80
526 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
527 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
528 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
529 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
530 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
531 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
532 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
533 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
534 blr
535 .long 0
536 .byte 0,12,0x14,0,0,0,0,0
537
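##
## vpaes_decrypt takes the same arguments as vpaes_encrypt above:
## r3 = input block, r4 = output block, r5 = expanded AES_KEY.
##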
538 .globl .vpaes_decrypt
539 .align 5
540 .vpaes_decrypt:
541 $STU $sp,-$FRAME($sp)
542 li r10,`15+6*$SIZE_T`
543 li r11,`31+6*$SIZE_T`
544 mflr r6
545 mfspr r7, 256 # save vrsave
546 stvx v20,r10,$sp
547 addi r10,r10,32
548 stvx v21,r11,$sp
549 addi r11,r11,32
550 stvx v22,r10,$sp
551 addi r10,r10,32
552 stvx v23,r11,$sp
553 addi r11,r11,32
554 stvx v24,r10,$sp
555 addi r10,r10,32
556 stvx v25,r11,$sp
557 addi r11,r11,32
558 stvx v26,r10,$sp
559 addi r10,r10,32
560 stvx v27,r11,$sp
561 addi r11,r11,32
562 stvx v28,r10,$sp
563 addi r10,r10,32
564 stvx v29,r11,$sp
565 addi r11,r11,32
566 stvx v30,r10,$sp
567 stvx v31,r11,$sp
568 stw r7,`$FRAME-4`($sp) # save vrsave
569 li r0, -1
570 $PUSH r6,`$FRAME+$LRSAVE`($sp)
571 mtspr 256, r0 # preserve all AltiVec registers
572
573 bl _vpaes_decrypt_preheat
574
575 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
576 lvx v0, 0, $inp
577 addi $inp, $inp, 15 # 15 is not a typo
578 ?lvsr $outperm, 0, $out
579 ?lvsl $keyperm, 0, $key
580 lvx $inptail, 0, $inp # redundant in aligned case
581 ?vperm v0, v0, $inptail, $inpperm
582
583 bl _vpaes_decrypt_core
584
585 andi. r8, $out, 15
586 li r9, 16
587 beq Ldec_out_aligned
588
589 vperm v0, v0, v0, $outperm # rotate right/left
590 mtctr r9
591 Ldec_out_unaligned:
592 stvebx v0, 0, $out
593 addi $out, $out, 1
594 bdnz Ldec_out_unaligned
595 b Ldec_done
596
597 .align 4
598 Ldec_out_aligned:
599 stvx v0, 0, $out
600 Ldec_done:
601
602 li r10,`15+6*$SIZE_T`
603 li r11,`31+6*$SIZE_T`
604 mtlr r6
605 mtspr 256, r7 # restore vrsave
606 lvx v20,r10,$sp
607 addi r10,r10,32
608 lvx v21,r11,$sp
609 addi r11,r11,32
610 lvx v22,r10,$sp
611 addi r10,r10,32
612 lvx v23,r11,$sp
613 addi r11,r11,32
614 lvx v24,r10,$sp
615 addi r10,r10,32
616 lvx v25,r11,$sp
617 addi r11,r11,32
618 lvx v26,r10,$sp
619 addi r10,r10,32
620 lvx v27,r11,$sp
621 addi r11,r11,32
622 lvx v28,r10,$sp
623 addi r10,r10,32
624 lvx v29,r11,$sp
625 addi r11,r11,32
626 lvx v30,r10,$sp
627 lvx v31,r11,$sp
628 addi $sp,$sp,$FRAME
629 blr
630 .long 0
631 .byte 0,12,0x04,1,0x80,0,3,0
632 .long 0
633 .size .vpaes_decrypt,.-.vpaes_decrypt
634
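##
## CBC interface.  As consumed below: r3 = input, r4 = output, r5 = length
## in bytes (lengths below 16 return immediately, and the count is rounded
## down to a multiple of 16), r6 = expanded AES_KEY, r7 = IV, r8 = non-zero
## for encrypt, zero for decrypt.
##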
635 .globl .vpaes_cbc_encrypt
636 .align 5
637 .vpaes_cbc_encrypt:
638 ${UCMP}i r5,16
639 bltlr-
640
641 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
642 mflr r0
643 li r10,`15+6*$SIZE_T`
644 li r11,`31+6*$SIZE_T`
645 mfspr r12, 256
646 stvx v20,r10,$sp
647 addi r10,r10,32
648 stvx v21,r11,$sp
649 addi r11,r11,32
650 stvx v22,r10,$sp
651 addi r10,r10,32
652 stvx v23,r11,$sp
653 addi r11,r11,32
654 stvx v24,r10,$sp
655 addi r10,r10,32
656 stvx v25,r11,$sp
657 addi r11,r11,32
658 stvx v26,r10,$sp
659 addi r10,r10,32
660 stvx v27,r11,$sp
661 addi r11,r11,32
662 stvx v28,r10,$sp
663 addi r10,r10,32
664 stvx v29,r11,$sp
665 addi r11,r11,32
666 stvx v30,r10,$sp
667 stvx v31,r11,$sp
668 stw r12,`$FRAME-4`($sp) # save vrsave
669 $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
670 $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
671 li r9, -16
672 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
673
674 and r30, r5, r9 # copy length&-16
675 andi. r9, $out, 15 # is $out aligned?
676 mr r5, r6 # copy pointer to key
677 mr r31, r7 # copy pointer to iv
678 li r6, -1
679 mcrf cr1, cr0 # put aside $out alignment flag
680 mr r7, r12 # copy vrsave
681 mtspr 256, r6 # preserve all AltiVec registers
682
683 lvx v24, 0, r31 # load [potentially unaligned] iv
684 li r9, 15
685 ?lvsl $inpperm, 0, r31
686 lvx v25, r9, r31
687 ?vperm v24, v24, v25, $inpperm
688
689 cmpwi r8, 0 # test direction
690 neg r8, $inp # prepare for unaligned access
691 vxor v7, v7, v7
692 ?lvsl $keyperm, 0, $key
693 ?lvsr $outperm, 0, $out
694 ?lvsr $inpperm, 0, r8 # -$inp
695 vnor $outmask, v7, v7 # 0xff..ff
696 lvx $inptail, 0, $inp
697 ?vperm $outmask, v7, $outmask, $outperm
698 addi $inp, $inp, 15 # 15 is not a typo
699
700 beq Lcbc_decrypt
701
702 bl _vpaes_encrypt_preheat
703 li r0, 16
704
705 beq cr1, Lcbc_enc_loop # $out is aligned
706
707 vmr v0, $inptail
708 lvx $inptail, 0, $inp
709 addi $inp, $inp, 16
710 ?vperm v0, v0, $inptail, $inpperm
711 vxor v0, v0, v24 # ^= iv
712
713 bl _vpaes_encrypt_core
714
715 andi. r8, $out, 15
716 vmr v24, v0 # put aside iv
717 sub r9, $out, r8
718 vperm $outhead, v0, v0, $outperm # rotate right/left
719
720 Lcbc_enc_head:
721 stvebx $outhead, r8, r9
722 cmpwi r8, 15
723 addi r8, r8, 1
724 bne Lcbc_enc_head
725
726 sub. r30, r30, r0 # len -= 16
727 addi $out, $out, 16
728 beq Lcbc_unaligned_done
729
730 Lcbc_enc_loop:
731 vmr v0, $inptail
732 lvx $inptail, 0, $inp
733 addi $inp, $inp, 16
734 ?vperm v0, v0, $inptail, $inpperm
735 vxor v0, v0, v24 # ^= iv
736
737 bl _vpaes_encrypt_core
738
739 vmr v24, v0 # put aside iv
740 sub. r30, r30, r0 # len -= 16
741 vperm v0, v0, v0, $outperm # rotate right/left
742 vsel v1, $outhead, v0, $outmask
743 vmr $outhead, v0
744 stvx v1, 0, $out
745 addi $out, $out, 16
746 bne Lcbc_enc_loop
747
748 b Lcbc_done
749
750 .align 5
751 Lcbc_decrypt:
752 bl _vpaes_decrypt_preheat
753 li r0, 16
754
755 beq cr1, Lcbc_dec_loop # $out is aligned
756
757 vmr v0, $inptail
758 lvx $inptail, 0, $inp
759 addi $inp, $inp, 16
760 ?vperm v0, v0, $inptail, $inpperm
761 vmr v25, v0 # put aside input
762
763 bl _vpaes_decrypt_core
764
765 andi. r8, $out, 15
766 vxor v0, v0, v24 # ^= iv
767 vmr v24, v25
768 sub r9, $out, r8
769 vperm $outhead, v0, v0, $outperm # rotate right/left
770
771 Lcbc_dec_head:
772 stvebx $outhead, r8, r9
773 cmpwi r8, 15
774 addi r8, r8, 1
775 bne Lcbc_dec_head
776
777 sub. r30, r30, r0 # len -= 16
778 addi $out, $out, 16
779 beq Lcbc_unaligned_done
780
781 Lcbc_dec_loop:
782 vmr v0, $inptail
783 lvx $inptail, 0, $inp
784 addi $inp, $inp, 16
785 ?vperm v0, v0, $inptail, $inpperm
786 vmr v25, v0 # put aside input
787
788 bl _vpaes_decrypt_core
789
790 vxor v0, v0, v24 # ^= iv
791 vmr v24, v25
792 sub. r30, r30, r0 # len -= 16
793 vperm v0, v0, v0, $outperm # rotate right/left
794 vsel v1, $outhead, v0, $outmask
795 vmr $outhead, v0
796 stvx v1, 0, $out
797 addi $out, $out, 16
798 bne Lcbc_dec_loop
799
800 Lcbc_done:
801 beq cr1, Lcbc_write_iv # $out is aligned
802
803 Lcbc_unaligned_done:
804 andi. r8, $out, 15
805 sub $out, $out, r8
806 li r9, 0
807 Lcbc_tail:
808 stvebx $outhead, r9, $out
809 addi r9, r9, 1
810 cmpw r9, r8
811 bne Lcbc_tail
812
813 Lcbc_write_iv:
814 neg r8, r31 # write [potentially unaligned] iv
815 li r10, 4
816 ?lvsl $outperm, 0, r8
817 li r11, 8
818 li r12, 12
819 vperm v24, v24, v24, $outperm # rotate right/left
820 stvewx v24, 0, r31 # ivp is at least 32-bit aligned
821 stvewx v24, r10, r31
822 stvewx v24, r11, r31
823 stvewx v24, r12, r31
824
825 mtspr 256, r7 # restore vrsave
826 li r10,`15+6*$SIZE_T`
827 li r11,`31+6*$SIZE_T`
828 lvx v20,r10,$sp
829 addi r10,r10,32
830 lvx v21,r11,$sp
831 addi r11,r11,32
832 lvx v22,r10,$sp
833 addi r10,r10,32
834 lvx v23,r11,$sp
835 addi r11,r11,32
836 lvx v24,r10,$sp
837 addi r10,r10,32
838 lvx v25,r11,$sp
839 addi r11,r11,32
840 lvx v26,r10,$sp
841 addi r10,r10,32
842 lvx v27,r11,$sp
843 addi r11,r11,32
844 lvx v28,r10,$sp
845 addi r10,r10,32
846 lvx v29,r11,$sp
847 addi r11,r11,32
848 lvx v30,r10,$sp
849 lvx v31,r11,$sp
850 Lcbc_abort:
851 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
852 $POP r30,`$FRAME+$SIZE_T*0`($sp)
853 $POP r31,`$FRAME+$SIZE_T*1`($sp)
854 mtlr r0
855 addi $sp,$sp,`$FRAME+$SIZE_T*2`
856 blr
857 .long 0
858 .byte 0,12,0x04,1,0x80,2,6,0
859 .long 0
860 .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
861 ___
862 }\f
863 {
864 my ($inp,$bits,$out)=map("r$_",(3..5));
865 my $dir="cr1";
866 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
867
868 $code.=<<___;
869 ########################################################
870 ## ##
871 ## AES key schedule ##
872 ## ##
873 ########################################################
874 .align 4
875 _vpaes_key_preheat:
876 mflr r8
877 bl Lconsts
878 mtlr r8
879 li r11, 0xc0 # Lk_inv
880 li r10, 0xd0
881 li r9, 0xe0 # Lk_ipt
882 li r8, 0xf0
883
884 vspltisb v8,4 # 0x04..04
885 vxor v9,v9,v9 # 0x00..00
886 lvx $invlo, r12, r11 # Lk_inv
887 li r11, 0x120
888 lvx $invhi, r12, r10
889 li r10, 0x130
890 lvx $iptlo, r12, r9 # Lk_ipt
891 li r9, 0x220
892 lvx $ipthi, r12, r8
893 li r8, 0x230
894
895 lvx v14, r12, r11 # Lk_sb1
896 li r11, 0x240
897 lvx v15, r12, r10
898 li r10, 0x250
899
900 lvx v16, r12, r9 # Lk_dksd
901 li r9, 0x260
902 lvx v17, r12, r8
903 li r8, 0x270
904 lvx v18, r12, r11 # Lk_dksb
905 li r11, 0x280
906 lvx v19, r12, r10
907 li r10, 0x290
908 lvx v20, r12, r9 # Lk_dkse
909 li r9, 0x2a0
910 lvx v21, r12, r8
911 li r8, 0x2b0
912 lvx v22, r12, r11 # Lk_dks9
913 lvx v23, r12, r10
914
915 lvx v24, r12, r9 # Lk_rcon
916 lvx v25, 0, r12 # Lk_mc_forward[0]
917 lvx v26, r12, r8 # Lk_s63
918 blr
919 .long 0
920 .byte 0,12,0x14,0,0,0,0,0
921
922 .align 4
923 _vpaes_schedule_core:
924 mflr r7
925
926 bl _vpaes_key_preheat # load the tables
927
928 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
929 neg r8, $inp # prepare for unaligned access
930 lvx v0, 0, $inp
931 addi $inp, $inp, 15 # 15 is not a typo
932 ?lvsr $inpperm, 0, r8 # -$inp
933 lvx v6, 0, $inp # v6 serves as inptail
934 addi $inp, $inp, 8
935 ?vperm v0, v0, v6, $inpperm
936
937 # input transform
938 vmr v3, v0 # vmovdqa %xmm0, %xmm3
939 bl _vpaes_schedule_transform
940 vmr v7, v0 # vmovdqa %xmm0, %xmm7
941
942 bne $dir, Lschedule_am_decrypting
943
944 # encrypting, output zeroth round key after transform
945 li r8, 0x30 # mov \$0x30,%r8d
946 li r9, 4
947 li r10, 8
948 li r11, 12
949
950 ?lvsr $outperm, 0, $out # prepare for unaligned access
951 vnor $outmask, v9, v9 # 0xff..ff
952 ?vperm $outmask, v9, $outmask, $outperm
953
954 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
955 vperm $outhead, v0, v0, $outperm # rotate right/left
956 stvewx $outhead, 0, $out # some are superfluous
957 stvewx $outhead, r9, $out
958 stvewx $outhead, r10, $out
959 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
960 stvewx $outhead, r11, $out
961 b Lschedule_go
962
963 Lschedule_am_decrypting:
964 srwi r8, $bits, 1 # shr \$1,%r8d
965 andi. r8, r8, 32 # and \$32,%r8d
966 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
967 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
968 # decrypting, output zeroth round key after shiftrows
969 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
970 li r9, 4
971 li r10, 8
972 li r11, 12
973 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
974
975 neg r0, $out # prepare for unaligned access
976 ?lvsl $outperm, 0, r0
977 vnor $outmask, v9, v9 # 0xff..ff
978 ?vperm $outmask, $outmask, v9, $outperm
979
980 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
981 vperm $outhead, v4, v4, $outperm # rotate right/left
982 stvewx $outhead, 0, $out # some are superfluous
983 stvewx $outhead, r9, $out
984 stvewx $outhead, r10, $out
985 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
986 stvewx $outhead, r11, $out
987 addi $out, $out, 15 # 15 is not a typo
988 xori r8, r8, 0x30 # xor \$0x30, %r8
989
990 Lschedule_go:
991 cmplwi $bits, 192 # cmp \$192, %esi
992 bgt Lschedule_256
993 beq Lschedule_192
994 # 128: fall through
995
996 ##
997 ## .schedule_128
998 ##
999 ## 128-bit specific part of key schedule.
1000 ##
1001 ## This schedule is really simple, because all its parts
1002 ## are accomplished by the subroutines.
1003 ##
1004 Lschedule_128:
1005 li r0, 10 # mov \$10, %esi
1006 mtctr r0
1007
1008 Loop_schedule_128:
1009 bl _vpaes_schedule_round
1010 bdz Lschedule_mangle_last # dec %esi
1011 bl _vpaes_schedule_mangle # write output
1012 b Loop_schedule_128
1013
1014 ##
1015 ## .aes_schedule_192
1016 ##
1017 ## 192-bit specific part of key schedule.
1018 ##
1019 ## The main body of this schedule is the same as the 128-bit
1020 ## schedule, but with more smearing. The long, high side is
1021 ## stored in %xmm7 as before, and the short, low side is in
1022 ## the high bits of %xmm6.
1023 ##
1024 ## This schedule is somewhat nastier, however, because each
1025 ## round produces 192 bits of key material, or 1.5 round keys.
1026 ## Therefore, on each cycle we do 2 rounds and produce 3 round
1027 ## keys.
1028 ##
1029 .align 4
1030 Lschedule_192:
1031 li r0, 4 # mov \$4, %esi
1032 lvx v0, 0, $inp
1033 ?vperm v0, v6, v0, $inpperm
1034 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
1035 bl _vpaes_schedule_transform # input transform
1036 ?vsldoi v6, v0, v9, 8
1037 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
1038 mtctr r0
1039
1040 Loop_schedule_192:
1041 bl _vpaes_schedule_round
1042 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
1043 bl _vpaes_schedule_mangle # save key n
1044 bl _vpaes_schedule_192_smear
1045 bl _vpaes_schedule_mangle # save key n+1
1046 bl _vpaes_schedule_round
1047 bdz Lschedule_mangle_last # dec %esi
1048 bl _vpaes_schedule_mangle # save key n+2
1049 bl _vpaes_schedule_192_smear
1050 b Loop_schedule_192
1051
1052 ##
1053 ## .aes_schedule_256
1054 ##
1055 ## 256-bit specific part of key schedule.
1056 ##
1057 ## The structure here is very similar to the 128-bit
1058 ## schedule, but with an additional "low side" in
1059 ## %xmm6. The low side's rounds are the same as the
1060 ## high side's, except no rcon and no rotation.
1061 ##
1062 .align 4
1063 Lschedule_256:
1064 li r0, 7 # mov \$7, %esi
1065 addi $inp, $inp, 8
1066 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
1067 ?vperm v0, v6, v0, $inpperm
1068 bl _vpaes_schedule_transform # input transform
1069 mtctr r0
1070
1071 Loop_schedule_256:
1072 bl _vpaes_schedule_mangle # output low result
1073 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
1074
1075 # high round
1076 bl _vpaes_schedule_round
1077 bdz Lschedule_mangle_last # dec %esi
1078 bl _vpaes_schedule_mangle
1079
1080 # low round. swap xmm7 and xmm6
1081 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1082 vmr v5, v7 # vmovdqa %xmm7, %xmm5
1083 vmr v7, v6 # vmovdqa %xmm6, %xmm7
1084 bl _vpaes_schedule_low_round
1085 vmr v7, v5 # vmovdqa %xmm5, %xmm7
1086
1087 b Loop_schedule_256
1088 ##
1089 ## .aes_schedule_mangle_last
1090 ##
1091 ## Mangler for last round of key schedule
1092 ## Mangles %xmm0
1093 ## when encrypting, outputs out(%xmm0) ^ 63
1094 ## when decrypting, outputs unskew(%xmm0)
1095 ##
1096 ## Always called right before return... jumps to cleanup and exits
1097 ##
1098 .align 4
1099 Lschedule_mangle_last:
1100 # schedule last round key from xmm0
1101 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
1102 li r9, 0x2f0
1103 bne $dir, Lschedule_mangle_last_dec
1104
1105 # encrypting
1106 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
1107 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
1108 li r9, 0x2d0 # prepare to output transform
1109 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
1110
1111 lvx $iptlo, r11, r12 # reload $ipt
1112 lvx $ipthi, r9, r12
1113 addi $out, $out, 16 # add \$16, %rdx
1114 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1115 bl _vpaes_schedule_transform # output transform
1116
1117 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1118 vperm v0, v0, v0, $outperm # rotate right/left
1119 li r10, 4
1120 vsel v2, $outhead, v0, $outmask
1121 li r11, 8
1122 stvx v2, 0, $out
1123 li r12, 12
1124 stvewx v0, 0, $out # some (or all) are redundant
1125 stvewx v0, r10, $out
1126 stvewx v0, r11, $out
1127 stvewx v0, r12, $out
1128 b Lschedule_mangle_done
1129
1130 .align 4
1131 Lschedule_mangle_last_dec:
1132 lvx $iptlo, r11, r12 # reload $ipt
1133 lvx $ipthi, r9, r12
1134 addi $out, $out, -16 # add \$-16, %rdx
1135 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1136 bl _vpaes_schedule_transform # output transform
1137
1138 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1139 addi r9, $out, -15 # -15 is not a typo
1140 vperm v0, v0, v0, $outperm # rotate right/left
1141 li r10, 4
1142 vsel v2, $outhead, v0, $outmask
1143 li r11, 8
1144 stvx v2, 0, $out
1145 li r12, 12
1146 stvewx v0, 0, r9 # some (or all) are redundant
1147 stvewx v0, r10, r9
1148 stvewx v0, r11, r9
1149 stvewx v0, r12, r9
1150
1151
1152 Lschedule_mangle_done:
1153 mtlr r7
1154 # cleanup
1155 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
1156 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
1157 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
1158 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
1159 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1160 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
1161 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
1162 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
1163
1164 blr
1165 .long 0
1166 .byte 0,12,0x14,0,0,0,0,0
1167
1168 ##
1169 ## .aes_schedule_192_smear
1170 ##
1171 ## Smear the short, low side in the 192-bit key schedule.
1172 ##
1173 ## Inputs:
1174 ## %xmm7: high side, b a x y
1175 ## %xmm6: low side, d c 0 0
1176 ## %xmm13: 0
1177 ##
1178 ## Outputs:
1179 ## %xmm6: b+c+d b+c 0 0
1180 ## %xmm0: b+c+d b+c b a
1181 ##
1182 .align 4
1183 _vpaes_schedule_192_smear:
1184 ?vspltw v0, v7, 3
1185 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
1186 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
1187 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
1188 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
1189 vmr v0, v6
1190 ?vsldoi v6, v6, v9, 8
1191 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
1192 blr
1193 .long 0
1194 .byte 0,12,0x14,0,0,0,0,0
1195
1196 ##
1197 ## .aes_schedule_round
1198 ##
1199 ## Runs one main round of the key schedule on %xmm0, %xmm7
1200 ##
1201 ## Specifically, runs subbytes on the high dword of %xmm0
1202 ## then rotates it by one byte and xors into the low dword of
1203 ## %xmm7.
1204 ##
1205 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
1206 ## next rcon.
1207 ##
1208 ## Smears the dwords of %xmm7 by xoring the low into the
1209 ## second low, result into third, result into highest.
1210 ##
1211 ## Returns results in %xmm7 = %xmm0.
1212 ## Clobbers %xmm1-%xmm4, %r11.
1213 ##
1214 .align 4
1215 _vpaes_schedule_round:
1216 # extract rcon from xmm8
1217 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1218 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
1219 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
1220 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1221
1222 # rotate
1223 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1224 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
1225
1226 # fall through...
1227
1228 # low round: same as high round, but no rotation and no rcon.
1229 _vpaes_schedule_low_round:
1230 # smear xmm7
1231 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1232 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1233 vspltisb v1, 0x0f # 0x0f..0f
1234 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1235
1236 # subbytes
1237 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1238 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1239 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1240 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1241 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1242 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1243 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1244 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1245 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1246 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1247 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1248 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1249 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1250 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1251 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1252 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1253 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1254
1255 # add in smeared stuff
1256 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
1257 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
1258 blr
1259 .long 0
1260 .byte 0,12,0x14,0,0,0,0,0
1261
1262 ##
1263 ## .aes_schedule_transform
1264 ##
1265 ## Linear-transform %xmm0 according to tables at (%r11)
1266 ##
1267 ## Requires that %xmm9 = 0x0F0F... as in preheat
1268 ## Output in %xmm0
1269 ## Clobbers %xmm2
1270 ##
1271 .align 4
1272 _vpaes_schedule_transform:
1273 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1274 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1275 # vmovdqa (%r11), %xmm2 # lo
1276 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1277 # vmovdqa 16(%r11), %xmm1 # hi
1278 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
1279 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
1280 blr
1281 .long 0
1282 .byte 0,12,0x14,0,0,0,0,0
1283
1284 ##
1285 ## .aes_schedule_mangle
1286 ##
1287 ## Mangle xmm0 from (basis-transformed) standard version
1288 ## to our version.
1289 ##
1290 ## On encrypt,
1291 ## xor with 0x63
1292 ## multiply by circulant 0,1,1,1
1293 ## apply shiftrows transform
1294 ##
1295 ## On decrypt,
1296 ## xor with 0x63
1297 ## multiply by "inverse mixcolumns" circulant E,B,D,9
1298 ## deskew
1299 ## apply shiftrows transform
1300 ##
1301 ##
1302 ## Writes out to (%rdx), and increments or decrements it
1303 ## Keeps track of round number mod 4 in %r8
1304 ## Preserves xmm0
1305 ## Clobbers xmm1-xmm5
1306 ##
1307 .align 4
1308 _vpaes_schedule_mangle:
1309 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1310 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1311 bne $dir, Lschedule_mangle_dec
1312
1313 # encrypting
1314 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1315 addi $out, $out, 16 # add \$16, %rdx
1316 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1317 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1318 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1319 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1320 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1321 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1322
1323 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1324 addi r8, r8, -16 # add \$-16, %r8
1325 andi. r8, r8, 0x30 # and \$0x30, %r8
1326
1327 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1328 vperm v1, v3, v3, $outperm # rotate right/left
1329 vsel v2, $outhead, v1, $outmask
1330 vmr $outhead, v1
1331 stvx v2, 0, $out
1332 blr
1333
1334 .align 4
1335 Lschedule_mangle_dec:
1336 # inverse mix columns
1337 # lea .Lk_dksd(%rip),%r11
1338 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1339 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1340
1341 # vmovdqa 0x00(%r11), %xmm2
1342 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1343 # vmovdqa 0x10(%r11), %xmm3
1344 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1345 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1346 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1347
1348 # vmovdqa 0x20(%r11), %xmm2
1349 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1350 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1351 # vmovdqa 0x30(%r11), %xmm3
1352 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1353 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1354 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1355
1356 # vmovdqa 0x40(%r11), %xmm2
1357 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1358 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1359 # vmovdqa 0x50(%r11), %xmm3
1360 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1361 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1362
1363 # vmovdqa 0x60(%r11), %xmm2
1364 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1365 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1366 # vmovdqa 0x70(%r11), %xmm4
1367 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1368 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1369 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1370 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1371
1372 addi $out, $out, -16 # add \$-16, %rdx
1373
1374 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1375 addi r8, r8, -16 # add \$-16, %r8
1376 andi. r8, r8, 0x30 # and \$0x30, %r8
1377
1378 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1379 vperm v1, v3, v3, $outperm # rotate right/left
1380 vsel v2, $outhead, v1, $outmask
1381 vmr $outhead, v1
1382 stvx v2, 0, $out
1383 blr
1384 .long 0
1385 .byte 0,12,0x14,0,0,0,0,0
1386
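##
## Key setup.  As used below: r3 = user key, r4 = key length in bits,
## r5 = AES_KEY to fill; the routine returns 0 in r3.  (The C prototype
## is assumed to match the other vpaes set_encrypt_key ports.)
##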
1387 .globl .vpaes_set_encrypt_key
1388 .align 5
1389 .vpaes_set_encrypt_key:
1390 $STU $sp,-$FRAME($sp)
1391 li r10,`15+6*$SIZE_T`
1392 li r11,`31+6*$SIZE_T`
1393 mflr r0
1394 mfspr r6, 256 # save vrsave
1395 stvx v20,r10,$sp
1396 addi r10,r10,32
1397 stvx v21,r11,$sp
1398 addi r11,r11,32
1399 stvx v22,r10,$sp
1400 addi r10,r10,32
1401 stvx v23,r11,$sp
1402 addi r11,r11,32
1403 stvx v24,r10,$sp
1404 addi r10,r10,32
1405 stvx v25,r11,$sp
1406 addi r11,r11,32
1407 stvx v26,r10,$sp
1408 addi r10,r10,32
1409 stvx v27,r11,$sp
1410 addi r11,r11,32
1411 stvx v28,r10,$sp
1412 addi r10,r10,32
1413 stvx v29,r11,$sp
1414 addi r11,r11,32
1415 stvx v30,r10,$sp
1416 stvx v31,r11,$sp
1417 stw r6,`$FRAME-4`($sp) # save vrsave
1418 li r7, -1
1419 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1420 mtspr 256, r7 # preserve all AltiVec registers
1421
1422 srwi r9, $bits, 5 # shr \$5,%eax
1423 addi r9, r9, 6 # add \$5,%eax
1424 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1425
1426 cmplw $dir, $bits, $bits # set encrypt direction
1427 li r8, 0x30 # mov \$0x30,%r8d
1428 bl _vpaes_schedule_core
1429
1430 $POP r0, `$FRAME+$LRSAVE`($sp)
1431 li r10,`15+6*$SIZE_T`
1432 li r11,`31+6*$SIZE_T`
1433 mtspr 256, r6 # restore vrsave
1434 mtlr r0
1435 xor r3, r3, r3
1436 lvx v20,r10,$sp
1437 addi r10,r10,32
1438 lvx v21,r11,$sp
1439 addi r11,r11,32
1440 lvx v22,r10,$sp
1441 addi r10,r10,32
1442 lvx v23,r11,$sp
1443 addi r11,r11,32
1444 lvx v24,r10,$sp
1445 addi r10,r10,32
1446 lvx v25,r11,$sp
1447 addi r11,r11,32
1448 lvx v26,r10,$sp
1449 addi r10,r10,32
1450 lvx v27,r11,$sp
1451 addi r11,r11,32
1452 lvx v28,r10,$sp
1453 addi r10,r10,32
1454 lvx v29,r11,$sp
1455 addi r11,r11,32
1456 lvx v30,r10,$sp
1457 lvx v31,r11,$sp
1458 addi $sp,$sp,$FRAME
1459 blr
1460 .long 0
1461 .byte 0,12,0x04,1,0x80,0,3,0
1462 .long 0
1463 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1464
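##
## Same arguments and return value as vpaes_set_encrypt_key; the only
## difference is that the schedule is mangled for decryption and written
## back-to-front (hence the "rounds*16" adjustment of the output pointer
## below).
##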
1465 .globl .vpaes_set_decrypt_key
1466 .align 4
1467 .vpaes_set_decrypt_key:
1468 $STU $sp,-$FRAME($sp)
1469 li r10,`15+6*$SIZE_T`
1470 li r11,`31+6*$SIZE_T`
1471 mflr r0
1472 mfspr r6, 256 # save vrsave
1473 stvx v20,r10,$sp
1474 addi r10,r10,32
1475 stvx v21,r11,$sp
1476 addi r11,r11,32
1477 stvx v22,r10,$sp
1478 addi r10,r10,32
1479 stvx v23,r11,$sp
1480 addi r11,r11,32
1481 stvx v24,r10,$sp
1482 addi r10,r10,32
1483 stvx v25,r11,$sp
1484 addi r11,r11,32
1485 stvx v26,r10,$sp
1486 addi r10,r10,32
1487 stvx v27,r11,$sp
1488 addi r11,r11,32
1489 stvx v28,r10,$sp
1490 addi r10,r10,32
1491 stvx v29,r11,$sp
1492 addi r11,r11,32
1493 stvx v30,r10,$sp
1494 stvx v31,r11,$sp
1495 stw r6,`$FRAME-4`($sp) # save vrsave
1496 li r7, -1
1497 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1498 mtspr 256, r7 # preserve all AltiVec registers
1499
1500 srwi r9, $bits, 5 # shr \$5,%eax
1501 addi r9, r9, 6 # add \$5,%eax
1502 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1503
1504 slwi r9, r9, 4 # shl \$4,%eax
1505 add $out, $out, r9 # lea (%rdx,%rax),%rdx
1506
1507 cmplwi $dir, $bits, 0 # set decrypt direction
1508 srwi r8, $bits, 1 # shr \$1,%r8d
1509 andi. r8, r8, 32 # and \$32,%r8d
1510 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1511 bl _vpaes_schedule_core
1512
1513 $POP r0, `$FRAME+$LRSAVE`($sp)
1514 li r10,`15+6*$SIZE_T`
1515 li r11,`31+6*$SIZE_T`
1516 mtspr 256, r6 # restore vrsave
1517 mtlr r0
1518 xor r3, r3, r3
1519 lvx v20,r10,$sp
1520 addi r10,r10,32
1521 lvx v21,r11,$sp
1522 addi r11,r11,32
1523 lvx v22,r10,$sp
1524 addi r10,r10,32
1525 lvx v23,r11,$sp
1526 addi r11,r11,32
1527 lvx v24,r10,$sp
1528 addi r10,r10,32
1529 lvx v25,r11,$sp
1530 addi r11,r11,32
1531 lvx v26,r10,$sp
1532 addi r10,r10,32
1533 lvx v27,r11,$sp
1534 addi r11,r11,32
1535 lvx v28,r10,$sp
1536 addi r10,r10,32
1537 lvx v29,r11,$sp
1538 addi r11,r11,32
1539 lvx v30,r10,$sp
1540 lvx v31,r11,$sp
1541 addi $sp,$sp,$FRAME
1542 blr
1543 .long 0
1544 .byte 0,12,0x04,1,0x80,0,3,0
1545 .long 0
1546 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1547 ___
1548 }
1549
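# A minimal, commented-out illustration of what the little-endian path of
# the loop below does to a constant tagged "?inv".  It is kept as a comment
# so it has no effect on the generated output; the sample values are the
# first row of Lk_mc_forward above:
#
#	my @idx = map { hex } qw(01 02 03 00 05 06 07 04 09 0a 0b 08 0d 0e 0f 0c);
#	my @le  = map { $_ ^ 0x0f } @idx;	# ?inv: index i becomes 15-i
#	# @le is now 0e 0d 0c 0f 0a 09 08 0b 06 05 04 07 02 01 00 03,
#	# compensating for the reversed byte-lane numbering on little-endian.
#
# "?rev" constants are emitted byte-reversed instead, and "?asis" ones are
# left alone.  Instructions prefixed with "?" get analogous treatment
# further down (lvsl/lvsr swapped, vperm inputs exchanged, vsldoi/vspltw
# shift counts mirrored).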
1550 my $consts=1;
1551 foreach (split("\n",$code)) {
1552 s/\`([^\`]*)\`/eval $1/geo;
1553
1554 # constants table endian-specific conversion
1555 if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1556 my $conv=$2;
1557 my @bytes=();
1558
1559 # convert to endian-agnostic format
1560 foreach (split(/,\s+/,$1)) {
1561 my $l = /^0/?oct:int;
1562 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1563 }
1564
1565 # little-endian conversion
1566 if ($flavour =~ /le$/o) {
1567 SWITCH: for($conv) {
1568 /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
1569 /\?rev/ && do { @bytes=reverse(@bytes); last; };
1570 }
1571 }
1572
1573 #emit
1574 print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1575 next;
1576 }
1577 $consts=0 if (m/Lconsts:/o); # end of table
1578
1579 # instructions prefixed with '?' are endian-specific and need
1580 # to be adjusted accordingly...
1581 if ($flavour =~ /le$/o) { # little-endian
1582 s/\?lvsr/lvsl/o or
1583 s/\?lvsl/lvsr/o or
1584 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1585 s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1586 s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1587 } else { # big-endian
1588 s/\?([a-z]+)/$1/o;
1589 }
1590
1591 print $_,"\n";
1592 }
1593
1594 close STDOUT;