]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aes-sparcv9.pl
Split bignum code out of the sparcv9cap.c
[thirdparty/openssl.git] / crypto / aes / asm / aes-sparcv9.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
33388b44 2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
c918d8e2 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
7395d852
AP
9#
10# ====================================================================
e3713c36 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
7395d852 12# project. Rights for redistribution and usage in source and binary
389c09fa 13# forms are granted according to the License.
7395d852
AP
14# ====================================================================
15#
985e4c41 16# Version 1.1
7395d852
AP
17#
18# The major reason for undertaking this effort was to mitigate the
19# hazard of cache-timing attacks. This is [currently and initially!]
20# addressed in two ways. 1. S-boxes are compressed from 5KB to 2KB+256B
21# each. 2. References to them are scheduled for L2 cache latency,
22# meaning that the tables don't have to reside in L1 cache. Once again,
23# this is an initial draft and one should expect more countermeasures
24# to be implemented...
25#
985e4c41
AP
26# Version 1.1 prefetches T[ed]4 in order to mitigate attacks on the
27# last round.
28#
7395d852
AP
29# Even though performance was not the primary goal [on the contrary,
30# extra shifts "induced" by the compressed S-box and a longer loop
31# epilogue "induced" by scheduling for L2 have a negative effect on
32# performance], the code turned out to run in ~23 cycles per processed
33# byte en-/decrypted with a 128-bit key. This is a pretty good result
34# for code with the mentioned qualities on an UltraSPARC core. Compared
35# to Sun C generated code my encrypt procedure runs just a few percent
36# faster, while the decrypt one is a whole 50% faster [yes, Sun C failed
37# to generate an optimal decrypt procedure]. Compared to GNU C generated
38# code both procedures are more than 60% faster:-)
39
# The last command-line argument, if present and true, names the output
# file: it is popped and STDOUT is reopened onto it (two-argument open,
# return value not checked). Without it the generated text goes to the
# inherited STDOUT.
1aa89a7a 40$output = pop and open STDOUT,">$output";
eb77e888
AP
41
# Symbolic stack-frame size and stack bias (emitted verbatim into the
# assembly; presumably defined by crypto/sparc_arch.h - confirm) and the
# number of extra bytes reserved for locals.
($frame, $bias, $locals) = ("STACK_FRAME", "STACK_BIAS", 16);

# Sixteen accumulators, four per output word of a round.
($acc0,  $acc1,  $acc2,  $acc3)  = ("%l0", "%o0", "%o1", "%o2");
($acc4,  $acc5,  $acc6,  $acc7)  = ("%l1", "%o3", "%o4", "%o5");
($acc8,  $acc9,  $acc10, $acc11) = ("%l2", "%o7", "%g1", "%g2");
($acc12, $acc13, $acc14, $acc15) = ("%l3", "%g3", "%g4", "%g5");

# Alternate state words, written by one half of the loop and read by the other.
($t0, $t1, $t2, $t3) = ("%l4", "%l5", "%l6", "%l7");

# Incoming/outgoing state words, lookup-table base and key-schedule pointer.
($s0, $s1, $s2, $s3) = ("%i0", "%i1", "%i2", "%i3");
($tbl, $key)         = ("%i4", "%i5");

# The round counter shares %i7 with the return address, which the
# generated code therefore parks on the stack.
$rounds = "%i7";
78
# Append one ".long" line to $code for every argument, with the 32-bit
# value written twice so that each table entry occupies eight bytes.
# Arguments are consumed left to right; processing stops at the first
# undefined one.
sub _data_word()
{
    for (;;) {
	my $word = shift;
	last unless defined($word);
	$code .= sprintf("\t.long\t0x%08x,0x%08x\n", $word, $word);
    }
}
83
# Assembler preamble: make sure __ASSEMBLER__ is defined before including
# crypto/sparc_arch.h, declare %g2/%g3 as scratch when __arch64__ is set,
# switch to the text section and open the 256-byte-aligned AES_Te table
# (its entries are appended by the _data_word call that follows).
eb77e888 84$code.=<<___;
52f7e44e
TM
85#ifndef __ASSEMBLER__
86# define __ASSEMBLER__ 1
87#endif
88#include "crypto/sparc_arch.h"
eb77e888
AP
89
90#ifdef __arch64__
7395d852
AP
91.register %g2,#scratch
92.register %g3,#scratch
eb77e888 93#endif
7395d852
AP
94.section ".text",#alloc,#execinstr
95
985e4c41 96.align 256
7395d852
AP
97AES_Te:
98___
99&_data_word(
100 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
101 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
102 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
103 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
104 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
105 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
106 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
107 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
108 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
109 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
110 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
111 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
112 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
113 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
114 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
115 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
116 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
117 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
118 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
119 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
120 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
121 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
122 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
123 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
124 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
125 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
126 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
127 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
128 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
129 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
130 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
131 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
132 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
133 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
134 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
135 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
136 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
137 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
138 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
139 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
140 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
141 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
142 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
143 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
144 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
145 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
146 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
147 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
148 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
149 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
150 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
151 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
152 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
153 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
154 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
155 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
156 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
157 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
158 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
159 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
160 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
161 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
162 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
163 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
164$code.=<<___;
165 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
166 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
167 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
168 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
169 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
170 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
171 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
172 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
173 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
174 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
175 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
176 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
177 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
178 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
179 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
180 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
181 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
182 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
183 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
184 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
185 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
186 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
187 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
188 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
189 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
190 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
191 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
192 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
193 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
194 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
195 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
196 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
197.type AES_Te,#object
198.size AES_Te,(.-AES_Te)
199
! _sparcv9_AES_encrypt: internal single-block encryption core.
! In (register names as seen after the save below):
!   %i0-%i3  the four 32-bit state words s0-s3
!   %i4      table base: 256 eight-byte entries followed by a 256-byte
!            table at offset 2048
!   %i5      key schedule; the 32-bit round count is read from offset 240
! Out: %i0-%i3 hold the result (the caller finds it in %o0-%o3 once the
!      closing restore has executed).
! Table indices are byte values scaled by the entry size, hence the
! shift amounts 21/13/5/3 and the mask 2040 (255*8). Each entry is
! fetched as 64 bits and shifted right by 0/8/16/24 before being xored
! into a 32-bit word.
! %i7 doubles as the round counter, so the return address is parked in
! the extra stack slot and reloaded before ret. All %l registers,
! %o0-%o5, %o7 and %g1-%g5 are used as scratch.
200.align 64
201.skip 16
202_sparcv9_AES_encrypt:
203 save %sp,-$frame-$locals,%sp
204 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
! Initial key addition with key words 0-3; key words 4-7 are preloaded
! into t0-t3 for the first round. The counter is the round count halved
! because every loop pass covers two rounds.
205 ld [$key+240],$rounds
206 ld [$key+0],$t0
207 ld [$key+4],$t1 !
208 ld [$key+8],$t2
209 srl $rounds,1,$rounds
210 xor $t0,$s0,$s0
211 ld [$key+12],$t3
212 srl $s0,21,$acc0
213 xor $t1,$s1,$s1
214 ld [$key+16],$t0
215 srl $s1,13,$acc1 !
216 xor $t2,$s2,$s2
217 ld [$key+20],$t1
218 xor $t3,$s3,$s3
219 ld [$key+24],$t2
220 and $acc0,2040,$acc0
221 ld [$key+28],$t3
222 nop
! Loop body, first half: sixteen table lookups indexed by the bytes of
! s0-s3. Output word j takes byte 3 of s[j], byte 2 of s[j+1], byte 1 of
! s[j+2] and byte 0 of s[j+3] (indices mod 4).
223.Lenc_loop:
224 srl $s2,5,$acc2 !
225 and $acc1,2040,$acc1
226 ldx [$tbl+$acc0],$acc0
227 sll $s3,3,$acc3
228 and $acc2,2040,$acc2
229 ldx [$tbl+$acc1],$acc1
230 srl $s1,21,$acc4
231 and $acc3,2040,$acc3
232 ldx [$tbl+$acc2],$acc2 !
233 srl $s2,13,$acc5
234 and $acc4,2040,$acc4
235 ldx [$tbl+$acc3],$acc3
236 srl $s3,5,$acc6
237 and $acc5,2040,$acc5
238 ldx [$tbl+$acc4],$acc4
239 fmovs %f0,%f0
240 sll $s0,3,$acc7 !
241 and $acc6,2040,$acc6
242 ldx [$tbl+$acc5],$acc5
243 srl $s2,21,$acc8
244 and $acc7,2040,$acc7
245 ldx [$tbl+$acc6],$acc6
246 srl $s3,13,$acc9
247 and $acc8,2040,$acc8
248 ldx [$tbl+$acc7],$acc7 !
249 srl $s0,5,$acc10
250 and $acc9,2040,$acc9
251 ldx [$tbl+$acc8],$acc8
252 sll $s1,3,$acc11
253 and $acc10,2040,$acc10
254 ldx [$tbl+$acc9],$acc9
255 fmovs %f0,%f0
256 srl $s3,21,$acc12 !
257 and $acc11,2040,$acc11
258 ldx [$tbl+$acc10],$acc10
259 srl $s0,13,$acc13
260 and $acc12,2040,$acc12
261 ldx [$tbl+$acc11],$acc11
262 srl $s1,5,$acc14
263 and $acc13,2040,$acc13
264 ldx [$tbl+$acc12],$acc12 !
265 sll $s2,3,$acc15
266 and $acc14,2040,$acc14
267 ldx [$tbl+$acc13],$acc13
268 and $acc15,2040,$acc15
269 add $key,32,$key
270 ldx [$tbl+$acc14],$acc14
271 fmovs %f0,%f0
272 subcc $rounds,1,$rounds !
273 ldx [$tbl+$acc15],$acc15
! Counter exhausted: the lookups just issued are finished at .Lenc_last.
! The delay slot is annulled unless the branch is taken; when taken it
! turns the counter register into a pointer to the byte table at
! table base + 2048.
274 bz,a,pn %icc,.Lenc_last
275 add $tbl,2048,$rounds
276
! Fold the sixteen lookups into t0-t3 (which already hold round-key
! words) while loading the next four round-key words into s0-s3.
277 srlx $acc1,8,$acc1
278 xor $acc0,$t0,$t0
279 ld [$key+0],$s0
280 fmovs %f0,%f0
281 srlx $acc2,16,$acc2 !
282 xor $acc1,$t0,$t0
283 ld [$key+4],$s1
284 srlx $acc3,24,$acc3
285 xor $acc2,$t0,$t0
286 ld [$key+8],$s2
287 srlx $acc5,8,$acc5
288 xor $acc3,$t0,$t0
289 ld [$key+12],$s3 !
290 srlx $acc6,16,$acc6
291 xor $acc4,$t1,$t1
292 fmovs %f0,%f0
293 srlx $acc7,24,$acc7
294 xor $acc5,$t1,$t1
295 srlx $acc9,8,$acc9
296 xor $acc6,$t1,$t1
297 srlx $acc10,16,$acc10 !
298 xor $acc7,$t1,$t1
299 srlx $acc11,24,$acc11
300 xor $acc8,$t2,$t2
301 srlx $acc13,8,$acc13
302 xor $acc9,$t2,$t2
303 srlx $acc14,16,$acc14
304 xor $acc10,$t2,$t2
305 srlx $acc15,24,$acc15 !
306 xor $acc11,$t2,$t2
307 xor $acc12,$acc14,$acc14
308 xor $acc13,$t3,$t3
309 srl $t0,21,$acc0
310 xor $acc14,$t3,$t3
311 srl $t1,13,$acc1
312 xor $acc15,$t3,$t3
313
! Loop body, second half: the same lookup pattern with t0-t3 as input
! and s0-s3 as output.
314 and $acc0,2040,$acc0 !
315 srl $t2,5,$acc2
316 and $acc1,2040,$acc1
317 ldx [$tbl+$acc0],$acc0
318 sll $t3,3,$acc3
319 and $acc2,2040,$acc2
320 ldx [$tbl+$acc1],$acc1
321 fmovs %f0,%f0
322 srl $t1,21,$acc4 !
323 and $acc3,2040,$acc3
324 ldx [$tbl+$acc2],$acc2
325 srl $t2,13,$acc5
326 and $acc4,2040,$acc4
327 ldx [$tbl+$acc3],$acc3
328 srl $t3,5,$acc6
329 and $acc5,2040,$acc5
330 ldx [$tbl+$acc4],$acc4 !
331 sll $t0,3,$acc7
332 and $acc6,2040,$acc6
333 ldx [$tbl+$acc5],$acc5
334 srl $t2,21,$acc8
335 and $acc7,2040,$acc7
336 ldx [$tbl+$acc6],$acc6
337 fmovs %f0,%f0
338 srl $t3,13,$acc9 !
339 and $acc8,2040,$acc8
340 ldx [$tbl+$acc7],$acc7
341 srl $t0,5,$acc10
342 and $acc9,2040,$acc9
343 ldx [$tbl+$acc8],$acc8
344 sll $t1,3,$acc11
345 and $acc10,2040,$acc10
346 ldx [$tbl+$acc9],$acc9 !
347 srl $t3,21,$acc12
348 and $acc11,2040,$acc11
349 ldx [$tbl+$acc10],$acc10
350 srl $t0,13,$acc13
351 and $acc12,2040,$acc12
352 ldx [$tbl+$acc11],$acc11
353 fmovs %f0,%f0
354 srl $t1,5,$acc14 !
355 and $acc13,2040,$acc13
356 ldx [$tbl+$acc12],$acc12
357 sll $t2,3,$acc15
358 and $acc14,2040,$acc14
359 ldx [$tbl+$acc13],$acc13
360 srlx $acc1,8,$acc1
361 and $acc15,2040,$acc15
362 ldx [$tbl+$acc14],$acc14 !
363
! Fold into s0-s3, preload key words for the next pass into t0-t3, and
! touch the byte table every 32 bytes (loads into %g0 discard the data).
364 srlx $acc2,16,$acc2
365 xor $acc0,$s0,$s0
366 ldx [$tbl+$acc15],$acc15
367 srlx $acc3,24,$acc3
368 xor $acc1,$s0,$s0
369 ld [$key+16],$t0
370 fmovs %f0,%f0
371 srlx $acc5,8,$acc5 !
372 xor $acc2,$s0,$s0
373 ld [$key+20],$t1
374 srlx $acc6,16,$acc6
375 xor $acc3,$s0,$s0
376 ld [$key+24],$t2
377 srlx $acc7,24,$acc7
378 xor $acc4,$s1,$s1
379 ld [$key+28],$t3 !
380 srlx $acc9,8,$acc9
381 xor $acc5,$s1,$s1
985e4c41 382 ldx [$tbl+2048+0],%g0 ! prefetch te4
7395d852
AP
383 srlx $acc10,16,$acc10
384 xor $acc6,$s1,$s1
985e4c41 385 ldx [$tbl+2048+32],%g0 ! prefetch te4
7395d852
AP
386 srlx $acc11,24,$acc11
387 xor $acc7,$s1,$s1
985e4c41 388 ldx [$tbl+2048+64],%g0 ! prefetch te4
7395d852
AP
389 srlx $acc13,8,$acc13
390 xor $acc8,$s2,$s2
985e4c41 391 ldx [$tbl+2048+96],%g0 ! prefetch te4
7395d852
AP
392 srlx $acc14,16,$acc14 !
393 xor $acc9,$s2,$s2
985e4c41 394 ldx [$tbl+2048+128],%g0 ! prefetch te4
7395d852
AP
395 srlx $acc15,24,$acc15
396 xor $acc10,$s2,$s2
985e4c41 397 ldx [$tbl+2048+160],%g0 ! prefetch te4
7395d852
AP
398 srl $s0,21,$acc0
399 xor $acc11,$s2,$s2
985e4c41 400 ldx [$tbl+2048+192],%g0 ! prefetch te4
7395d852
AP
401 xor $acc12,$acc14,$acc14
402 xor $acc13,$s3,$s3
985e4c41 403 ldx [$tbl+2048+224],%g0 ! prefetch te4
7395d852
AP
404 srl $s1,13,$acc1 !
405 xor $acc14,$s3,$s3
406 xor $acc15,$s3,$s3
407 ba .Lenc_loop
408 and $acc0,2040,$acc0
409
! Final round. First complete the pending table round into t0-t3 and
! load the last four round-key words into s0-s3.
410.align 32
411.Lenc_last:
412 srlx $acc1,8,$acc1 !
413 xor $acc0,$t0,$t0
414 ld [$key+0],$s0
415 srlx $acc2,16,$acc2
416 xor $acc1,$t0,$t0
417 ld [$key+4],$s1
418 srlx $acc3,24,$acc3
419 xor $acc2,$t0,$t0
420 ld [$key+8],$s2 !
421 srlx $acc5,8,$acc5
422 xor $acc3,$t0,$t0
423 ld [$key+12],$s3
424 srlx $acc6,16,$acc6
425 xor $acc4,$t1,$t1
426 srlx $acc7,24,$acc7
427 xor $acc5,$t1,$t1
428 srlx $acc9,8,$acc9 !
429 xor $acc6,$t1,$t1
430 srlx $acc10,16,$acc10
431 xor $acc7,$t1,$t1
432 srlx $acc11,24,$acc11
433 xor $acc8,$t2,$t2
434 srlx $acc13,8,$acc13
435 xor $acc9,$t2,$t2
436 srlx $acc14,16,$acc14 !
437 xor $acc10,$t2,$t2
438 srlx $acc15,24,$acc15
439 xor $acc11,$t2,$t2
440 xor $acc12,$acc14,$acc14
441 xor $acc13,$t3,$t3
442 srl $t0,24,$acc0
443 xor $acc14,$t3,$t3
444 srl $t1,16,$acc1 !
445 xor $acc15,$t3,$t3
446
! Sixteen single-byte lookups in the byte table (pointer now held in the
! former counter register), using the same byte selection as the loop
! but with unscaled indices.
447 srl $t2,8,$acc2
448 and $acc1,255,$acc1
449 ldub [$rounds+$acc0],$acc0
450 srl $t1,24,$acc4
451 and $acc2,255,$acc2
452 ldub [$rounds+$acc1],$acc1
453 srl $t2,16,$acc5 !
454 and $t3,255,$acc3
455 ldub [$rounds+$acc2],$acc2
456 ldub [$rounds+$acc3],$acc3
457 srl $t3,8,$acc6
458 and $acc5,255,$acc5
459 ldub [$rounds+$acc4],$acc4
460 fmovs %f0,%f0
461 srl $t2,24,$acc8 !
462 and $acc6,255,$acc6
463 ldub [$rounds+$acc5],$acc5
464 srl $t3,16,$acc9
465 and $t0,255,$acc7
466 ldub [$rounds+$acc6],$acc6
467 ldub [$rounds+$acc7],$acc7
468 fmovs %f0,%f0
469 srl $t0,8,$acc10 !
470 and $acc9,255,$acc9
471 ldub [$rounds+$acc8],$acc8
472 srl $t3,24,$acc12
473 and $acc10,255,$acc10
474 ldub [$rounds+$acc9],$acc9
475 srl $t0,16,$acc13
476 and $t1,255,$acc11
477 ldub [$rounds+$acc10],$acc10 !
478 srl $t1,8,$acc14
479 and $acc13,255,$acc13
480 ldub [$rounds+$acc11],$acc11
481 ldub [$rounds+$acc12],$acc12
482 and $acc14,255,$acc14
483 ldub [$rounds+$acc13],$acc13
484 and $t2,255,$acc15
485 ldub [$rounds+$acc14],$acc14 !
486
! Shift each substituted byte back to its position and xor it into the
! round-key word. %i7 is reloaded only after the last use of the
! byte-table pointer that shared the register.
487 sll $acc0,24,$acc0
488 xor $acc3,$s0,$s0
489 ldub [$rounds+$acc15],$acc15
490 sll $acc1,16,$acc1
491 xor $acc0,$s0,$s0
492 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
493 fmovs %f0,%f0
494 sll $acc2,8,$acc2 !
495 xor $acc1,$s0,$s0
496 sll $acc4,24,$acc4
497 xor $acc2,$s0,$s0
498 sll $acc5,16,$acc5
499 xor $acc7,$s1,$s1
500 sll $acc6,8,$acc6
501 xor $acc4,$s1,$s1
502 sll $acc8,24,$acc8 !
503 xor $acc5,$s1,$s1
504 sll $acc9,16,$acc9
505 xor $acc11,$s2,$s2
506 sll $acc10,8,$acc10
507 xor $acc6,$s1,$s1
508 sll $acc12,24,$acc12
509 xor $acc8,$s2,$s2
510 sll $acc13,16,$acc13 !
511 xor $acc9,$s2,$s2
512 sll $acc14,8,$acc14
513 xor $acc10,$s2,$s2
514 xor $acc12,$acc14,$acc14
515 xor $acc13,$s3,$s3
516 xor $acc14,$s3,$s3
517 xor $acc15,$s3,$s3
518
519 ret
520 restore
521.type _sparcv9_AES_encrypt,#function
522.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
523
! AES_encrypt: exported entry point, processes one 16-byte block.
! In:  %o0 = source block, %o1 = destination block,
!      %o2 = key schedule (handed unchanged to the core routine).
! When both block pointers are 4-byte aligned the block is moved with
! four word loads and stores; otherwise it is read and written one byte
! at a time, most significant byte of each 32-bit word first.
524.align 32
525.globl AES_encrypt
526AES_encrypt:
527 or %o0,%o1,%g1
528 andcc %g1,3,%g0
! The save sits in the branch delay slot, so the register window is
! opened on both paths.
529 bnz,pn %xcc,.Lunaligned_enc
530 save %sp,-$frame,%sp
531
532 ld [%i0+0],%o0
533 ld [%i0+4],%o1
534 ld [%i0+8],%o2
535 ld [%i0+12],%o3
536
! Position-independent table address: the call leaves the address of
! label 1 in %o7 and its delay slot adds the distance to AES_Te. The
! second call passes the key schedule in %o5 from its delay slot.
4c78bc05 5371: call .+8
e22b8648 538 add %o7,AES_Te-1b,%o4
4c78bc05
AP
539 call _sparcv9_AES_encrypt
540 mov %i2,%o5
7395d852
AP
541
! Result words come back in %o0-%o3.
542 st %o0,[%i1+0]
543 st %o1,[%i1+4]
544 st %o2,[%i1+8]
545 st %o3,[%i1+12]
546
547 ret
548 restore
549
! Unaligned path: assemble four 32-bit words from sixteen byte loads.
550.align 32
551.Lunaligned_enc:
552 ldub [%i0+0],%l0
553 ldub [%i0+1],%l1
554 ldub [%i0+2],%l2
555
556 sll %l0,24,%l0
557 ldub [%i0+3],%l3
558 sll %l1,16,%l1
559 ldub [%i0+4],%l4
560 sll %l2,8,%l2
561 or %l1,%l0,%l0
562 ldub [%i0+5],%l5
563 sll %l4,24,%l4
564 or %l3,%l2,%l2
565 ldub [%i0+6],%l6
566 sll %l5,16,%l5
567 or %l0,%l2,%o0
568 ldub [%i0+7],%l7
569
570 sll %l6,8,%l6
571 or %l5,%l4,%l4
572 ldub [%i0+8],%l0
573 or %l7,%l6,%l6
574 ldub [%i0+9],%l1
575 or %l4,%l6,%o1
576 ldub [%i0+10],%l2
577
578 sll %l0,24,%l0
579 ldub [%i0+11],%l3
580 sll %l1,16,%l1
581 ldub [%i0+12],%l4
582 sll %l2,8,%l2
583 or %l1,%l0,%l0
584 ldub [%i0+13],%l5
585 sll %l4,24,%l4
586 or %l3,%l2,%l2
587 ldub [%i0+14],%l6
588 sll %l5,16,%l5
589 or %l0,%l2,%o2
590 ldub [%i0+15],%l7
591
592 sll %l6,8,%l6
593 or %l5,%l4,%l4
594 or %l7,%l6,%l6
595 or %l4,%l6,%o3
596
! Same table-address and call sequence as on the aligned path.
4c78bc05 5971: call .+8
e22b8648 598 add %o7,AES_Te-1b,%o4
4c78bc05
AP
599 call _sparcv9_AES_encrypt
600 mov %i2,%o5
7395d852
AP
601
! Scatter the four result words back out as sixteen bytes; stb stores
! the low byte, so the unshifted word supplies the last byte of each
! group.
602 srl %o0,24,%l0
603 srl %o0,16,%l1
604 stb %l0,[%i1+0]
605 srl %o0,8,%l2
606 stb %l1,[%i1+1]
607 stb %l2,[%i1+2]
608 srl %o1,24,%l4
609 stb %o0,[%i1+3]
610
611 srl %o1,16,%l5
612 stb %l4,[%i1+4]
613 srl %o1,8,%l6
614 stb %l5,[%i1+5]
615 stb %l6,[%i1+6]
616 srl %o2,24,%l0
617 stb %o1,[%i1+7]
618
619 srl %o2,16,%l1
620 stb %l0,[%i1+8]
621 srl %o2,8,%l2
622 stb %l1,[%i1+9]
623 stb %l2,[%i1+10]
624 srl %o3,24,%l4
625 stb %o2,[%i1+11]
626
627 srl %o3,16,%l5
628 stb %l4,[%i1+12]
629 srl %o3,8,%l6
630 stb %l5,[%i1+13]
631 stb %l6,[%i1+14]
632 stb %o3,[%i1+15]
633
634 ret
635 restore
636.type AES_encrypt,#function
637.size AES_encrypt,(.-AES_encrypt)
638
639___
640
# Open the 256-byte-aligned AES_Td table; its entries are appended by the
# _data_word call that follows.
641$code.=<<___;
985e4c41 642.align 256
7395d852
AP
643AES_Td:
644___
645&_data_word(
646 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
647 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
648 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
649 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
650 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
651 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
652 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
653 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
654 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
655 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
656 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
657 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
658 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
659 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
660 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
661 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
662 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
663 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
664 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
665 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
666 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
667 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
668 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
669 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
670 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
671 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
672 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
673 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
674 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
675 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
676 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
677 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
678 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
679 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
680 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
681 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
682 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
683 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
684 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
685 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
686 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
687 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
688 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
689 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
690 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
691 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
692 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
693 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
694 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
695 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
696 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
697 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
698 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
699 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
700 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
701 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
702 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
703 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
704 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
705 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
706 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
707 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
708 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
709 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
710$code.=<<___;
711 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
712 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
713 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
714 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
715 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
716 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
717 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
718 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
719 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
720 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
721 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
722 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
723 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
724 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
725 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
726 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
727 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
728 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
729 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
730 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
731 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
732 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
733 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
734 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
735 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
736 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
737 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
738 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
739 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
740 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
741 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
742 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
743.type AES_Td,#object
744.size AES_Td,(.-AES_Td)
745
! _sparcv9_AES_decrypt: internal single-block decryption core.
! In (register names as seen after the save below):
!   %i0-%i3  the four 32-bit state words s0-s3
!   %i4      table base: 256 eight-byte entries followed by a 256-byte
!            table at offset 2048
!   %i5      key schedule; the 32-bit round count is read from offset 240
! Out: %i0-%i3 hold the result (the caller finds it in %o0-%o3 once the
!      closing restore has executed).
! Table indices are byte values scaled by the entry size, hence the
! shift amounts 21/13/5/3 and the mask 2040 (255*8). Each entry is
! fetched as 64 bits and shifted right by 0/8/16/24 before being xored
! into a 32-bit word.
! %i7 doubles as the round counter, so the return address is parked in
! the extra stack slot and reloaded before ret. All %l registers,
! %o0-%o5, %o7 and %g1-%g5 are used as scratch.
746.align 64
747.skip 16
748_sparcv9_AES_decrypt:
749 save %sp,-$frame-$locals,%sp
750 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
! Initial key addition with key words 0-3; key words 4-7 are preloaded
! into t0-t3 for the first round. The counter is the round count halved
! because every loop pass covers two rounds.
751 ld [$key+240],$rounds
752 ld [$key+0],$t0
753 ld [$key+4],$t1 !
754 ld [$key+8],$t2
755 ld [$key+12],$t3
756 srl $rounds,1,$rounds
757 xor $t0,$s0,$s0
758 ld [$key+16],$t0
759 xor $t1,$s1,$s1
760 ld [$key+20],$t1
761 srl $s0,21,$acc0 !
762 xor $t2,$s2,$s2
763 ld [$key+24],$t2
764 xor $t3,$s3,$s3
765 and $acc0,2040,$acc0
766 ld [$key+28],$t3
767 srl $s3,13,$acc1
768 nop
! Loop body, first half: sixteen table lookups indexed by the bytes of
! s0-s3. Output word j takes byte 3 of s[j], byte 2 of s[j-1], byte 1 of
! s[j-2] and byte 0 of s[j-3] (indices mod 4).
769.Ldec_loop:
770 srl $s2,5,$acc2 !
771 and $acc1,2040,$acc1
772 ldx [$tbl+$acc0],$acc0
773 sll $s1,3,$acc3
774 and $acc2,2040,$acc2
775 ldx [$tbl+$acc1],$acc1
776 srl $s1,21,$acc4
777 and $acc3,2040,$acc3
778 ldx [$tbl+$acc2],$acc2 !
779 srl $s0,13,$acc5
780 and $acc4,2040,$acc4
781 ldx [$tbl+$acc3],$acc3
782 srl $s3,5,$acc6
783 and $acc5,2040,$acc5
784 ldx [$tbl+$acc4],$acc4
785 fmovs %f0,%f0
786 sll $s2,3,$acc7 !
787 and $acc6,2040,$acc6
788 ldx [$tbl+$acc5],$acc5
789 srl $s2,21,$acc8
790 and $acc7,2040,$acc7
791 ldx [$tbl+$acc6],$acc6
792 srl $s1,13,$acc9
793 and $acc8,2040,$acc8
794 ldx [$tbl+$acc7],$acc7 !
795 srl $s0,5,$acc10
796 and $acc9,2040,$acc9
797 ldx [$tbl+$acc8],$acc8
798 sll $s3,3,$acc11
799 and $acc10,2040,$acc10
800 ldx [$tbl+$acc9],$acc9
801 fmovs %f0,%f0
802 srl $s3,21,$acc12 !
803 and $acc11,2040,$acc11
804 ldx [$tbl+$acc10],$acc10
805 srl $s2,13,$acc13
806 and $acc12,2040,$acc12
807 ldx [$tbl+$acc11],$acc11
808 srl $s1,5,$acc14
809 and $acc13,2040,$acc13
810 ldx [$tbl+$acc12],$acc12 !
811 sll $s0,3,$acc15
812 and $acc14,2040,$acc14
813 ldx [$tbl+$acc13],$acc13
814 and $acc15,2040,$acc15
815 add $key,32,$key
816 ldx [$tbl+$acc14],$acc14
817 fmovs %f0,%f0
818 subcc $rounds,1,$rounds !
819 ldx [$tbl+$acc15],$acc15
! Counter exhausted: the lookups just issued are finished at .Ldec_last.
! The delay slot is annulled unless the branch is taken; when taken it
! turns the counter register into a pointer to the byte table at
! table base + 2048.
820 bz,a,pn %icc,.Ldec_last
821 add $tbl,2048,$rounds
822
! Fold the sixteen lookups into t0-t3 (which already hold round-key
! words) while loading the next four round-key words into s0-s3.
823 srlx $acc1,8,$acc1
824 xor $acc0,$t0,$t0
825 ld [$key+0],$s0
826 fmovs %f0,%f0
827 srlx $acc2,16,$acc2 !
828 xor $acc1,$t0,$t0
829 ld [$key+4],$s1
830 srlx $acc3,24,$acc3
831 xor $acc2,$t0,$t0
832 ld [$key+8],$s2
833 srlx $acc5,8,$acc5
834 xor $acc3,$t0,$t0
835 ld [$key+12],$s3 !
836 srlx $acc6,16,$acc6
837 xor $acc4,$t1,$t1
838 fmovs %f0,%f0
839 srlx $acc7,24,$acc7
840 xor $acc5,$t1,$t1
841 srlx $acc9,8,$acc9
842 xor $acc6,$t1,$t1
843 srlx $acc10,16,$acc10 !
844 xor $acc7,$t1,$t1
845 srlx $acc11,24,$acc11
846 xor $acc8,$t2,$t2
847 srlx $acc13,8,$acc13
848 xor $acc9,$t2,$t2
849 srlx $acc14,16,$acc14
850 xor $acc10,$t2,$t2
851 srlx $acc15,24,$acc15 !
852 xor $acc11,$t2,$t2
853 xor $acc12,$acc14,$acc14
854 xor $acc13,$t3,$t3
855 srl $t0,21,$acc0
856 xor $acc14,$t3,$t3
857 xor $acc15,$t3,$t3
858 srl $t3,13,$acc1
859
! Loop body, second half: the same lookup pattern with t0-t3 as input
! and s0-s3 as output.
860 and $acc0,2040,$acc0 !
861 srl $t2,5,$acc2
862 and $acc1,2040,$acc1
863 ldx [$tbl+$acc0],$acc0
864 sll $t1,3,$acc3
865 and $acc2,2040,$acc2
866 ldx [$tbl+$acc1],$acc1
867 fmovs %f0,%f0
868 srl $t1,21,$acc4 !
869 and $acc3,2040,$acc3
870 ldx [$tbl+$acc2],$acc2
871 srl $t0,13,$acc5
872 and $acc4,2040,$acc4
873 ldx [$tbl+$acc3],$acc3
874 srl $t3,5,$acc6
875 and $acc5,2040,$acc5
876 ldx [$tbl+$acc4],$acc4 !
877 sll $t2,3,$acc7
878 and $acc6,2040,$acc6
879 ldx [$tbl+$acc5],$acc5
880 srl $t2,21,$acc8
881 and $acc7,2040,$acc7
882 ldx [$tbl+$acc6],$acc6
883 fmovs %f0,%f0
884 srl $t1,13,$acc9 !
885 and $acc8,2040,$acc8
886 ldx [$tbl+$acc7],$acc7
887 srl $t0,5,$acc10
888 and $acc9,2040,$acc9
889 ldx [$tbl+$acc8],$acc8
890 sll $t3,3,$acc11
891 and $acc10,2040,$acc10
892 ldx [$tbl+$acc9],$acc9 !
893 srl $t3,21,$acc12
894 and $acc11,2040,$acc11
895 ldx [$tbl+$acc10],$acc10
896 srl $t2,13,$acc13
897 and $acc12,2040,$acc12
898 ldx [$tbl+$acc11],$acc11
899 fmovs %f0,%f0
900 srl $t1,5,$acc14 !
901 and $acc13,2040,$acc13
902 ldx [$tbl+$acc12],$acc12
903 sll $t0,3,$acc15
904 and $acc14,2040,$acc14
905 ldx [$tbl+$acc13],$acc13
906 srlx $acc1,8,$acc1
907 and $acc15,2040,$acc15
908 ldx [$tbl+$acc14],$acc14 !
909
! Fold into s0-s3, preload key words for the next pass into t0-t3, and
! touch the byte table every 32 bytes (loads into %g0 discard the data).
910 srlx $acc2,16,$acc2
911 xor $acc0,$s0,$s0
912 ldx [$tbl+$acc15],$acc15
913 srlx $acc3,24,$acc3
914 xor $acc1,$s0,$s0
915 ld [$key+16],$t0
916 fmovs %f0,%f0
917 srlx $acc5,8,$acc5 !
918 xor $acc2,$s0,$s0
919 ld [$key+20],$t1
920 srlx $acc6,16,$acc6
921 xor $acc3,$s0,$s0
922 ld [$key+24],$t2
923 srlx $acc7,24,$acc7
924 xor $acc4,$s1,$s1
925 ld [$key+28],$t3 !
926 srlx $acc9,8,$acc9
927 xor $acc5,$s1,$s1
985e4c41 928 ldx [$tbl+2048+0],%g0 ! prefetch td4
7395d852
AP
929 srlx $acc10,16,$acc10
930 xor $acc6,$s1,$s1
985e4c41 931 ldx [$tbl+2048+32],%g0 ! prefetch td4
7395d852
AP
932 srlx $acc11,24,$acc11
933 xor $acc7,$s1,$s1
985e4c41 934 ldx [$tbl+2048+64],%g0 ! prefetch td4
7395d852
AP
935 srlx $acc13,8,$acc13
936 xor $acc8,$s2,$s2
985e4c41 937 ldx [$tbl+2048+96],%g0 ! prefetch td4
7395d852
AP
938 srlx $acc14,16,$acc14 !
939 xor $acc9,$s2,$s2
985e4c41 940 ldx [$tbl+2048+128],%g0 ! prefetch td4
7395d852
AP
941 srlx $acc15,24,$acc15
942 xor $acc10,$s2,$s2
985e4c41 943 ldx [$tbl+2048+160],%g0 ! prefetch td4
7395d852
AP
944 srl $s0,21,$acc0
945 xor $acc11,$s2,$s2
985e4c41 946 ldx [$tbl+2048+192],%g0 ! prefetch td4
7395d852
AP
947 xor $acc12,$acc14,$acc14
948 xor $acc13,$s3,$s3
985e4c41 949 ldx [$tbl+2048+224],%g0 ! prefetch td4
7395d852
AP
950 and $acc0,2040,$acc0 !
951 xor $acc14,$s3,$s3
952 xor $acc15,$s3,$s3
953 ba .Ldec_loop
954 srl $s3,13,$acc1
955
! Final round. First complete the pending table round into t0-t3 and
! load the last four round-key words into s0-s3.
956.align 32
957.Ldec_last:
958 srlx $acc1,8,$acc1 !
959 xor $acc0,$t0,$t0
960 ld [$key+0],$s0
961 srlx $acc2,16,$acc2
962 xor $acc1,$t0,$t0
963 ld [$key+4],$s1
964 srlx $acc3,24,$acc3
965 xor $acc2,$t0,$t0
966 ld [$key+8],$s2 !
967 srlx $acc5,8,$acc5
968 xor $acc3,$t0,$t0
969 ld [$key+12],$s3
970 srlx $acc6,16,$acc6
971 xor $acc4,$t1,$t1
972 srlx $acc7,24,$acc7
973 xor $acc5,$t1,$t1
974 srlx $acc9,8,$acc9 !
975 xor $acc6,$t1,$t1
976 srlx $acc10,16,$acc10
977 xor $acc7,$t1,$t1
978 srlx $acc11,24,$acc11
979 xor $acc8,$t2,$t2
980 srlx $acc13,8,$acc13
981 xor $acc9,$t2,$t2
982 srlx $acc14,16,$acc14 !
983 xor $acc10,$t2,$t2
984 srlx $acc15,24,$acc15
985 xor $acc11,$t2,$t2
986 xor $acc12,$acc14,$acc14
987 xor $acc13,$t3,$t3
988 srl $t0,24,$acc0
989 xor $acc14,$t3,$t3
990 xor $acc15,$t3,$t3 !
991 srl $t3,16,$acc1
992
! Sixteen single-byte lookups in the byte table (pointer now held in the
! former counter register), using the same byte selection as the loop
! but with unscaled indices.
993 srl $t2,8,$acc2
994 and $acc1,255,$acc1
995 ldub [$rounds+$acc0],$acc0
996 srl $t1,24,$acc4
997 and $acc2,255,$acc2
998 ldub [$rounds+$acc1],$acc1
999 srl $t0,16,$acc5 !
1000 and $t1,255,$acc3
1001 ldub [$rounds+$acc2],$acc2
1002 ldub [$rounds+$acc3],$acc3
1003 srl $t3,8,$acc6
1004 and $acc5,255,$acc5
1005 ldub [$rounds+$acc4],$acc4
1006 fmovs %f0,%f0
1007 srl $t2,24,$acc8 !
1008 and $acc6,255,$acc6
1009 ldub [$rounds+$acc5],$acc5
1010 srl $t1,16,$acc9
1011 and $t2,255,$acc7
1012 ldub [$rounds+$acc6],$acc6
1013 ldub [$rounds+$acc7],$acc7
1014 fmovs %f0,%f0
1015 srl $t0,8,$acc10 !
1016 and $acc9,255,$acc9
1017 ldub [$rounds+$acc8],$acc8
1018 srl $t3,24,$acc12
1019 and $acc10,255,$acc10
1020 ldub [$rounds+$acc9],$acc9
1021 srl $t2,16,$acc13
1022 and $t3,255,$acc11
1023 ldub [$rounds+$acc10],$acc10 !
1024 srl $t1,8,$acc14
1025 and $acc13,255,$acc13
1026 ldub [$rounds+$acc11],$acc11
1027 ldub [$rounds+$acc12],$acc12
1028 and $acc14,255,$acc14
1029 ldub [$rounds+$acc13],$acc13
1030 and $t0,255,$acc15
1031 ldub [$rounds+$acc14],$acc14 !
1032
! Shift each substituted byte back to its position and xor it into the
! round-key word. %i7 is reloaded only after the last use of the
! byte-table pointer that shared the register.
1033 sll $acc0,24,$acc0
1034 xor $acc3,$s0,$s0
1035 ldub [$rounds+$acc15],$acc15
1036 sll $acc1,16,$acc1
1037 xor $acc0,$s0,$s0
1038 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1039 fmovs %f0,%f0
1040 sll $acc2,8,$acc2 !
1041 xor $acc1,$s0,$s0
1042 sll $acc4,24,$acc4
1043 xor $acc2,$s0,$s0
1044 sll $acc5,16,$acc5
1045 xor $acc7,$s1,$s1
1046 sll $acc6,8,$acc6
1047 xor $acc4,$s1,$s1
1048 sll $acc8,24,$acc8 !
1049 xor $acc5,$s1,$s1
1050 sll $acc9,16,$acc9
1051 xor $acc11,$s2,$s2
1052 sll $acc10,8,$acc10
1053 xor $acc6,$s1,$s1
1054 sll $acc12,24,$acc12
1055 xor $acc8,$s2,$s2
1056 sll $acc13,16,$acc13 !
1057 xor $acc9,$s2,$s2
1058 sll $acc14,8,$acc14
1059 xor $acc10,$s2,$s2
1060 xor $acc12,$acc14,$acc14
1061 xor $acc13,$s3,$s3
1062 xor $acc14,$s3,$s3
1063 xor $acc15,$s3,$s3
1064
1065 ret
1066 restore
1067.type _sparcv9_AES_decrypt,#function
1068.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1069
! AES_decrypt: exported entry point, processes one 16-byte block.
! In:  %o0 = source block, %o1 = destination block,
!      %o2 = key schedule (handed unchanged to the core routine).
! When both block pointers are 4-byte aligned the block is moved with
! four word loads and stores; otherwise it is read and written one byte
! at a time, most significant byte of each 32-bit word first.
1070.align 32
1071.globl AES_decrypt
1072AES_decrypt:
1073 or %o0,%o1,%g1
1074 andcc %g1,3,%g0
! The save sits in the branch delay slot, so the register window is
! opened on both paths.
1075 bnz,pn %xcc,.Lunaligned_dec
1076 save %sp,-$frame,%sp
1077
1078 ld [%i0+0],%o0
1079 ld [%i0+4],%o1
1080 ld [%i0+8],%o2
1081 ld [%i0+12],%o3
1082
! Position-independent table address: the call leaves the address of
! label 1 in %o7 and its delay slot adds the distance to AES_Td. The
! second call passes the key schedule in %o5 from its delay slot.
4c78bc05 10831: call .+8
e22b8648 1084 add %o7,AES_Td-1b,%o4
4c78bc05
AP
1085 call _sparcv9_AES_decrypt
1086 mov %i2,%o5
7395d852
AP
1087
! Result words come back in %o0-%o3.
1088 st %o0,[%i1+0]
1089 st %o1,[%i1+4]
1090 st %o2,[%i1+8]
1091 st %o3,[%i1+12]
1092
1093 ret
1094 restore
1095
! Unaligned path: assemble four 32-bit words from sixteen byte loads.
1096.align 32
1097.Lunaligned_dec:
1098 ldub [%i0+0],%l0
1099 ldub [%i0+1],%l1
1100 ldub [%i0+2],%l2
1101
1102 sll %l0,24,%l0
1103 ldub [%i0+3],%l3
1104 sll %l1,16,%l1
1105 ldub [%i0+4],%l4
1106 sll %l2,8,%l2
1107 or %l1,%l0,%l0
1108 ldub [%i0+5],%l5
1109 sll %l4,24,%l4
1110 or %l3,%l2,%l2
1111 ldub [%i0+6],%l6
1112 sll %l5,16,%l5
1113 or %l0,%l2,%o0
1114 ldub [%i0+7],%l7
1115
1116 sll %l6,8,%l6
1117 or %l5,%l4,%l4
1118 ldub [%i0+8],%l0
1119 or %l7,%l6,%l6
1120 ldub [%i0+9],%l1
1121 or %l4,%l6,%o1
1122 ldub [%i0+10],%l2
1123
1124 sll %l0,24,%l0
1125 ldub [%i0+11],%l3
1126 sll %l1,16,%l1
1127 ldub [%i0+12],%l4
1128 sll %l2,8,%l2
1129 or %l1,%l0,%l0
1130 ldub [%i0+13],%l5
1131 sll %l4,24,%l4
1132 or %l3,%l2,%l2
1133 ldub [%i0+14],%l6
1134 sll %l5,16,%l5
1135 or %l0,%l2,%o2
1136 ldub [%i0+15],%l7
1137
1138 sll %l6,8,%l6
1139 or %l5,%l4,%l4
1140 or %l7,%l6,%l6
1141 or %l4,%l6,%o3
1142
! Same table-address and call sequence as on the aligned path.
4c78bc05 11431: call .+8
e22b8648 1144 add %o7,AES_Td-1b,%o4
4c78bc05
AP
1145 call _sparcv9_AES_decrypt
1146 mov %i2,%o5
7395d852
AP
1147
! Scatter the four result words back out as sixteen bytes; stb stores
! the low byte, so the unshifted word supplies the last byte of each
! group.
1148 srl %o0,24,%l0
1149 srl %o0,16,%l1
1150 stb %l0,[%i1+0]
1151 srl %o0,8,%l2
1152 stb %l1,[%i1+1]
1153 stb %l2,[%i1+2]
1154 srl %o1,24,%l4
1155 stb %o0,[%i1+3]
1156
1157 srl %o1,16,%l5
1158 stb %l4,[%i1+4]
1159 srl %o1,8,%l6
1160 stb %l5,[%i1+5]
1161 stb %l6,[%i1+6]
1162 srl %o2,24,%l0
1163 stb %o1,[%i1+7]
1164
1165 srl %o2,16,%l1
1166 stb %l0,[%i1+8]
1167 srl %o2,8,%l2
1168 stb %l1,[%i1+9]
1169 stb %l2,[%i1+10]
1170 srl %o3,24,%l4
1171 stb %o2,[%i1+11]
1172
1173 srl %o3,16,%l5
1174 stb %l4,[%i1+12]
1175 srl %o3,8,%l6
1176 stb %l5,[%i1+13]
1177 stb %l6,[%i1+14]
1178 stb %o3,[%i1+15]
1179
1180 ret
1181 restore
1182.type AES_decrypt,#function
1183.size AES_decrypt,(.-AES_decrypt)
1184___
1185
1186# fmovs instructions substituting for FP nops were originally added
1187# to meet specific instruction alignment requirements to maximize ILP.
1188# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1189# an undesired effect, so just omit them and sacrifice a fraction of
1190# a percent in performance...
# Remove every FP no-op from the generated text: everything from the
# mnemonic to the end of its line is deleted (the line break stays).
$code =~ s/fmovs[^\n]*$//mg;

# Write the assembly out; closing STDOUT explicitly turns a failed flush
# into a fatal error instead of a silently truncated file.
print $code;
close(STDOUT) || die "error closing STDOUT: $!"; # ensure flush