]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aes-s390x.pl
ssl/ssl_ciph.c: interim solution for assertion in d1_pkt.c(444).
[thirdparty/openssl.git] / crypto / aes / asm / aes-s390x.pl
CommitLineData
a2a54ffc
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
76c828c6 19# flow proceeds. S-boxes are compressed to 2KB[+256B].
a2a54ffc
AP
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor are multiple blocks ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
251718e4 26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
a2a54ffc
AP
27# support is implemented.
28
76c828c6
AP
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
8626230a
AP
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
4e52b984
AP
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
40# issue z10 makes it impossible to eliminate the interlock condition:
41# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
8626230a
AP
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
26064d7f 47# for keys longer than 128 bits.
8626230a
AP
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
874a3757
AP
53# May 2010.
54#
26064d7f
AP
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over "generic" counter mode routine relying
57# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that exact throughput value depends on current stack
59# frame alignment within 4KB page. In worst case you get ~75% of the
60# maximum, but *on average* it would be as much as ~98%. Meaning that
61# worst case is unlikely, it's like hitting a ravine on a plateau.
874a3757 62
e822c756
AP
63# November 2010.
64#
65# Adapt for -m31 build. If kernel supports what's called "highgprs"
66# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67# instructions and achieve "64-bit" performance even in 31-bit legacy
68# application context. The feature is not specific to any particular
69# processor, as long as it's "z-CPU". Latter implies that the code
70# remains z/Architecture specific. On z990 it was measured to perform
71# 2x better than code generated by gcc 4.3.
72
0ab8fd58
AP
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
0c237e42
AP
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
83# vanilla km-based code, 37% - at most like 512-bytes block size.
0ab8fd58 84
e822c756
AP
# The first command-line argument names the target flavour.
$flavour = shift;

# Any flavour containing "31" or "32" gets 4-byte size_t slots and an empty
# mnemonic suffix; everything else gets 8-byte slots and the "g" suffix
# that is spliced into mnemonics such as stm${g} and l${g} further down.
($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
94
1cbdca7b
AP
# Walk the remaining command-line arguments and stop at the first one that
# looks like a plain output file name ("name.ext"); anything else (compiler
# flags and the like) is skipped.  $output ends up false when no such
# argument exists.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Send the generated assembly to that file.  The three-argument form keeps
# the file name from ever being parsed as part of the open mode, and a failed
# open is reported instead of being silently ignored.  When no file name was
# given STDOUT is left untouched, so the output still goes wherever the
# caller pointed it.
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}
97
a61710b8
AP
# 0 keeps the hardware-assisted code paths (the heredocs guarded by
# "if (!$softonly)"); a true value would leave only the software code.
$softonly=0;

# Symbolic names for the sixteen general registers.  Several names share a
# register on purpose: $mask is %r0, $inp is %r2, $out and $bits are %r3.
($t0,$t1,$t2,$t3)=map("%r$_",(0..3));
$mask=$t0;
$inp=$t2;
$out=$bits=$t3;
($key,$i1,$i2,$i3)=map("%r$_",(4..7));
($s0,$s1,$s2,$s3)=map("%r$_",(8..11));
($tbl,$rounds,$ra,$sp)=map("%r$_",(12..15));

# Frame size: sixteen $SIZE_T-sized slots plus four 8-byte slots.
$stdframe=16*$SIZE_T+4*8;
118
a2a54ffc
AP
# Append one ".long" directive per argument to the global $code buffer,
# writing each 32-bit value twice ("0xV,0xV") so that every table entry
# occupies 8 bytes.  The round code indexes the tables in 8-byte steps and
# loads 4 bytes at byte offsets 0..3 inside an entry.
#
# Arguments: a list of integers; processing stops at the first undefined
# element.  Nothing useful is returned.
#
# The sub deliberately has no prototype: an empty "()" prototype would claim
# that it takes no arguments and would only work through the &-call form.
sub _data_word
{
	foreach my $word (@_) {
		last unless defined($word);
		$code.=sprintf".long\t0x%08x,0x%08x\n",$word,$word;
	}
}
123
124$code=<<___;
125.text
126
127.type AES_Te,\@object
8626230a 128.align 256
a2a54ffc
AP
129AES_Te:
130___
131&_data_word(
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
76c828c6
AP
197# Te4[256]
198.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
230# rcon[]
231.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
232.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
233.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
8626230a 234.align 256
a2a54ffc
AP
235.size AES_Te,.-AES_Te
236
76c828c6 237# void AES_encrypt(const unsigned char *inp, unsigned char *out,
a2a54ffc
AP
238# const AES_KEY *key) {
239.globl AES_encrypt
240.type AES_encrypt,\@function
241AES_encrypt:
a61710b8
AP
242___
243$code.=<<___ if (!$softonly);
8626230a
AP
244 l %r0,240($key)
245 lhi %r1,16
246 clr %r0,%r1
247 jl .Lesoft
248
a2a54ffc 249 la %r1,0($key)
3f6916cf 250 #la %r2,0($inp)
a2a54ffc
AP
251 la %r4,0($out)
252 lghi %r3,16 # single block length
253 .long 0xb92e0042 # km %r4,%r2
8626230a
AP
254 brc 1,.-4 # can this happen?
255 br %r14
256.align 64
a2a54ffc 257.Lesoft:
a61710b8
AP
258___
259$code.=<<___;
e822c756 260 stm${g} %r3,$ra,3*$SIZE_T($sp)
a2a54ffc
AP
261
262 llgf $s0,0($inp)
263 llgf $s1,4($inp)
264 llgf $s2,8($inp)
265 llgf $s3,12($inp)
266
8626230a 267 larl $tbl,AES_Te
a2a54ffc
AP
268 bras $ra,_s390x_AES_encrypt
269
e822c756 270 l${g} $out,3*$SIZE_T($sp)
a2a54ffc
AP
271 st $s0,0($out)
272 st $s1,4($out)
273 st $s2,8($out)
274 st $s3,12($out)
275
e822c756 276 lm${g} %r6,$ra,6*$SIZE_T($sp)
76c828c6 277 br $ra
a2a54ffc
AP
278.size AES_encrypt,.-AES_encrypt
279
280.type _s390x_AES_encrypt,\@function
281.align 16
282_s390x_AES_encrypt:
0ab8fd58 283 st${g} $ra,15*$SIZE_T($sp)
a2a54ffc
AP
284 x $s0,0($key)
285 x $s1,4($key)
286 x $s2,8($key)
287 x $s3,12($key)
288 l $rounds,240($key)
8626230a 289 llill $mask,`0xff<<3`
a2a54ffc 290 aghi $rounds,-1
8626230a
AP
291 j .Lenc_loop
292.align 16
a2a54ffc 293.Lenc_loop:
8626230a
AP
294 sllg $t1,$s0,`0+3`
295 srlg $t2,$s0,`8-3`
296 srlg $t3,$s0,`16-3`
a2a54ffc
AP
297 srl $s0,`24-3`
298 nr $s0,$mask
8626230a
AP
299 ngr $t1,$mask
300 nr $t2,$mask
301 nr $t3,$mask
a2a54ffc
AP
302
303 srlg $i1,$s1,`16-3` # i0
304 sllg $i2,$s1,`0+3`
305 srlg $i3,$s1,`8-3`
306 srl $s1,`24-3`
307 nr $i1,$mask
308 nr $s1,$mask
309 ngr $i2,$mask
310 nr $i3,$mask
8626230a
AP
311
312 l $s0,0($s0,$tbl) # Te0[s0>>24]
313 l $t1,1($t1,$tbl) # Te3[s0>>0]
314 l $t2,2($t2,$tbl) # Te2[s0>>8]
315 l $t3,3($t3,$tbl) # Te1[s0>>16]
316
a2a54ffc
AP
317 x $s0,3($i1,$tbl) # Te1[s1>>16]
318 l $s1,0($s1,$tbl) # Te0[s1>>24]
319 x $t2,1($i2,$tbl) # Te3[s1>>0]
320 x $t3,2($i3,$tbl) # Te2[s1>>8]
a2a54ffc
AP
321
322 srlg $i1,$s2,`8-3` # i0
323 srlg $i2,$s2,`16-3` # i1
a2a54ffc
AP
324 nr $i1,$mask
325 nr $i2,$mask
8626230a
AP
326 sllg $i3,$s2,`0+3`
327 srl $s2,`24-3`
a2a54ffc
AP
328 nr $s2,$mask
329 ngr $i3,$mask
8626230a
AP
330
331 xr $s1,$t1
332 srlg $ra,$s3,`8-3` # i1
333 sllg $t1,$s3,`0+3` # i0
334 nr $ra,$mask
335 la $key,16($key)
336 ngr $t1,$mask
337
a2a54ffc
AP
338 x $s0,2($i1,$tbl) # Te2[s2>>8]
339 x $s1,3($i2,$tbl) # Te1[s2>>16]
340 l $s2,0($s2,$tbl) # Te0[s2>>24]
341 x $t3,1($i3,$tbl) # Te3[s2>>0]
a2a54ffc 342
a2a54ffc 343 srlg $i3,$s3,`16-3` # i2
8626230a 344 xr $s2,$t2
a2a54ffc 345 srl $s3,`24-3`
a2a54ffc
AP
346 nr $i3,$mask
347 nr $s3,$mask
a2a54ffc 348
a2a54ffc
AP
349 x $s0,0($key)
350 x $s1,4($key)
351 x $s2,8($key)
8626230a
AP
352 x $t3,12($key)
353
354 x $s0,1($t1,$tbl) # Te3[s3>>0]
355 x $s1,2($ra,$tbl) # Te2[s3>>8]
356 x $s2,3($i3,$tbl) # Te1[s3>>16]
357 l $s3,0($s3,$tbl) # Te0[s3>>24]
358 xr $s3,$t3
a2a54ffc
AP
359
360 brct $rounds,.Lenc_loop
8626230a 361 .align 16
a2a54ffc 362
8626230a
AP
363 sllg $t1,$s0,`0+3`
364 srlg $t2,$s0,`8-3`
365 ngr $t1,$mask
366 srlg $t3,$s0,`16-3`
a2a54ffc
AP
367 srl $s0,`24-3`
368 nr $s0,$mask
8626230a
AP
369 nr $t2,$mask
370 nr $t3,$mask
a2a54ffc
AP
371
372 srlg $i1,$s1,`16-3` # i0
373 sllg $i2,$s1,`0+3`
8626230a 374 ngr $i2,$mask
a2a54ffc
AP
375 srlg $i3,$s1,`8-3`
376 srl $s1,`24-3`
377 nr $i1,$mask
378 nr $s1,$mask
a2a54ffc 379 nr $i3,$mask
8626230a
AP
380
381 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
382 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
383 sll $s0,24
384 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
385 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
386 sll $t2,8
387 sll $t3,16
388
a2a54ffc
AP
389 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
390 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
391 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
392 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
393 sll $i1,16
394 sll $s1,24
395 sll $i3,8
396 or $s0,$i1
397 or $s1,$t1
398 or $t2,$i2
399 or $t3,$i3
400
401 srlg $i1,$s2,`8-3` # i0
402 srlg $i2,$s2,`16-3` # i1
a2a54ffc
AP
403 nr $i1,$mask
404 nr $i2,$mask
8626230a
AP
405 sllg $i3,$s2,`0+3`
406 srl $s2,`24-3`
a2a54ffc 407 ngr $i3,$mask
8626230a
AP
408 nr $s2,$mask
409
410 sllg $t1,$s3,`0+3` # i0
411 srlg $ra,$s3,`8-3` # i1
412 ngr $t1,$mask
413
a2a54ffc
AP
414 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
415 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
8626230a 416 sll $i1,8
a2a54ffc
AP
417 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
418 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
a2a54ffc 419 sll $i2,16
8626230a 420 nr $ra,$mask
a2a54ffc
AP
421 sll $s2,24
422 or $s0,$i1
423 or $s1,$i2
424 or $s2,$t2
425 or $t3,$i3
426
a2a54ffc
AP
427 srlg $i3,$s3,`16-3` # i2
428 srl $s3,`24-3`
a2a54ffc
AP
429 nr $i3,$mask
430 nr $s3,$mask
8626230a
AP
431
432 l $t0,16($key)
433 l $t2,20($key)
434
435 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
436 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
a2a54ffc
AP
437 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
438 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
439 sll $i2,8
440 sll $i3,16
441 sll $s3,24
442 or $s0,$i1
443 or $s1,$i2
444 or $s2,$i3
445 or $s3,$t3
446
0ab8fd58 447 l${g} $ra,15*$SIZE_T($sp)
8626230a
AP
448 xr $s0,$t0
449 xr $s1,$t2
a2a54ffc
AP
450 x $s2,24($key)
451 x $s3,28($key)
452
453 br $ra
454.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
455___
456
457$code.=<<___;
458.type AES_Td,\@object
8626230a 459.align 256
a2a54ffc
AP
460AES_Td:
461___
462&_data_word(
463 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
527$code.=<<___;
76c828c6 528# Td4[256]
a2a54ffc
AP
529.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561.size AES_Td,.-AES_Td
562
76c828c6 563# void AES_decrypt(const unsigned char *inp, unsigned char *out,
a2a54ffc
AP
564# const AES_KEY *key) {
565.globl AES_decrypt
566.type AES_decrypt,\@function
567AES_decrypt:
a61710b8
AP
568___
569$code.=<<___ if (!$softonly);
8626230a
AP
570 l %r0,240($key)
571 lhi %r1,16
572 clr %r0,%r1
573 jl .Ldsoft
574
575 la %r1,0($key)
3f6916cf 576 #la %r2,0($inp)
a2a54ffc
AP
577 la %r4,0($out)
578 lghi %r3,16 # single block length
579 .long 0xb92e0042 # km %r4,%r2
8626230a
AP
580 brc 1,.-4 # can this happen?
581 br %r14
582.align 64
a2a54ffc 583.Ldsoft:
a61710b8
AP
584___
585$code.=<<___;
e822c756 586 stm${g} %r3,$ra,3*$SIZE_T($sp)
a2a54ffc
AP
587
588 llgf $s0,0($inp)
589 llgf $s1,4($inp)
590 llgf $s2,8($inp)
591 llgf $s3,12($inp)
592
8626230a 593 larl $tbl,AES_Td
a2a54ffc
AP
594 bras $ra,_s390x_AES_decrypt
595
e822c756 596 l${g} $out,3*$SIZE_T($sp)
a2a54ffc
AP
597 st $s0,0($out)
598 st $s1,4($out)
599 st $s2,8($out)
600 st $s3,12($out)
601
e822c756 602 lm${g} %r6,$ra,6*$SIZE_T($sp)
76c828c6 603 br $ra
a2a54ffc
AP
604.size AES_decrypt,.-AES_decrypt
605
606.type _s390x_AES_decrypt,\@function
607.align 16
608_s390x_AES_decrypt:
0ab8fd58 609 st${g} $ra,15*$SIZE_T($sp)
a2a54ffc
AP
610 x $s0,0($key)
611 x $s1,4($key)
612 x $s2,8($key)
613 x $s3,12($key)
614 l $rounds,240($key)
8626230a 615 llill $mask,`0xff<<3`
a2a54ffc 616 aghi $rounds,-1
8626230a
AP
617 j .Ldec_loop
618.align 16
a2a54ffc 619.Ldec_loop:
8626230a
AP
620 srlg $t1,$s0,`16-3`
621 srlg $t2,$s0,`8-3`
622 sllg $t3,$s0,`0+3`
a2a54ffc
AP
623 srl $s0,`24-3`
624 nr $s0,$mask
8626230a
AP
625 nr $t1,$mask
626 nr $t2,$mask
627 ngr $t3,$mask
a2a54ffc
AP
628
629 sllg $i1,$s1,`0+3` # i0
630 srlg $i2,$s1,`16-3`
631 srlg $i3,$s1,`8-3`
632 srl $s1,`24-3`
633 ngr $i1,$mask
634 nr $s1,$mask
635 nr $i2,$mask
636 nr $i3,$mask
8626230a
AP
637
638 l $s0,0($s0,$tbl) # Td0[s0>>24]
639 l $t1,3($t1,$tbl) # Td1[s0>>16]
640 l $t2,2($t2,$tbl) # Td2[s0>>8]
641 l $t3,1($t3,$tbl) # Td3[s0>>0]
642
a2a54ffc
AP
643 x $s0,1($i1,$tbl) # Td3[s1>>0]
644 l $s1,0($s1,$tbl) # Td0[s1>>24]
645 x $t2,3($i2,$tbl) # Td1[s1>>16]
646 x $t3,2($i3,$tbl) # Td2[s1>>8]
a2a54ffc
AP
647
648 srlg $i1,$s2,`8-3` # i0
649 sllg $i2,$s2,`0+3` # i1
650 srlg $i3,$s2,`16-3`
651 srl $s2,`24-3`
652 nr $i1,$mask
653 ngr $i2,$mask
654 nr $s2,$mask
655 nr $i3,$mask
8626230a
AP
656
657 xr $s1,$t1
658 srlg $ra,$s3,`8-3` # i1
659 srlg $t1,$s3,`16-3` # i0
660 nr $ra,$mask
661 la $key,16($key)
662 nr $t1,$mask
663
a2a54ffc
AP
664 x $s0,2($i1,$tbl) # Td2[s2>>8]
665 x $s1,1($i2,$tbl) # Td3[s2>>0]
666 l $s2,0($s2,$tbl) # Td0[s2>>24]
667 x $t3,3($i3,$tbl) # Td1[s2>>16]
a2a54ffc 668
a2a54ffc
AP
669 sllg $i3,$s3,`0+3` # i2
670 srl $s3,`24-3`
a2a54ffc
AP
671 ngr $i3,$mask
672 nr $s3,$mask
a2a54ffc 673
8626230a 674 xr $s2,$t2
a2a54ffc
AP
675 x $s0,0($key)
676 x $s1,4($key)
677 x $s2,8($key)
8626230a
AP
678 x $t3,12($key)
679
680 x $s0,3($t1,$tbl) # Td1[s3>>16]
681 x $s1,2($ra,$tbl) # Td2[s3>>8]
682 x $s2,1($i3,$tbl) # Td3[s3>>0]
683 l $s3,0($s3,$tbl) # Td0[s3>>24]
684 xr $s3,$t3
a2a54ffc
AP
685
686 brct $rounds,.Ldec_loop
8626230a 687 .align 16
a2a54ffc
AP
688
689 l $t1,`2048+0`($tbl) # prefetch Td4
8626230a
AP
690 l $t2,`2048+64`($tbl)
691 l $t3,`2048+128`($tbl)
692 l $i1,`2048+192`($tbl)
a2a54ffc
AP
693 llill $mask,0xff
694
695 srlg $i3,$s0,24 # i0
8626230a
AP
696 srlg $t1,$s0,16
697 srlg $t2,$s0,8
a2a54ffc 698 nr $s0,$mask # i3
8626230a
AP
699 nr $t1,$mask
700
701 srlg $i1,$s1,24
702 nr $t2,$mask
703 srlg $i2,$s1,16
704 srlg $ra,$s1,8
705 nr $s1,$mask # i0
a2a54ffc 706 nr $i2,$mask
8626230a
AP
707 nr $ra,$mask
708
a2a54ffc 709 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
8626230a
AP
710 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
711 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
712 sll $t1,16
a2a54ffc
AP
713 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
714 sllg $s0,$i3,24
a2a54ffc
AP
715 sll $t2,8
716
a2a54ffc
AP
717 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
718 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
719 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
a2a54ffc 720 sll $i1,24
8626230a 721 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
a2a54ffc
AP
722 sll $i2,16
723 sll $i3,8
724 or $s0,$s1
725 or $t1,$i1
726 or $t2,$i2
727 or $t3,$i3
728
729 srlg $i1,$s2,8 # i0
730 srlg $i2,$s2,24
731 srlg $i3,$s2,16
732 nr $s2,$mask # i1
733 nr $i1,$mask
734 nr $i3,$mask
735 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
736 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
737 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
738 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
739 sll $i1,8
740 sll $i2,24
a2a54ffc 741 or $s0,$i1
8626230a 742 sll $i3,16
a2a54ffc
AP
743 or $t2,$i2
744 or $t3,$i3
745
746 srlg $i1,$s3,16 # i0
747 srlg $i2,$s3,8 # i1
748 srlg $i3,$s3,24
749 nr $s3,$mask # i2
750 nr $i1,$mask
751 nr $i2,$mask
8626230a 752
0ab8fd58 753 l${g} $ra,15*$SIZE_T($sp)
8626230a
AP
754 or $s1,$t1
755 l $t0,16($key)
756 l $t1,20($key)
757
a2a54ffc
AP
758 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
759 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
8626230a 760 sll $i1,16
a2a54ffc
AP
761 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
762 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
a2a54ffc
AP
763 sll $i2,8
764 sll $s3,24
765 or $s0,$i1
766 or $s1,$i2
767 or $s2,$t2
768 or $s3,$t3
769
8626230a
AP
770 xr $s0,$t0
771 xr $s1,$t1
a2a54ffc
AP
772 x $s2,24($key)
773 x $s3,28($key)
774
775 br $ra
776.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
8626230a 777___
76c828c6 778
8626230a 779$code.=<<___;
76c828c6
AP
780# int AES_set_encrypt_key(const unsigned char *in, int bits,
781# AES_KEY *key) {
782.globl AES_set_encrypt_key
783.type AES_set_encrypt_key,\@function
784.align 16
785AES_set_encrypt_key:
8626230a 786 lghi $t0,0
e822c756 787 cl${g}r $inp,$t0
76c828c6 788 je .Lminus1
e822c756 789 cl${g}r $key,$t0
76c828c6
AP
790 je .Lminus1
791
8626230a
AP
792 lghi $t0,128
793 clr $bits,$t0
794 je .Lproceed
795 lghi $t0,192
796 clr $bits,$t0
797 je .Lproceed
798 lghi $t0,256
799 clr $bits,$t0
800 je .Lproceed
76c828c6
AP
801 lghi %r2,-2
802 br %r14
803
8626230a
AP
804.align 16
805.Lproceed:
a61710b8
AP
806___
807$code.=<<___ if (!$softonly);
8626230a
AP
808 # convert bits to km code, [128,192,256]->[18,19,20]
809 lhi %r5,-128
810 lhi %r0,18
811 ar %r5,$bits
812 srl %r5,6
813 ar %r5,%r0
814
91fdacb2
AP
815 larl %r1,OPENSSL_s390xcap_P
816 lg %r0,0(%r1)
817 tmhl %r0,0x4000 # check for message-security assist
818 jz .Lekey_internal
819
76c828c6
AP
820 lghi %r0,0 # query capability vector
821 la %r1,16($sp)
8626230a
AP
822 .long 0xb92f0042 # kmc %r4,%r2
823
824 llihh %r1,0x8000
825 srlg %r1,%r1,0(%r5)
826 ng %r1,16($sp)
76c828c6
AP
827 jz .Lekey_internal
828
8626230a
AP
829 lmg %r0,%r1,0($inp) # just copy 128 bits...
830 stmg %r0,%r1,0($key)
831 lhi %r0,192
832 cr $bits,%r0
833 jl 1f
834 lg %r1,16($inp)
835 stg %r1,16($key)
836 je 1f
837 lg %r1,24($inp)
838 stg %r1,24($key)
8391: st $bits,236($key) # save bits
840 st %r5,240($key) # save km code
76c828c6
AP
841 lghi %r2,0
842 br %r14
a61710b8
AP
843___
844$code.=<<___;
76c828c6
AP
845.align 16
846.Lekey_internal:
e822c756 847 stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs
76c828c6 848
a61710b8 849 larl $tbl,AES_Te+2048
76c828c6
AP
850
851 llgf $s0,0($inp)
852 llgf $s1,4($inp)
853 llgf $s2,8($inp)
854 llgf $s3,12($inp)
855 st $s0,0($key)
856 st $s1,4($key)
857 st $s2,8($key)
858 st $s3,12($key)
8626230a
AP
859 lghi $t0,128
860 cr $bits,$t0
76c828c6
AP
861 jne .Lnot128
862
863 llill $mask,0xff
864 lghi $t3,0 # i=0
865 lghi $rounds,10
76c828c6
AP
866 st $rounds,240($key)
867
76c828c6
AP
868 llgfr $t2,$s3 # temp=rk[3]
869 srlg $i1,$s3,8
870 srlg $i2,$s3,16
871 srlg $i3,$s3,24
872 nr $t2,$mask
873 nr $i1,$mask
874 nr $i2,$mask
8626230a
AP
875
876.align 16
877.L128_loop:
76c828c6
AP
878 la $t2,0($t2,$tbl)
879 la $i1,0($i1,$tbl)
880 la $i2,0($i2,$tbl)
881 la $i3,0($i3,$tbl)
882 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
883 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
884 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
885 icm $t2,1,0($i3) # Te4[rk[3]>>24]
886 x $t2,256($t3,$tbl) # rcon[i]
887 xr $s0,$t2 # rk[4]=rk[0]^...
888 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
889 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
890 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
8626230a
AP
891
892 llgfr $t2,$s3 # temp=rk[3]
893 srlg $i1,$s3,8
894 srlg $i2,$s3,16
895 nr $t2,$mask
896 nr $i1,$mask
897 srlg $i3,$s3,24
898 nr $i2,$mask
899
76c828c6
AP
900 st $s0,16($key)
901 st $s1,20($key)
902 st $s2,24($key)
903 st $s3,28($key)
904 la $key,16($key) # key+=4
905 la $t3,4($t3) # i++
906 brct $rounds,.L128_loop
907 lghi %r2,0
e822c756 908 lm${g} %r6,%r13,6*$SIZE_T($sp)
76c828c6
AP
909 br $ra
910
8626230a 911.align 16
76c828c6 912.Lnot128:
8626230a
AP
913 llgf $t0,16($inp)
914 llgf $t1,20($inp)
915 st $t0,16($key)
916 st $t1,20($key)
917 lghi $t0,192
918 cr $bits,$t0
76c828c6
AP
919 jne .Lnot192
920
921 llill $mask,0xff
922 lghi $t3,0 # i=0
923 lghi $rounds,12
924 st $rounds,240($key)
925 lghi $rounds,8
926
8626230a
AP
927 srlg $i1,$t1,8
928 srlg $i2,$t1,16
929 srlg $i3,$t1,24
930 nr $t1,$mask
76c828c6
AP
931 nr $i1,$mask
932 nr $i2,$mask
8626230a
AP
933
934.align 16
935.L192_loop:
936 la $t1,0($t1,$tbl)
76c828c6
AP
937 la $i1,0($i1,$tbl)
938 la $i2,0($i2,$tbl)
939 la $i3,0($i3,$tbl)
8626230a
AP
940 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
941 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
942 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
943 icm $t1,1,0($i3) # Te4[rk[5]>>24]
944 x $t1,256($t3,$tbl) # rcon[i]
945 xr $s0,$t1 # rk[6]=rk[0]^...
76c828c6
AP
946 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
947 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
948 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
8626230a 949
76c828c6
AP
950 st $s0,24($key)
951 st $s1,28($key)
952 st $s2,32($key)
953 st $s3,36($key)
954 brct $rounds,.L192_continue
955 lghi %r2,0
e822c756 956 lm${g} %r6,%r13,6*$SIZE_T($sp)
76c828c6 957 br $ra
8626230a
AP
958
959.align 16
76c828c6 960.L192_continue:
8626230a
AP
961 lgr $t1,$s3
962 x $t1,16($key) # rk[10]=rk[4]^rk[9]
963 st $t1,40($key)
964 x $t1,20($key) # rk[11]=rk[5]^rk[10]
965 st $t1,44($key)
966
967 srlg $i1,$t1,8
968 srlg $i2,$t1,16
969 srlg $i3,$t1,24
970 nr $t1,$mask
971 nr $i1,$mask
972 nr $i2,$mask
973
76c828c6
AP
974 la $key,24($key) # key+=6
975 la $t3,4($t3) # i++
976 j .L192_loop
977
8626230a 978.align 16
76c828c6 979.Lnot192:
8626230a
AP
980 llgf $t0,24($inp)
981 llgf $t1,28($inp)
982 st $t0,24($key)
983 st $t1,28($key)
76c828c6
AP
984 llill $mask,0xff
985 lghi $t3,0 # i=0
986 lghi $rounds,14
987 st $rounds,240($key)
988 lghi $rounds,7
989
8626230a
AP
990 srlg $i1,$t1,8
991 srlg $i2,$t1,16
992 srlg $i3,$t1,24
993 nr $t1,$mask
76c828c6
AP
994 nr $i1,$mask
995 nr $i2,$mask
8626230a
AP
996
997.align 16
998.L256_loop:
999 la $t1,0($t1,$tbl)
76c828c6
AP
1000 la $i1,0($i1,$tbl)
1001 la $i2,0($i2,$tbl)
1002 la $i3,0($i3,$tbl)
8626230a
AP
1003 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1004 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1005 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1006 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1007 x $t1,256($t3,$tbl) # rcon[i]
1008 xr $s0,$t1 # rk[8]=rk[0]^...
76c828c6
AP
1009 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1010 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1011 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1012 st $s0,32($key)
1013 st $s1,36($key)
1014 st $s2,40($key)
1015 st $s3,44($key)
1016 brct $rounds,.L256_continue
1017 lghi %r2,0
e822c756 1018 lm${g} %r6,%r13,6*$SIZE_T($sp)
76c828c6 1019 br $ra
8626230a
AP
1020
1021.align 16
76c828c6 1022.L256_continue:
8626230a 1023 lgr $t1,$s3 # temp=rk[11]
76c828c6
AP
1024 srlg $i1,$s3,8
1025 srlg $i2,$s3,16
1026 srlg $i3,$s3,24
8626230a 1027 nr $t1,$mask
76c828c6
AP
1028 nr $i1,$mask
1029 nr $i2,$mask
8626230a 1030 la $t1,0($t1,$tbl)
76c828c6
AP
1031 la $i1,0($i1,$tbl)
1032 la $i2,0($i2,$tbl)
1033 la $i3,0($i3,$tbl)
8626230a
AP
1034 llgc $t1,0($t1) # Te4[rk[11]>>0]
1035 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1036 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1037 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1038 x $t1,16($key) # rk[12]=rk[4]^...
1039 st $t1,48($key)
1040 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1041 st $t1,52($key)
1042 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1043 st $t1,56($key)
1044 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1045 st $t1,60($key)
1046
1047 srlg $i1,$t1,8
1048 srlg $i2,$t1,16
1049 srlg $i3,$t1,24
1050 nr $t1,$mask
1051 nr $i1,$mask
1052 nr $i2,$mask
76c828c6
AP
1053
1054 la $key,32($key) # key+=8
1055 la $t3,4($t3) # i++
1056 j .L256_loop
8626230a 1057
76c828c6
AP
1058.Lminus1:
1059 lghi %r2,-1
8626230a 1060 br $ra
76c828c6
AP
1061.size AES_set_encrypt_key,.-AES_set_encrypt_key
1062
1063# void AES_set_decrypt_key(const unsigned char *in, int bits,
1064# AES_KEY *key) {
1065.globl AES_set_decrypt_key
1066.type AES_set_decrypt_key,\@function
1067.align 16
1068AES_set_decrypt_key:
e822c756
AP
1069 st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1070 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers!
76c828c6 1071 bras $ra,AES_set_encrypt_key
e822c756
AP
1072 l${g} $key,4*$SIZE_T($sp)
1073 l${g} $ra,14*$SIZE_T($sp)
76c828c6
AP
1074 ltgr %r2,%r2
1075 bnzr $ra
a61710b8
AP
1076___
1077$code.=<<___ if (!$softonly);
8626230a
AP
1078 l $t0,240($key)
1079 lhi $t1,16
1080 cr $t0,$t1
1081 jl .Lgo
1082 oill $t0,0x80 # set "decrypt" bit
1083 st $t0,240($key)
76c828c6
AP
1084 br $ra
1085
1086.align 16
1087.Ldkey_internal:
e822c756
AP
1088 st${g} $key,4*$SIZE_T($sp)
1089 st${g} $ra,14*$SIZE_T($sp)
76c828c6 1090 bras $ra,.Lekey_internal
e822c756
AP
1091 l${g} $key,4*$SIZE_T($sp)
1092 l${g} $ra,14*$SIZE_T($sp)
a61710b8
AP
1093___
1094$code.=<<___;
76c828c6
AP
1095
1096.Lgo: llgf $rounds,240($key)
96b0f6c1 1097 la $i1,0($key)
76c828c6 1098 sllg $i2,$rounds,4
96b0f6c1 1099 la $i2,0($i2,$key)
76c828c6 1100 srl $rounds,1
8626230a 1101 lghi $t1,-16
76c828c6 1102
8626230a 1103.align 16
96b0f6c1
AP
1104.Linv: lmg $s0,$s1,0($i1)
1105 lmg $s2,$s3,0($i2)
1106 stmg $s0,$s1,0($i2)
1107 stmg $s2,$s3,0($i1)
8626230a
AP
1108 la $i1,16($i1)
1109 la $i2,0($t1,$i2)
76c828c6
AP
1110 brct $rounds,.Linv
1111___
1112$mask80=$i1;
1113$mask1b=$i2;
1114$maskfe=$i3;
1115$code.=<<___;
1116 llgf $rounds,240($key)
1117 aghi $rounds,-1
1118 sll $rounds,2 # (rounds-1)*4
1119 llilh $mask80,0x8080
76c828c6 1120 llilh $mask1b,0x1b1b
76c828c6 1121 llilh $maskfe,0xfefe
8626230a
AP
1122 oill $mask80,0x8080
1123 oill $mask1b,0x1b1b
76c828c6
AP
1124 oill $maskfe,0xfefe
1125
8626230a 1126.align 16
76c828c6
AP
1127.Lmix: l $s0,16($key) # tp1
1128 lr $s1,$s0
1129 ngr $s1,$mask80
1130 srlg $t1,$s1,7
1131 slr $s1,$t1
1132 nr $s1,$mask1b
1133 sllg $t1,$s0,1
1134 nr $t1,$maskfe
1135 xr $s1,$t1 # tp2
1136
1137 lr $s2,$s1
1138 ngr $s2,$mask80
1139 srlg $t1,$s2,7
1140 slr $s2,$t1
1141 nr $s2,$mask1b
1142 sllg $t1,$s1,1
1143 nr $t1,$maskfe
1144 xr $s2,$t1 # tp4
1145
1146 lr $s3,$s2
1147 ngr $s3,$mask80
1148 srlg $t1,$s3,7
1149 slr $s3,$t1
1150 nr $s3,$mask1b
1151 sllg $t1,$s2,1
1152 nr $t1,$maskfe
1153 xr $s3,$t1 # tp8
1154
1155 xr $s1,$s0 # tp2^tp1
1156 xr $s2,$s0 # tp4^tp1
1157 rll $s0,$s0,24 # = ROTATE(tp1,8)
8626230a 1158 xr $s2,$s3 # ^=tp8
76c828c6 1159 xr $s0,$s1 # ^=tp2^tp1
76c828c6 1160 xr $s1,$s3 # tp2^tp1^tp8
8626230a 1161 xr $s0,$s2 # ^=tp4^tp1^tp8
76c828c6 1162 rll $s1,$s1,8
76c828c6 1163 rll $s2,$s2,16
8626230a 1164 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
76c828c6 1165 rll $s3,$s3,24
8626230a 1166 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
76c828c6
AP
1167 xr $s0,$s3 # ^= ROTATE(tp8,8)
1168
1169 st $s0,16($key)
1170 la $key,4($key)
1171 brct $rounds,.Lmix
1172
e822c756 1173 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
76c828c6
AP
1174 lghi %r2,0
1175 br $ra
1176.size AES_set_decrypt_key,.-AES_set_decrypt_key
8626230a
AP
1177___
1178
0ab8fd58
AP
1179########################################################################
1180# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
8626230a
AP
1181# size_t length, const AES_KEY *key,
1182# unsigned char *ivec, const int enc)
1183{
1184my $inp="%r2";
1185my $out="%r4"; # length and out are swapped
1186my $len="%r3";
1187my $key="%r5";
1188my $ivp="%r6";
1189
1190$code.=<<___;
1191.globl AES_cbc_encrypt
1192.type AES_cbc_encrypt,\@function
1193.align 16
1194AES_cbc_encrypt:
1195 xgr %r3,%r4 # flip %r3 and %r4, out and len
1196 xgr %r4,%r3
1197 xgr %r3,%r4
1198___
1199$code.=<<___ if (!$softonly);
1200 lhi %r0,16
1201 cl %r0,240($key)
1202 jh .Lcbc_software
1203
1204 lg %r0,0($ivp) # copy ivec
1205 lg %r1,8($ivp)
1206 stmg %r0,%r1,16($sp)
1207 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1208 stmg %r0,%r1,32($sp)
1209 lmg %r0,%r1,16($key)
1210 stmg %r0,%r1,48($sp)
1211 l %r0,240($key) # load kmc code
1212 lghi $key,15 # res=len%16, len-=res;
1213 ngr $key,$len
e822c756 1214 sl${g}r $len,$key
8626230a
AP
1215 la %r1,16($sp) # parameter block - ivec || key
1216 jz .Lkmc_truncated
1217 .long 0xb92f0042 # kmc %r4,%r2
1218 brc 1,.-4 # pay attention to "partial completion"
1219 ltr $key,$key
1220 jnz .Lkmc_truncated
1221.Lkmc_done:
1222 lmg %r0,%r1,16($sp) # copy ivec to caller
1223 stg %r0,0($ivp)
1224 stg %r1,8($ivp)
1225 br $ra
1226.align 16
1227.Lkmc_truncated:
1228 ahi $key,-1 # it's the way it's encoded in mvc
1229 tmll %r0,0x80
1230 jnz .Lkmc_truncated_dec
1231 lghi %r1,0
e822c756
AP
1232 stg %r1,16*$SIZE_T($sp)
1233 stg %r1,16*$SIZE_T+8($sp)
8626230a 1234 bras %r1,1f
e822c756 1235 mvc 16*$SIZE_T(1,$sp),0($inp)
8626230a
AP
12361: ex $key,0(%r1)
1237 la %r1,16($sp) # restore parameter block
e822c756 1238 la $inp,16*$SIZE_T($sp)
8626230a
AP
1239 lghi $len,16
1240 .long 0xb92f0042 # kmc %r4,%r2
1241 j .Lkmc_done
1242.align 16
1243.Lkmc_truncated_dec:
e822c756
AP
1244 st${g} $out,4*$SIZE_T($sp)
1245 la $out,16*$SIZE_T($sp)
8626230a
AP
1246 lghi $len,16
1247 .long 0xb92f0042 # kmc %r4,%r2
e822c756 1248 l${g} $out,4*$SIZE_T($sp)
8626230a 1249 bras %r1,2f
e822c756 1250 mvc 0(1,$out),16*$SIZE_T($sp)
8626230a
AP
12512: ex $key,0(%r1)
1252 j .Lkmc_done
1253.align 16
1254.Lcbc_software:
1255___
1256$code.=<<___;
e822c756 1257 stm${g} $key,$ra,5*$SIZE_T($sp)
8626230a 1258 lhi %r0,0
e822c756 1259 cl %r0,`$stdframe+$SIZE_T-4`($sp)
8626230a
AP
1260 je .Lcbc_decrypt
1261
1262 larl $tbl,AES_Te
1263
1264 llgf $s0,0($ivp)
1265 llgf $s1,4($ivp)
1266 llgf $s2,8($ivp)
1267 llgf $s3,12($ivp)
1268
1269 lghi $t0,16
e822c756 1270 sl${g}r $len,$t0
8626230a
AP
1271 brc 4,.Lcbc_enc_tail # if borrow
1272.Lcbc_enc_loop:
e822c756 1273 stm${g} $inp,$out,2*$SIZE_T($sp)
8626230a
AP
1274 x $s0,0($inp)
1275 x $s1,4($inp)
1276 x $s2,8($inp)
1277 x $s3,12($inp)
1278 lgr %r4,$key
1279
1280 bras $ra,_s390x_AES_encrypt
1281
e822c756 1282 lm${g} $inp,$key,2*$SIZE_T($sp)
8626230a
AP
1283 st $s0,0($out)
1284 st $s1,4($out)
1285 st $s2,8($out)
1286 st $s3,12($out)
1287
1288 la $inp,16($inp)
1289 la $out,16($out)
1290 lghi $t0,16
e822c756 1291 lt${g}r $len,$len
8626230a 1292 jz .Lcbc_enc_done
e822c756 1293 sl${g}r $len,$t0
8626230a
AP
1294 brc 4,.Lcbc_enc_tail # if borrow
1295 j .Lcbc_enc_loop
1296.align 16
1297.Lcbc_enc_done:
e822c756 1298 l${g} $ivp,6*$SIZE_T($sp)
8626230a
AP
1299 st $s0,0($ivp)
1300 st $s1,4($ivp)
1301 st $s2,8($ivp)
1302 st $s3,12($ivp)
1303
e822c756 1304 lm${g} %r7,$ra,7*$SIZE_T($sp)
8626230a
AP
1305 br $ra
1306
1307.align 16
1308.Lcbc_enc_tail:
1309 aghi $len,15
1310 lghi $t0,0
e822c756
AP
1311 stg $t0,16*$SIZE_T($sp)
1312 stg $t0,16*$SIZE_T+8($sp)
8626230a 1313 bras $t1,3f
e822c756 1314 mvc 16*$SIZE_T(1,$sp),0($inp)
8626230a
AP
13153: ex $len,0($t1)
1316 lghi $len,0
e822c756 1317 la $inp,16*$SIZE_T($sp)
8626230a
AP
1318 j .Lcbc_enc_loop
1319
1320.align 16
1321.Lcbc_decrypt:
1322 larl $tbl,AES_Td
1323
1324 lg $t0,0($ivp)
1325 lg $t1,8($ivp)
e822c756 1326 stmg $t0,$t1,16*$SIZE_T($sp)
8626230a
AP
1327
1328.Lcbc_dec_loop:
e822c756 1329 stm${g} $inp,$out,2*$SIZE_T($sp)
8626230a
AP
1330 llgf $s0,0($inp)
1331 llgf $s1,4($inp)
1332 llgf $s2,8($inp)
1333 llgf $s3,12($inp)
1334 lgr %r4,$key
1335
1336 bras $ra,_s390x_AES_decrypt
1337
e822c756 1338 lm${g} $inp,$key,2*$SIZE_T($sp)
8626230a
AP
1339 sllg $s0,$s0,32
1340 sllg $s2,$s2,32
1341 lr $s0,$s1
1342 lr $s2,$s3
1343
1344 lg $t0,0($inp)
1345 lg $t1,8($inp)
e822c756
AP
1346 xg $s0,16*$SIZE_T($sp)
1347 xg $s2,16*$SIZE_T+8($sp)
8626230a 1348 lghi $s1,16
e822c756 1349 sl${g}r $len,$s1
8626230a
AP
1350 brc 4,.Lcbc_dec_tail # if borrow
1351 brc 2,.Lcbc_dec_done # if zero
1352 stg $s0,0($out)
1353 stg $s2,8($out)
e822c756 1354 stmg $t0,$t1,16*$SIZE_T($sp)
8626230a
AP
1355
1356 la $inp,16($inp)
1357 la $out,16($out)
1358 j .Lcbc_dec_loop
1359
1360.Lcbc_dec_done:
1361 stg $s0,0($out)
1362 stg $s2,8($out)
1363.Lcbc_dec_exit:
e822c756 1364 lm${g} %r6,$ra,6*$SIZE_T($sp)
8626230a
AP
1365 stmg $t0,$t1,0($ivp)
1366
1367 br $ra
1368
1369.align 16
1370.Lcbc_dec_tail:
1371 aghi $len,15
e822c756
AP
1372 stg $s0,16*$SIZE_T($sp)
1373 stg $s2,16*$SIZE_T+8($sp)
8626230a 1374 bras $s1,4f
e822c756 1375 mvc 0(1,$out),16*$SIZE_T($sp)
8626230a
AP
13764: ex $len,0($s1)
1377 j .Lcbc_dec_exit
1378.size AES_cbc_encrypt,.-AES_cbc_encrypt
874a3757
AP
1379___
1380}
0ab8fd58
AP
1381########################################################################
1382# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
874a3757
AP
1383# size_t blocks, const AES_KEY *key,
1384# const unsigned char *ivec)
1385{
1386my $inp="%r2";
0ab8fd58
AP
1387my $out="%r4"; # blocks and out are swapped
1388my $len="%r3";
874a3757
AP
1389my $key="%r5"; my $iv0="%r5";
1390my $ivp="%r6";
1391my $fp ="%r7";
1392
1393$code.=<<___;
1394.globl AES_ctr32_encrypt
1395.type AES_ctr32_encrypt,\@function
1396.align 16
1397AES_ctr32_encrypt:
0ab8fd58
AP
1398 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1399 xgr %r4,%r3
1400 xgr %r3,%r4
e822c756 1401 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
874a3757
AP
1402___
1403$code.=<<___ if (!$softonly);
1404 l %r0,240($key)
1405 lhi %r1,16
1406 clr %r0,%r1
1407 jl .Lctr32_software
1408
e822c756 1409 stm${g} %r6,$s3,6*$SIZE_T($sp)
874a3757
AP
1410
1411 slgr $out,$inp
1412 la %r1,0($key) # %r1 is permanent copy of $key
1413 lg $iv0,0($ivp) # load ivec
1414 lg $ivp,8($ivp)
1415
26064d7f
AP
1416 # prepare and allocate stack frame at the top of 4K page
1417 # with 1K reserved for eventual signal handling
1418 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
874a3757 1419 lghi $s1,-4096
874a3757 1420 algr $s0,$sp
26064d7f 1421 lgr $fp,$sp
874a3757 1422 ngr $s0,$s1 # align at page boundary
26064d7f
AP
1423 slgr $fp,$s0 # total buffer size
1424 lgr $s2,$sp
1425 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1426 slgr $fp,$s1 # deduct reservation to get usable buffer size
1427 # buffer size is at least 256 and at most 3072+256-16
1428
1429 la $sp,1024($s0) # alloca
1430 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
e822c756
AP
1431 st${g} $s2,0($sp) # back-chain
1432 st${g} $fp,$SIZE_T($sp)
874a3757
AP
1433
1434 slgr $len,$fp
0ab8fd58 1435 brc 1,.Lctr32_hw_switch # not zero, no borrow
26064d7f 1436 algr $fp,$len # input is shorter than allocated buffer
874a3757 1437 lghi $len,0
e822c756 1438 st${g} $fp,$SIZE_T($sp)
874a3757 1439
0ab8fd58
AP
1440.Lctr32_hw_switch:
1441___
1442$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1443 larl $s0,OPENSSL_s390xcap_P
1444 lg $s0,8($s0)
1445 tmhh $s0,0x0004 # check for message_security-assist-4
1446 jz .Lctr32_km_loop
1447
1448 llgfr $s0,%r0
1449 lgr $s1,%r1
1450 lghi %r0,0
1451 la %r1,16($sp)
1452 .long 0xb92d2042 # kmctr %r4,%r2,%r2
1453
1454 llihh %r0,0x8000 # check if kmctr supports the function code
1455 srlg %r0,%r0,0($s0)
1456 ng %r0,16($sp)
1457 lgr %r0,$s0
1458 lgr %r1,$s1
1459 jz .Lctr32_km_loop
1460
1461####### kmctr code
1462 algr $out,$inp # restore $out
1463 lgr $s1,$len # $s1 undertakes $len
1464 j .Lctr32_kmctr_loop
1465.align 16
1466.Lctr32_kmctr_loop:
1467 la $s2,16($sp)
1468 lgr $s3,$fp
1469.Lctr32_kmctr_prepare:
1470 stg $iv0,0($s2)
1471 stg $ivp,8($s2)
1472 la $s2,16($s2)
1473 ahi $ivp,1 # 32-bit increment, preserves upper half
1474 brct $s3,.Lctr32_kmctr_prepare
1475
1476 #la $inp,0($inp) # inp
1477 sllg $len,$fp,4 # len
1478 #la $out,0($out) # out
1479 la $s2,16($sp) # iv
1480 .long 0xb92da042 # kmctr $out,$s2,$inp
1481 brc 1,.-4 # pay attention to "partial completion"
1482
1483 slgr $s1,$fp
1484 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1485 algr $fp,$s1
1486 lghi $s1,0
1487 brc 4+1,.Lctr32_kmctr_loop # not zero
1488
1489 l${g} $sp,0($sp)
1490 lm${g} %r6,$s3,6*$SIZE_T($sp)
1491 br $ra
1492.align 16
1493___
1494$code.=<<___;
1495.Lctr32_km_loop:
874a3757
AP
1496 la $s2,16($sp)
1497 lgr $s3,$fp
0ab8fd58 1498.Lctr32_km_prepare:
874a3757
AP
1499 stg $iv0,0($s2)
1500 stg $ivp,8($s2)
1501 la $s2,16($s2)
1502 ahi $ivp,1 # 32-bit increment, preserves upper half
0ab8fd58 1503 brct $s3,.Lctr32_km_prepare
874a3757
AP
1504
1505 la $s0,16($sp) # inp
1506 sllg $s1,$fp,4 # len
1507 la $s2,16($sp) # out
1508 .long 0xb92e00a8 # km %r10,%r8
1509 brc 1,.-4 # pay attention to "partial completion"
1510
1511 la $s2,16($sp)
1512 lgr $s3,$fp
1513 slgr $s2,$inp
0ab8fd58 1514.Lctr32_km_xor:
874a3757
AP
1515 lg $s0,0($inp)
1516 lg $s1,8($inp)
1517 xg $s0,0($s2,$inp)
1518 xg $s1,8($s2,$inp)
1519 stg $s0,0($out,$inp)
1520 stg $s1,8($out,$inp)
1521 la $inp,16($inp)
0ab8fd58 1522 brct $s3,.Lctr32_km_xor
874a3757
AP
1523
1524 slgr $len,$fp
0ab8fd58 1525 brc 1,.Lctr32_km_loop # not zero, no borrow
874a3757
AP
1526 algr $fp,$len
1527 lghi $len,0
0ab8fd58 1528 brc 4+1,.Lctr32_km_loop # not zero
874a3757 1529
e822c756
AP
1530 l${g} $s0,0($sp)
1531 l${g} $s1,$SIZE_T($sp)
874a3757 1532 la $s2,16($sp)
0ab8fd58 1533.Lctr32_km_zap:
874a3757
AP
1534 stg $s0,0($s2)
1535 stg $s0,8($s2)
1536 la $s2,16($s2)
0ab8fd58 1537 brct $s1,.Lctr32_km_zap
874a3757
AP
1538
1539 la $sp,0($s0)
e822c756 1540 lm${g} %r6,$s3,6*$SIZE_T($sp)
874a3757
AP
1541 br $ra
1542.align 16
1543.Lctr32_software:
1544___
1545$code.=<<___;
e822c756 1546 stm${g} $key,$ra,5*$SIZE_T($sp)
0ab8fd58 1547 sl${g}r $inp,$out
874a3757
AP
1548 larl $tbl,AES_Te
1549 llgf $t1,12($ivp)
1550
1551.Lctr32_loop:
0ab8fd58 1552 stm${g} $inp,$out,2*$SIZE_T($sp)
874a3757
AP
1553 llgf $s0,0($ivp)
1554 llgf $s1,4($ivp)
1555 llgf $s2,8($ivp)
1556 lgr $s3,$t1
e822c756 1557 st $t1,16*$SIZE_T($sp)
874a3757
AP
1558 lgr %r4,$key
1559
1560 bras $ra,_s390x_AES_encrypt
1561
e822c756
AP
1562 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1563 llgf $t1,16*$SIZE_T($sp)
0ab8fd58
AP
1564 x $s0,0($inp,$out)
1565 x $s1,4($inp,$out)
1566 x $s2,8($inp,$out)
1567 x $s3,12($inp,$out)
1568 stm $s0,$s3,0($out)
1569
1570 la $out,16($out)
1571 ahi $t1,1 # 32-bit increment
1572 brct $len,.Lctr32_loop
1573
1574 lm${g} %r6,$ra,6*$SIZE_T($sp)
1575 br $ra
1576.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1577___
1578}
1579
1580########################################################################
1581# void AES_xts_encrypt(const char *inp,char *out,size_t len,
0c237e42
AP
1582# const AES_KEY *key1, const AES_KEY *key2,
1583# const unsigned char iv[16]);
0ab8fd58
AP
1584#
1585{
1586my $inp="%r2";
1587my $out="%r4"; # len and out are swapped
1588my $len="%r3";
1589my $key1="%r5"; # $i1
1590my $key2="%r6"; # $i2
1591my $fp="%r7"; # $i3
1592my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1593
1594$code.=<<___;
1595.type _s390x_xts_km,\@function
1596.align 16
1597_s390x_xts_km:
1598___
0c237e42 1599$code.=<<___ if(1);
0ab8fd58
AP
1600 llgfr $s0,%r0 # put aside the function code
1601 lghi $s1,0x7f
1602 nr $s1,%r0
1603 lghi %r0,0 # query capability vector
1604 la %r1,2*$SIZE_T($sp)
1605 .long 0xb92e0042 # km %r4,%r2
1606 llihh %r1,0x8000
1607 srlg %r1,%r1,32($s1) # check for 32+function code
1608 ng %r1,2*$SIZE_T($sp)
1609 lgr %r0,$s0 # restore the function code
1610 la %r1,0($key1) # restore $key1
1611 jz .Lxts_km_vanilla
1612
1613 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1614 algr $out,$inp
1615
1616 oill %r0,32 # switch to xts function code
1617 aghi $s1,-18 #
1618 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1619 la %r1,$tweak-16($sp)
1620 slgr %r1,$s1 # parameter block position
1621 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1622 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1623 # yes, it contains junk and overlaps
1624 # with the tweak in 128-bit case.
1625 # it's done to avoid conditional
1626 # branch.
1627 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1628
1629 .long 0xb92e0042 # km %r4,%r2
1630 brc 1,.-4 # pay attention to "partial completion"
1631
1632 lrvg $s0,$tweak+0($sp) # load the last tweak
1633 lrvg $s1,$tweak+8($sp)
1634 stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key
1635
1636 nill %r0,0xffdf # switch back to original function code
1637 la %r1,0($key1) # restore pointer to $key1
1638 slgr $out,$inp
1639
1640 llgc $len,2*$SIZE_T-1($sp)
1641 nill $len,0x0f # $len%=16
1642 br $ra
1643
1644.align 16
1645.Lxts_km_vanilla:
1646___
1647$code.=<<___;
1648 # prepare and allocate stack frame at the top of 4K page
1649 # with 1K reserved for eventual signal handling
1650 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1651 lghi $s1,-4096
1652 algr $s0,$sp
1653 lgr $fp,$sp
1654 ngr $s0,$s1 # align at page boundary
1655 slgr $fp,$s0 # total buffer size
1656 lgr $s2,$sp
1657 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1658 slgr $fp,$s1 # deduct reservation to get usable buffer size
1659 # buffer size is at least 256 and at most 3072+256-16
1660
1661 la $sp,1024($s0) # alloca
1662 nill $fp,0xfff0 # round to 16*n
1663 st${g} $s2,0($sp) # back-chain
1664 nill $len,0xfff0 # redundant
1665 st${g} $fp,$SIZE_T($sp)
1666
1667 slgr $len,$fp
1668 brc 1,.Lxts_km_go # not zero, no borrow
1669 algr $fp,$len # input is shorter than allocated buffer
1670 lghi $len,0
1671 st${g} $fp,$SIZE_T($sp)
1672
1673.Lxts_km_go:
1674 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1675 lrvg $s1,$tweak+8($s2)
1676
1677 la $s2,16($sp) # vector of ascending tweak values
1678 slgr $s2,$inp
1679 srlg $s3,$fp,4
1680 j .Lxts_km_start
1681
1682.Lxts_km_loop:
1683 la $s2,16($sp)
1684 slgr $s2,$inp
1685 srlg $s3,$fp,4
1686.Lxts_km_prepare:
1687 lghi $i1,0x87
1688 srag $i2,$s1,63 # broadcast upper bit
1689 ngr $i1,$i2 # rem
1690 srlg $i2,$s0,63 # carry bit from lower half
1691 sllg $s0,$s0,1
1692 sllg $s1,$s1,1
1693 xgr $s0,$i1
1694 ogr $s1,$i2
1695.Lxts_km_start:
1696 lrvgr $i1,$s0 # flip byte order
1697 lrvgr $i2,$s1
1698 stg $i1,0($s2,$inp)
1699 stg $i2,8($s2,$inp)
1700 xg $i1,0($inp)
1701 xg $i2,8($inp)
1702 stg $i1,0($out,$inp)
1703 stg $i2,8($out,$inp)
1704 la $inp,16($inp)
1705 brct $s3,.Lxts_km_prepare
1706
1707 slgr $inp,$fp # rewind $inp
1708 la $s2,0($out,$inp)
1709 lgr $s3,$fp
1710 .long 0xb92e00aa # km $s2,$s2
1711 brc 1,.-4 # pay attention to "partial completion"
1712
1713 la $s2,16($sp)
1714 slgr $s2,$inp
1715 srlg $s3,$fp,4
1716.Lxts_km_xor:
1717 lg $i1,0($out,$inp)
1718 lg $i2,8($out,$inp)
1719 xg $i1,0($s2,$inp)
1720 xg $i2,8($s2,$inp)
1721 stg $i1,0($out,$inp)
1722 stg $i2,8($out,$inp)
1723 la $inp,16($inp)
1724 brct $s3,.Lxts_km_xor
1725
1726 slgr $len,$fp
1727 brc 1,.Lxts_km_loop # not zero, no borrow
1728 algr $fp,$len
1729 lghi $len,0
1730 brc 4+1,.Lxts_km_loop # not zero
1731
1732 l${g} $i1,0($sp) # back-chain
1733 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1734 la $i2,16($sp)
1735 srlg $fp,$fp,4
1736.Lxts_km_zap:
1737 stg $i1,0($i2)
1738 stg $i1,8($i2)
1739 la $i2,16($i2)
1740 brct $fp,.Lxts_km_zap
1741
1742 la $sp,0($i1)
1743 llgc $len,2*$SIZE_T-1($i1)
1744 nill $len,0x0f # $len%=16
1745 bzr $ra
1746
1747 # generate one more tweak...
1748 lghi $i1,0x87
1749 srag $i2,$s1,63 # broadcast upper bit
1750 ngr $i1,$i2 # rem
1751 srlg $i2,$s0,63 # carry bit from lower half
1752 sllg $s0,$s0,1
1753 sllg $s1,$s1,1
1754 xgr $s0,$i1
1755 ogr $s1,$i2
1756
1757 ltr $len,$len # clear zero flag
1758 br $ra
1759.size _s390x_xts_km,.-_s390x_xts_km
1760
1761.globl AES_xts_encrypt
1762.type AES_xts_encrypt,\@function
1763.align 16
1764AES_xts_encrypt:
1765 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1766 xgr %r4,%r3
1767 xgr %r3,%r4
1768___
1769$code.=<<___ if ($SIZE_T==4);
1770 llgfr $len,$len
1771___
1772$code.=<<___;
1773 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1774 srag $len,$len,4 # formally wrong, because it expands
1775 # sign byte, but who can afford asking
1776 # to process more than 2^63-1 bytes?
1777 # I use it, because it sets condition
1778 # code...
1779 bcr 8,$ra # abort if zero (i.e. less than 16)
1780___
1781$code.=<<___ if (!$softonly);
1782 llgf %r0,240($key2)
1783 lhi %r1,16
1784 clr %r0,%r1
1785 jl .Lxts_enc_software
1786
1787 stm${g} %r6,$s3,6*$SIZE_T($sp)
1788 st${g} $ra,14*$SIZE_T($sp)
1789
1790 sllg $len,$len,4 # $len&=~15
1791 slgr $out,$inp
1792
0c237e42
AP
1793 # generate the tweak value
1794 l${g} $s3,$stdframe($sp) # pointer to iv
0ab8fd58 1795 la $s2,$tweak($sp)
0c237e42 1796 lmg $s0,$s1,0($s3)
0ab8fd58
AP
1797 lghi $s3,16
1798 stmg $s0,$s1,0($s2)
1799 la %r1,0($key2) # $key2 is not needed anymore
1800 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1801 brc 1,.-4 # can this happen?
1802
1803 l %r0,240($key1)
1804 la %r1,0($key1) # $key1 is not needed anymore
1805 bras $ra,_s390x_xts_km
1806 jz .Lxts_enc_km_done
1807
1808 aghi $inp,-16 # take one step back
1809 la $i3,0($out,$inp) # put aside real $out
1810.Lxts_enc_km_steal:
1811 llgc $i1,16($inp)
1812 llgc $i2,0($out,$inp)
1813 stc $i1,0($out,$inp)
1814 stc $i2,16($out,$inp)
1815 la $inp,1($inp)
1816 brct $len,.Lxts_enc_km_steal
1817
1818 la $s2,0($i3)
1819 lghi $s3,16
1820 lrvgr $i1,$s0 # flip byte order
1821 lrvgr $i2,$s1
1822 xg $i1,0($s2)
1823 xg $i2,8($s2)
1824 stg $i1,0($s2)
1825 stg $i2,8($s2)
1826 .long 0xb92e00aa # km $s2,$s2
1827 brc 1,.-4 # can this happen?
1828 lrvgr $i1,$s0 # flip byte order
1829 lrvgr $i2,$s1
1830 xg $i1,0($i3)
1831 xg $i2,8($i3)
1832 stg $i1,0($i3)
1833 stg $i2,8($i3)
1834
1835.Lxts_enc_km_done:
1836 l${g} $ra,14*$SIZE_T($sp)
1837 stg $sp,$tweak+0($sp) # wipe tweak: overwrite all 16 bytes with
1838 stg $sp,$tweak+8($sp) # two 8-byte stores, in 31- and 64-bit builds
1839 lm${g} %r6,$s3,6*$SIZE_T($sp)
1840 br $ra
1841.align 16
1842.Lxts_enc_software:
1843___
1844$code.=<<___;
1845 stm${g} %r6,$ra,6*$SIZE_T($sp)
1846
1847 slgr $out,$inp
1848
1849 l${g} $s3,$stdframe($sp) # pointer to iv, same stack argument the km path reads
1850 llgf $s0,0($s3) # load iv as 4x32-bit words, upper halves zeroed
1851 llgf $s1,4($s3)
1852 llgf $s2,8($s3)
1853 llgf $s3,12($s3) # loaded last, as it overwrites the pointer
1855 stm${g} %r2,%r5,2*$SIZE_T($sp)
1856 la $key,0($key2)
1857 larl $tbl,AES_Te
1858 bras $ra,_s390x_AES_encrypt # generate the tweak
1859 lm${g} %r2,%r5,2*$SIZE_T($sp)
1860 stm $s0,$s3,$tweak($sp) # save the tweak
1861 j .Lxts_enc_enter
1862
1863.align 16
1864.Lxts_enc_loop:
1865 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1866 lrvg $s3,$tweak+8($sp)
1867 lghi %r1,0x87
1868 srag %r0,$s3,63 # broadcast upper bit
1869 ngr %r1,%r0 # rem
1870 srlg %r0,$s1,63 # carry bit from lower half
1871 sllg $s1,$s1,1
1872 sllg $s3,$s3,1
1873 xgr $s1,%r1
1874 ogr $s3,%r0
1875 lrvgr $s1,$s1 # flip byte order
1876 lrvgr $s3,$s3
1877 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1878 stg $s1,$tweak+0($sp) # save the tweak
1879 llgfr $s1,$s1
1880 srlg $s2,$s3,32
1881 stg $s3,$tweak+8($sp)
1882 llgfr $s3,$s3
1883 la $inp,16($inp) # $inp+=16
1884.Lxts_enc_enter:
1885 x $s0,0($inp) # ^=*($inp)
874a3757
AP
1886 x $s1,4($inp)
1887 x $s2,8($inp)
1888 x $s3,12($inp)
0ab8fd58
AP
1889 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1890 la $key,0($key1)
1891 bras $ra,_s390x_AES_encrypt
1892 lm${g} %r2,%r5,2*$SIZE_T($sp)
1893 x $s0,$tweak+0($sp) # ^=tweak
1894 x $s1,$tweak+4($sp)
1895 x $s2,$tweak+8($sp)
1896 x $s3,$tweak+12($sp)
874a3757
AP
1897 st $s0,0($out,$inp)
1898 st $s1,4($out,$inp)
1899 st $s2,8($out,$inp)
1900 st $s3,12($out,$inp)
0ab8fd58
AP
1901 brct${g} $len,.Lxts_enc_loop
1902
1903 llgc $len,`2*$SIZE_T-1`($sp)
1904 nill $len,0x0f # $len%16
1905 jz .Lxts_enc_done
1906
1907 la $i3,0($inp,$out) # put aside real $out
1908.Lxts_enc_steal:
1909 llgc %r0,16($inp)
1910 llgc %r1,0($out,$inp)
1911 stc %r0,0($out,$inp)
1912 stc %r1,16($out,$inp)
1913 la $inp,1($inp)
1914 brct $len,.Lxts_enc_steal
1915 la $out,0($i3) # restore real $out
1916
1917 # generate last tweak...
1918 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1919 lrvg $s3,$tweak+8($sp)
1920 lghi %r1,0x87
1921 srag %r0,$s3,63 # broadcast upper bit
1922 ngr %r1,%r0 # rem
1923 srlg %r0,$s1,63 # carry bit from lower half
1924 sllg $s1,$s1,1
1925 sllg $s3,$s3,1
1926 xgr $s1,%r1
1927 ogr $s3,%r0
1928 lrvgr $s1,$s1 # flip byte order
1929 lrvgr $s3,$s3
1930 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1931 stg $s1,$tweak+0($sp) # save the tweak
1932 llgfr $s1,$s1
1933 srlg $s2,$s3,32
1934 stg $s3,$tweak+8($sp)
1935 llgfr $s3,$s3
1936
1937 x $s0,0($out) # ^=*(inp)|stolen cipther-text
1938 x $s1,4($out)
1939 x $s2,8($out)
1940 x $s3,12($out)
1941 st${g} $out,4*$SIZE_T($sp)
1942 la $key,0($key1)
1943 bras $ra,_s390x_AES_encrypt
1944 l${g} $out,4*$SIZE_T($sp)
1945 x $s0,`$tweak+0`($sp) # ^=tweak
1946 x $s1,`$tweak+4`($sp)
1947 x $s2,`$tweak+8`($sp)
1948 x $s3,`$tweak+12`($sp)
1949 st $s0,0($out)
1950 st $s1,4($out)
1951 st $s2,8($out)
1952 st $s3,12($out)
1953
1954.Lxts_enc_done:
1955 stg $sp,$tweak+0($sp) # wipe tweak (both 8-byte halves)
1956 stg $sp,$tweak+8($sp)
1957 lm${g} %r6,$ra,6*$SIZE_T($sp)
1958 br $ra
1959.size AES_xts_encrypt,.-AES_xts_encrypt
1960___
1961# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1962# const AES_KEY *key1, const AES_KEY *key2, const unsigned char iv[16]);
1963#
1964$code.=<<___;
1965.globl AES_xts_decrypt
1966.type AES_xts_decrypt,\@function
1967.align 16
1968AES_xts_decrypt:
1969 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1970 xgr %r4,%r3
1971 xgr %r3,%r4
1972___
1973$code.=<<___ if ($SIZE_T==4);
1974 llgfr $len,$len
1975___
1976$code.=<<___;
1977 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1978 aghi $len,-16
1979 bcr 4,$ra # abort if less than zero. formally
1980 # wrong, because $len is unsigned,
1981 # but who can afford asking to
1982 # process more than 2^63-1 bytes?
1983 tmll $len,0x0f
1984 jnz .Lxts_dec_proceed
1985 aghi $len,16
1986.Lxts_dec_proceed:
1987___
1988$code.=<<___ if (!$softonly);
1989 llgf %r0,240($key2)
1990 lhi %r1,16
1991 clr %r0,%r1
1992 jl .Lxts_dec_software
874a3757 1993
0ab8fd58
AP
1994 stm${g} %r6,$s3,6*$SIZE_T($sp)
1995 st${g} $ra,14*$SIZE_T($sp)
1996
1997 nill $len,0xfff0 # $len&=~15
1998 slgr $out,$inp
1999
2000 # generate the tweak value
0c237e42 2001 l${g} $s3,$stdframe($sp) # pointer to iv
0ab8fd58 2002 la $s2,$tweak($sp)
0c237e42 2003 lmg $s0,$s1,0($s3)
0ab8fd58 2004 lghi $s3,16
0c237e42 2005 stmg $s0,$s1,0($s2)
0ab8fd58
AP
2006 la %r1,0($key2) # $key2 is not needed past this point
2007 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2008 brc 1,.-4 # can this happen?
2009
2010 l %r0,240($key1)
2011 la %r1,0($key1) # $key1 is not needed anymore
2012
2013 ltgr $len,$len
2014 jz .Lxts_dec_km_short
2015 bras $ra,_s390x_xts_km
2016 jz .Lxts_dec_km_done
2017
2018 lrvgr $s2,$s0 # make copy in reverse byte order
2019 lrvgr $s3,$s1
2020 j .Lxts_dec_km_2ndtweak
2021
2022.Lxts_dec_km_short:
2023 llgc $len,`2*$SIZE_T-1`($sp)
2024 nill $len,0x0f # $len%=16
2025 lrvg $s0,$tweak+0($sp) # load the tweak
2026 lrvg $s1,$tweak+8($sp)
2027 lrvgr $s2,$s0 # make copy in reverse byte order
2028 lrvgr $s3,$s1
2029
2030.Lxts_dec_km_2ndtweak:
2031 lghi $i1,0x87
2032 srag $i2,$s1,63 # broadcast upper bit
2033 ngr $i1,$i2 # rem
2034 srlg $i2,$s0,63 # carry bit from lower half
2035 sllg $s0,$s0,1
2036 sllg $s1,$s1,1
2037 xgr $s0,$i1
2038 ogr $s1,$i2
2039 lrvgr $i1,$s0 # flip byte order
2040 lrvgr $i2,$s1
2041
2042 xg $i1,0($inp)
2043 xg $i2,8($inp)
2044 stg $i1,0($out,$inp)
2045 stg $i2,8($out,$inp)
2046 la $i2,0($out,$inp)
2047 lghi $i3,16
2048 .long 0xb92e0066 # km $i2,$i2
2049 brc 1,.-4 # can this happen?
2050 lrvgr $i1,$s0
2051 lrvgr $i2,$s1
2052 xg $i1,0($out,$inp)
2053 xg $i2,8($out,$inp)
2054 stg $i1,0($out,$inp)
2055 stg $i2,8($out,$inp)
2056
2057 la $i3,0($out,$inp) # put aside real $out
2058.Lxts_dec_km_steal:
2059 llgc $i1,16($inp)
2060 llgc $i2,0($out,$inp)
2061 stc $i1,0($out,$inp)
2062 stc $i2,16($out,$inp)
2063 la $inp,1($inp)
2064 brct $len,.Lxts_dec_km_steal
2065
2066 lgr $s0,$s2
2067 lgr $s1,$s3
2068 xg $s0,0($i3)
2069 xg $s1,8($i3)
2070 stg $s0,0($i3)
2071 stg $s1,8($i3)
2072 la $s0,0($i3)
2073 lghi $s1,16
2074 .long 0xb92e0088 # km $s0,$s0
2075 brc 1,.-4 # can this happen?
2076 xg $s2,0($i3)
2077 xg $s3,8($i3)
2078 stg $s2,0($i3)
2079 stg $s3,8($i3)
2080.Lxts_dec_km_done:
2081 l${g} $ra,14*$SIZE_T($sp)
2082 stg $sp,$tweak+0($sp) # wipe tweak: overwrite all 16 bytes with
2083 stg $sp,$tweak+8($sp) # two 8-byte stores, in 31- and 64-bit builds
2084 lm${g} %r6,$s3,6*$SIZE_T($sp)
2085 br $ra
2086.align 16
2087.Lxts_dec_software:
2088___
2089$code.=<<___;
2090 stm${g} %r6,$ra,6*$SIZE_T($sp)
2091
2092 srlg $len,$len,4
2093 slgr $out,$inp
2094
2095 l${g} $s3,$stdframe($sp) # pointer to iv, same stack argument the km path reads
2096 llgf $s0,0($s3) # load iv as 4x32-bit words, upper halves zeroed
2097 llgf $s1,4($s3)
2098 llgf $s2,8($s3)
2099 llgf $s3,12($s3) # loaded last, as it overwrites the pointer
2101 stm${g} %r2,%r5,2*$SIZE_T($sp)
2102 la $key,0($key2)
2103 larl $tbl,AES_Te
2104 bras $ra,_s390x_AES_encrypt # generate the tweak
2105 lm${g} %r2,%r5,2*$SIZE_T($sp)
2106 larl $tbl,AES_Td
2107 lt${g}r $len,$len
2108 stm $s0,$s3,$tweak($sp) # save the tweak
2109 jz .Lxts_dec_short
2110 j .Lxts_dec_enter
2111
2112.align 16
2113.Lxts_dec_loop:
2114 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2115 lrvg $s3,$tweak+8($sp)
2116 lghi %r1,0x87
2117 srag %r0,$s3,63 # broadcast upper bit
2118 ngr %r1,%r0 # rem
2119 srlg %r0,$s1,63 # carry bit from lower half
2120 sllg $s1,$s1,1
2121 sllg $s3,$s3,1
2122 xgr $s1,%r1
2123 ogr $s3,%r0
2124 lrvgr $s1,$s1 # flip byte order
2125 lrvgr $s3,$s3
2126 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2127 stg $s1,$tweak+0($sp) # save the tweak
2128 llgfr $s1,$s1
2129 srlg $s2,$s3,32
2130 stg $s3,$tweak+8($sp)
2131 llgfr $s3,$s3
2132.Lxts_dec_enter:
2133 x $s0,0($inp) # tweak^=*(inp)
2134 x $s1,4($inp)
2135 x $s2,8($inp)
2136 x $s3,12($inp)
2137 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2138 la $key,0($key1)
2139 bras $ra,_s390x_AES_decrypt
2140 lm${g} %r2,%r5,2*$SIZE_T($sp)
2141 x $s0,$tweak+0($sp) # ^=tweak
2142 x $s1,$tweak+4($sp)
2143 x $s2,$tweak+8($sp)
2144 x $s3,$tweak+12($sp)
2145 st $s0,0($out,$inp)
2146 st $s1,4($out,$inp)
2147 st $s2,8($out,$inp)
2148 st $s3,12($out,$inp)
874a3757 2149 la $inp,16($inp)
0ab8fd58
AP
2150 brct${g} $len,.Lxts_dec_loop
2151
2152 llgc $len,`2*$SIZE_T-1`($sp)
2153 nill $len,0x0f # $len%16
2154 jz .Lxts_dec_done
2155
2156 # generate pair of tweaks...
2157 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2158 lrvg $s3,$tweak+8($sp)
2159 lghi %r1,0x87
2160 srag %r0,$s3,63 # broadcast upper bit
2161 ngr %r1,%r0 # rem
2162 srlg %r0,$s1,63 # carry bit from lower half
2163 sllg $s1,$s1,1
2164 sllg $s3,$s3,1
2165 xgr $s1,%r1
2166 ogr $s3,%r0
2167 lrvgr $i2,$s1 # flip byte order
2168 lrvgr $i3,$s3
2169 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2170 j .Lxts_dec_2ndtweak
2171
2172.align 16
2173.Lxts_dec_short:
2174 llgc $len,`2*$SIZE_T-1`($sp)
2175 nill $len,0x0f # $len%16
2176 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2177 lrvg $s3,$tweak+8($sp)
2178.Lxts_dec_2ndtweak:
2179 lghi %r1,0x87
2180 srag %r0,$s3,63 # broadcast upper bit
2181 ngr %r1,%r0 # rem
2182 srlg %r0,$s1,63 # carry bit from lower half
2183 sllg $s1,$s1,1
2184 sllg $s3,$s3,1
2185 xgr $s1,%r1
2186 ogr $s3,%r0
2187 lrvgr $s1,$s1 # flip byte order
2188 lrvgr $s3,$s3
2189 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2190 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2191 llgfr $s1,$s1
2192 srlg $s2,$s3,32
2193 stg $s3,$tweak-16+8($sp)
2194 llgfr $s3,$s3
2195
2196 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2197 x $s1,4($inp)
2198 x $s2,8($inp)
2199 x $s3,12($inp)
2200 stm${g} %r2,%r3,2*$SIZE_T($sp)
2201 la $key,0($key1)
2202 bras $ra,_s390x_AES_decrypt
2203 lm${g} %r2,%r5,2*$SIZE_T($sp)
2204 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2205 x $s1,$tweak-16+4($sp)
2206 x $s2,$tweak-16+8($sp)
2207 x $s3,$tweak-16+12($sp)
2208 st $s0,0($out,$inp)
2209 st $s1,4($out,$inp)
2210 st $s2,8($out,$inp)
2211 st $s3,12($out,$inp)
874a3757 2212
0ab8fd58
AP
2213 la $i3,0($out,$inp) # put aside real $out
2214.Lxts_dec_steal:
2215 llgc %r0,16($inp)
2216 llgc %r1,0($out,$inp)
2217 stc %r0,0($out,$inp)
2218 stc %r1,16($out,$inp)
2219 la $inp,1($inp)
2220 brct $len,.Lxts_dec_steal
2221 la $out,0($i3) # restore real $out
2222
2223 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2224 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2225 x $s1,4($out)
2226 x $s2,8($out)
2227 x $s3,12($out)
2228 st${g} $out,4*$SIZE_T($sp)
2229 la $key,0($key1)
2230 bras $ra,_s390x_AES_decrypt
2231 l${g} $out,4*$SIZE_T($sp)
2232 x $s0,$tweak+0($sp) # ^=tweak
2233 x $s1,$tweak+4($sp)
2234 x $s2,$tweak+8($sp)
2235 x $s3,$tweak+12($sp)
2236 st $s0,0($out)
2237 st $s1,4($out)
2238 st $s2,8($out)
2239 st $s3,12($out)
2240 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2241 stg $sp,$tweak-16+8($sp)
.Lxts_dec_done:
	# Common exit path: overwrite both 8-byte halves of the on-stack
	# tweak so it does not linger in the frame. The value stored is
	# simply the stack pointer, i.e. something that is not secret.
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	# Reload %r6..$ra from 6*$SIZE_T($sp); presumably these were saved
	# by the function prologue, which is outside this excerpt.
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
8626230a
AP
2248___
2249}
# Trailer common to all builds: identification string and the common
# symbol declaration for OPENSSL_s390xcap_P (16 bytes, alignment 8).
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,16,8
___

# Every `...` span left in the generated text is a Perl expression
# (e.g. `2*$SIZE_T-1`); evaluate each one and splice in its value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing forces the flush; checking the result makes a failed write
# (full disk, broken pipe) abort the build instead of silently leaving
# a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";