]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aes-s390x.pl
Fix Typos
[thirdparty/openssl.git] / crypto / aes / asm / aes-s390x.pl
CommitLineData
#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# AES for s390x.

# April 2007.
#
# Software performance improvement over gcc-generated code is ~70% and
# in absolute terms is ~73 cycles per byte processed with 128-bit key.
# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
# *strictly* in-order execution and an issued instruction [in this case
# loading a value from memory is critical] has to complete before
# execution flow proceeds. S-boxes are compressed to 2KB[+256B].
#
# As for hardware acceleration support: it's basically a "teaser," as
# it can and should be improved in several ways. Most notably, support
# for CBC is not utilized, nor are multiple blocks ever processed.
# Also, the software key schedule can be postponed until hardware
# support detection... Performance improvement over assembler is
# reportedly ~2.5x, but can reach >8x [naturally on larger chunks] if
# proper support is implemented.

# May 2007.
#
# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
# for 128-bit keys, if hardware support is detected.

# January 2009.
#
# Add support for hardware AES192/256 and reschedule instructions to
# minimize/avoid Address Generation Interlock hazard and to favour the
# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
# almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# the critical path is not long enough. Yet it spends ~24 cycles per
# byte processed with 128-bit key.
#
# Unlike the previous version, hardware support detection takes place
# only at the moment of key schedule setup, and the result is denoted
# in key->rounds. This is done because deferred key setup can't be made
# MT-safe, at least not for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives an incredible performance
# improvement: it was measured to be ~6.6x. It's less than the
# previously mentioned 8x, because the software implementation was
# optimized.

# May 2010.
#
# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
# performance improvement over "generic" counter mode routine relying
# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
# to the fact that exact throughput value depends on current stack
# frame alignment within 4KB page. In worst case you get ~75% of the
# maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting a ravine on a plateau.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2x better than code generated by gcc 4.3.

# December 2010.
#
# Add support for z196 "cipher message with counter" instruction.
# Note however that it's disengaged, because it was measured to
# perform ~12% worse than vanilla km-based code...

# February 2011.
#
# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
# instructions, which deliver ~70% improvement at 8KB block size over
# vanilla km-based code, and 37% at the most likely 512-byte block size.

# The target flavour is the first command-line argument. A flavour that
# matches /3[12]/ selects 4-byte pointers and the plain instruction
# mnemonics; anything else (including no argument at all) selects 8-byte
# pointers and the "g"-suffixed mnemonics. $g is spliced into mnemonics
# such as stm${g}/lm${g}/l${g}/st${g} in the generated assembly, and
# $SIZE_T scales the stack-frame offsets used with them.
$flavour = shift;

if (defined($flavour) && $flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Consume command-line arguments until one looks like a file name
# ("name.ext"); that argument becomes the output file. If the arguments
# run out first, $output ends up false.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the output file only when one was actually given,
# and fail loudly if it cannot be created, so that a failed open can no
# longer go unnoticed. The three-argument form keeps the file name from
# being interpreted as part of the open mode.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# The hardware-assisted code paths are emitted only while $softonly is
# false (they are guarded by "if (!$softonly)" further down).
$softonly=0;	# allow hardware support

# Symbolic register names used throughout the generated assembly.
# Some names intentionally alias the same register: $mask shares %r0
# with $t0, $inp shares %r2 with $t2, and $out/$bits share %r3 with $t3.
$t0="%r0";	$mask="%r0";
$t1="%r1";
$t2="%r2";	$inp="%r2";
$t3="%r3";	$out="%r3";	$bits="%r3";
$key="%r4";
$i1="%r5";
$i2="%r6";
$i3="%r7";
$s0="%r8";
$s1="%r9";
$s2="%r10";
$s3="%r11";
$tbl="%r12";
$rounds="%r13";
$ra="%r14";
$sp="%r15";

# Stack frame size: 16 slots of $SIZE_T bytes plus 4*8 bytes.
$stdframe=16*$SIZE_T+4*8;

# Append one ".long" directive per argument to the global $code buffer.
# Each 32-bit value is written twice, giving 8-byte table entries; the
# round loops index the table with a byte index scaled by 8 and load at
# byte offsets 0..3 of an entry (the Te0..Te3 / Td0..Td3 loads).
# Processing stops at the first undefined argument.
#
# The former empty prototype "()" was dropped: the sub does take
# arguments, and it only worked because callers use the &_data_word(...)
# form, which bypasses prototypes. Those calls behave exactly as before.
sub _data_word
{
	my $i;
	while (defined($i = shift)) {
		$code .= sprintf ".long\t0x%08x,0x%08x\n", $i, $i;
	}
}

131$code=<<___;
bc4e831c
PS
132#include "s390x_arch.h"
133
a2a54ffc
AP
134.text
135
136.type AES_Te,\@object
8626230a 137.align 256
a2a54ffc
AP
138AES_Te:
139___
140&_data_word(
141 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
142 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
143 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
144 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
145 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
146 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
147 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
148 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
149 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
150 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
151 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
152 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
153 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
154 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
155 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
156 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
157 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
158 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
159 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
160 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
161 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
162 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
163 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
164 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
165 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
166 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
167 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
168 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
169 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
170 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
171 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
172 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
173 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
174 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
175 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
176 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
177 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
178 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
179 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
180 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
181 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
182 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
183 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
184 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
185 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
186 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
187 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
188 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
189 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
190 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
191 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
192 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
193 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
194 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
195 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
196 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
197 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
198 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
199 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
200 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
201 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
202 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
203 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
204 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
205$code.=<<___;
76c828c6
AP
206# Te4[256]
207.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
208.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
209.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
210.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
211.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
212.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
213.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
214.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
215.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
216.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
217.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
218.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
219.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
220.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
221.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
222.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
223.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
224.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
225.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
226.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
227.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
228.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
229.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
230.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
231.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
232.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
233.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
234.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
235.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
236.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
237.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
238.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
239# rcon[]
240.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
241.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
242.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
8626230a 243.align 256
a2a54ffc
AP
244.size AES_Te,.-AES_Te
245
76c828c6 246# void AES_encrypt(const unsigned char *inp, unsigned char *out,
a2a54ffc
AP
247# const AES_KEY *key) {
248.globl AES_encrypt
249.type AES_encrypt,\@function
250AES_encrypt:
a61710b8
AP
251___
252$code.=<<___ if (!$softonly);
8626230a
AP
253 l %r0,240($key)
254 lhi %r1,16
255 clr %r0,%r1
256 jl .Lesoft
257
a2a54ffc 258 la %r1,0($key)
3f6916cf 259 #la %r2,0($inp)
a2a54ffc
AP
260 la %r4,0($out)
261 lghi %r3,16 # single block length
262 .long 0xb92e0042 # km %r4,%r2
8626230a
AP
263 brc 1,.-4 # can this happen?
264 br %r14
265.align 64
a2a54ffc 266.Lesoft:
a61710b8
AP
267___
268$code.=<<___;
e822c756 269 stm${g} %r3,$ra,3*$SIZE_T($sp)
a2a54ffc
AP
270
271 llgf $s0,0($inp)
272 llgf $s1,4($inp)
273 llgf $s2,8($inp)
274 llgf $s3,12($inp)
275
8626230a 276 larl $tbl,AES_Te
a2a54ffc
AP
277 bras $ra,_s390x_AES_encrypt
278
e822c756 279 l${g} $out,3*$SIZE_T($sp)
a2a54ffc
AP
280 st $s0,0($out)
281 st $s1,4($out)
282 st $s2,8($out)
283 st $s3,12($out)
284
e822c756 285 lm${g} %r6,$ra,6*$SIZE_T($sp)
76c828c6 286 br $ra
a2a54ffc
AP
287.size AES_encrypt,.-AES_encrypt
288
289.type _s390x_AES_encrypt,\@function
290.align 16
291_s390x_AES_encrypt:
0ab8fd58 292 st${g} $ra,15*$SIZE_T($sp)
a2a54ffc
AP
293 x $s0,0($key)
294 x $s1,4($key)
295 x $s2,8($key)
296 x $s3,12($key)
297 l $rounds,240($key)
8626230a 298 llill $mask,`0xff<<3`
a2a54ffc 299 aghi $rounds,-1
8626230a
AP
300 j .Lenc_loop
301.align 16
a2a54ffc 302.Lenc_loop:
8626230a
AP
303 sllg $t1,$s0,`0+3`
304 srlg $t2,$s0,`8-3`
305 srlg $t3,$s0,`16-3`
a2a54ffc
AP
306 srl $s0,`24-3`
307 nr $s0,$mask
8626230a
AP
308 ngr $t1,$mask
309 nr $t2,$mask
310 nr $t3,$mask
a2a54ffc
AP
311
312 srlg $i1,$s1,`16-3` # i0
313 sllg $i2,$s1,`0+3`
314 srlg $i3,$s1,`8-3`
315 srl $s1,`24-3`
316 nr $i1,$mask
317 nr $s1,$mask
318 ngr $i2,$mask
319 nr $i3,$mask
8626230a
AP
320
321 l $s0,0($s0,$tbl) # Te0[s0>>24]
322 l $t1,1($t1,$tbl) # Te3[s0>>0]
323 l $t2,2($t2,$tbl) # Te2[s0>>8]
324 l $t3,3($t3,$tbl) # Te1[s0>>16]
325
a2a54ffc
AP
326 x $s0,3($i1,$tbl) # Te1[s1>>16]
327 l $s1,0($s1,$tbl) # Te0[s1>>24]
328 x $t2,1($i2,$tbl) # Te3[s1>>0]
329 x $t3,2($i3,$tbl) # Te2[s1>>8]
a2a54ffc
AP
330
331 srlg $i1,$s2,`8-3` # i0
332 srlg $i2,$s2,`16-3` # i1
a2a54ffc
AP
333 nr $i1,$mask
334 nr $i2,$mask
8626230a
AP
335 sllg $i3,$s2,`0+3`
336 srl $s2,`24-3`
a2a54ffc
AP
337 nr $s2,$mask
338 ngr $i3,$mask
8626230a
AP
339
340 xr $s1,$t1
341 srlg $ra,$s3,`8-3` # i1
342 sllg $t1,$s3,`0+3` # i0
343 nr $ra,$mask
344 la $key,16($key)
345 ngr $t1,$mask
346
a2a54ffc
AP
347 x $s0,2($i1,$tbl) # Te2[s2>>8]
348 x $s1,3($i2,$tbl) # Te1[s2>>16]
349 l $s2,0($s2,$tbl) # Te0[s2>>24]
350 x $t3,1($i3,$tbl) # Te3[s2>>0]
a2a54ffc 351
a2a54ffc 352 srlg $i3,$s3,`16-3` # i2
8626230a 353 xr $s2,$t2
a2a54ffc 354 srl $s3,`24-3`
a2a54ffc
AP
355 nr $i3,$mask
356 nr $s3,$mask
a2a54ffc 357
a2a54ffc
AP
358 x $s0,0($key)
359 x $s1,4($key)
360 x $s2,8($key)
8626230a
AP
361 x $t3,12($key)
362
363 x $s0,1($t1,$tbl) # Te3[s3>>0]
364 x $s1,2($ra,$tbl) # Te2[s3>>8]
365 x $s2,3($i3,$tbl) # Te1[s3>>16]
366 l $s3,0($s3,$tbl) # Te0[s3>>24]
367 xr $s3,$t3
a2a54ffc
AP
368
369 brct $rounds,.Lenc_loop
8626230a 370 .align 16
a2a54ffc 371
8626230a
AP
372 sllg $t1,$s0,`0+3`
373 srlg $t2,$s0,`8-3`
374 ngr $t1,$mask
375 srlg $t3,$s0,`16-3`
a2a54ffc
AP
376 srl $s0,`24-3`
377 nr $s0,$mask
8626230a
AP
378 nr $t2,$mask
379 nr $t3,$mask
a2a54ffc
AP
380
381 srlg $i1,$s1,`16-3` # i0
382 sllg $i2,$s1,`0+3`
8626230a 383 ngr $i2,$mask
a2a54ffc
AP
384 srlg $i3,$s1,`8-3`
385 srl $s1,`24-3`
386 nr $i1,$mask
387 nr $s1,$mask
a2a54ffc 388 nr $i3,$mask
8626230a
AP
389
390 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
391 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
392 sll $s0,24
393 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
394 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
395 sll $t2,8
396 sll $t3,16
397
a2a54ffc
AP
398 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
399 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
400 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
401 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
402 sll $i1,16
403 sll $s1,24
404 sll $i3,8
405 or $s0,$i1
406 or $s1,$t1
407 or $t2,$i2
408 or $t3,$i3
609b0852 409
a2a54ffc
AP
410 srlg $i1,$s2,`8-3` # i0
411 srlg $i2,$s2,`16-3` # i1
a2a54ffc
AP
412 nr $i1,$mask
413 nr $i2,$mask
8626230a
AP
414 sllg $i3,$s2,`0+3`
415 srl $s2,`24-3`
a2a54ffc 416 ngr $i3,$mask
8626230a
AP
417 nr $s2,$mask
418
419 sllg $t1,$s3,`0+3` # i0
420 srlg $ra,$s3,`8-3` # i1
421 ngr $t1,$mask
422
a2a54ffc
AP
423 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
424 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
8626230a 425 sll $i1,8
a2a54ffc
AP
426 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
427 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
a2a54ffc 428 sll $i2,16
8626230a 429 nr $ra,$mask
a2a54ffc
AP
430 sll $s2,24
431 or $s0,$i1
432 or $s1,$i2
433 or $s2,$t2
434 or $t3,$i3
435
a2a54ffc
AP
436 srlg $i3,$s3,`16-3` # i2
437 srl $s3,`24-3`
a2a54ffc
AP
438 nr $i3,$mask
439 nr $s3,$mask
8626230a
AP
440
441 l $t0,16($key)
442 l $t2,20($key)
443
444 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
445 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
a2a54ffc
AP
446 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
447 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
448 sll $i2,8
449 sll $i3,16
450 sll $s3,24
451 or $s0,$i1
452 or $s1,$i2
453 or $s2,$i3
454 or $s3,$t3
455
0ab8fd58 456 l${g} $ra,15*$SIZE_T($sp)
8626230a
AP
457 xr $s0,$t0
458 xr $s1,$t2
a2a54ffc
AP
459 x $s2,24($key)
460 x $s3,28($key)
461
609b0852 462 br $ra
a2a54ffc
AP
463.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
464___
465
466$code.=<<___;
467.type AES_Td,\@object
8626230a 468.align 256
a2a54ffc
AP
469AES_Td:
470___
471&_data_word(
472 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
473 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
474 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
475 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
476 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
477 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
478 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
479 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
480 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
481 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
482 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
483 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
484 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
485 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
486 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
487 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
488 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
489 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
490 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
491 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
492 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
493 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
494 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
495 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
496 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
497 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
498 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
499 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
500 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
501 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
502 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
503 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
504 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
505 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
506 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
507 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
508 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
509 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
510 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
511 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
512 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
513 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
514 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
515 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
516 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
517 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
518 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
519 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
520 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
521 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
522 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
523 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
524 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
525 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
526 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
527 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
528 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
529 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
530 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
531 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
532 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
533 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
534 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
535 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
536$code.=<<___;
76c828c6 537# Td4[256]
a2a54ffc
AP
538.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
539.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
540.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
541.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
542.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
543.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
544.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
545.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
546.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
547.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
548.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
549.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
550.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
551.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
552.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
553.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
554.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
555.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
556.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
557.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
558.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
559.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
560.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
561.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
562.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
563.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
564.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
565.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
566.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
567.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
568.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
569.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
570.size AES_Td,.-AES_Td
571
76c828c6 572# void AES_decrypt(const unsigned char *inp, unsigned char *out,
a2a54ffc
AP
573# const AES_KEY *key) {
574.globl AES_decrypt
575.type AES_decrypt,\@function
576AES_decrypt:
a61710b8
AP
577___
578$code.=<<___ if (!$softonly);
8626230a
AP
579 l %r0,240($key)
580 lhi %r1,16
581 clr %r0,%r1
582 jl .Ldsoft
583
584 la %r1,0($key)
3f6916cf 585 #la %r2,0($inp)
a2a54ffc
AP
586 la %r4,0($out)
587 lghi %r3,16 # single block length
588 .long 0xb92e0042 # km %r4,%r2
8626230a
AP
589 brc 1,.-4 # can this happen?
590 br %r14
591.align 64
a2a54ffc 592.Ldsoft:
a61710b8
AP
593___
594$code.=<<___;
e822c756 595 stm${g} %r3,$ra,3*$SIZE_T($sp)
a2a54ffc
AP
596
597 llgf $s0,0($inp)
598 llgf $s1,4($inp)
599 llgf $s2,8($inp)
600 llgf $s3,12($inp)
601
8626230a 602 larl $tbl,AES_Td
a2a54ffc
AP
603 bras $ra,_s390x_AES_decrypt
604
e822c756 605 l${g} $out,3*$SIZE_T($sp)
a2a54ffc
AP
606 st $s0,0($out)
607 st $s1,4($out)
608 st $s2,8($out)
609 st $s3,12($out)
610
e822c756 611 lm${g} %r6,$ra,6*$SIZE_T($sp)
76c828c6 612 br $ra
a2a54ffc
AP
613.size AES_decrypt,.-AES_decrypt
614
615.type _s390x_AES_decrypt,\@function
616.align 16
617_s390x_AES_decrypt:
0ab8fd58 618 st${g} $ra,15*$SIZE_T($sp)
a2a54ffc
AP
619 x $s0,0($key)
620 x $s1,4($key)
621 x $s2,8($key)
622 x $s3,12($key)
623 l $rounds,240($key)
8626230a 624 llill $mask,`0xff<<3`
a2a54ffc 625 aghi $rounds,-1
8626230a
AP
626 j .Ldec_loop
627.align 16
a2a54ffc 628.Ldec_loop:
8626230a
AP
629 srlg $t1,$s0,`16-3`
630 srlg $t2,$s0,`8-3`
631 sllg $t3,$s0,`0+3`
a2a54ffc
AP
632 srl $s0,`24-3`
633 nr $s0,$mask
8626230a
AP
634 nr $t1,$mask
635 nr $t2,$mask
636 ngr $t3,$mask
a2a54ffc
AP
637
638 sllg $i1,$s1,`0+3` # i0
639 srlg $i2,$s1,`16-3`
640 srlg $i3,$s1,`8-3`
641 srl $s1,`24-3`
642 ngr $i1,$mask
643 nr $s1,$mask
644 nr $i2,$mask
645 nr $i3,$mask
8626230a
AP
646
647 l $s0,0($s0,$tbl) # Td0[s0>>24]
648 l $t1,3($t1,$tbl) # Td1[s0>>16]
649 l $t2,2($t2,$tbl) # Td2[s0>>8]
650 l $t3,1($t3,$tbl) # Td3[s0>>0]
651
a2a54ffc
AP
652 x $s0,1($i1,$tbl) # Td3[s1>>0]
653 l $s1,0($s1,$tbl) # Td0[s1>>24]
654 x $t2,3($i2,$tbl) # Td1[s1>>16]
655 x $t3,2($i3,$tbl) # Td2[s1>>8]
a2a54ffc
AP
656
657 srlg $i1,$s2,`8-3` # i0
658 sllg $i2,$s2,`0+3` # i1
659 srlg $i3,$s2,`16-3`
660 srl $s2,`24-3`
661 nr $i1,$mask
662 ngr $i2,$mask
663 nr $s2,$mask
664 nr $i3,$mask
8626230a
AP
665
666 xr $s1,$t1
667 srlg $ra,$s3,`8-3` # i1
668 srlg $t1,$s3,`16-3` # i0
669 nr $ra,$mask
670 la $key,16($key)
671 nr $t1,$mask
672
a2a54ffc
AP
673 x $s0,2($i1,$tbl) # Td2[s2>>8]
674 x $s1,1($i2,$tbl) # Td3[s2>>0]
675 l $s2,0($s2,$tbl) # Td0[s2>>24]
676 x $t3,3($i3,$tbl) # Td1[s2>>16]
a2a54ffc 677
a2a54ffc
AP
678 sllg $i3,$s3,`0+3` # i2
679 srl $s3,`24-3`
a2a54ffc
AP
680 ngr $i3,$mask
681 nr $s3,$mask
a2a54ffc 682
8626230a 683 xr $s2,$t2
a2a54ffc
AP
684 x $s0,0($key)
685 x $s1,4($key)
686 x $s2,8($key)
8626230a
AP
687 x $t3,12($key)
688
689 x $s0,3($t1,$tbl) # Td1[s3>>16]
690 x $s1,2($ra,$tbl) # Td2[s3>>8]
691 x $s2,1($i3,$tbl) # Td3[s3>>0]
692 l $s3,0($s3,$tbl) # Td0[s3>>24]
693 xr $s3,$t3
a2a54ffc
AP
694
695 brct $rounds,.Ldec_loop
8626230a 696 .align 16
a2a54ffc
AP
697
698 l $t1,`2048+0`($tbl) # prefetch Td4
8626230a
AP
699 l $t2,`2048+64`($tbl)
700 l $t3,`2048+128`($tbl)
701 l $i1,`2048+192`($tbl)
a2a54ffc
AP
702 llill $mask,0xff
703
704 srlg $i3,$s0,24 # i0
8626230a
AP
705 srlg $t1,$s0,16
706 srlg $t2,$s0,8
a2a54ffc 707 nr $s0,$mask # i3
8626230a
AP
708 nr $t1,$mask
709
710 srlg $i1,$s1,24
711 nr $t2,$mask
712 srlg $i2,$s1,16
713 srlg $ra,$s1,8
714 nr $s1,$mask # i0
a2a54ffc 715 nr $i2,$mask
8626230a
AP
716 nr $ra,$mask
717
a2a54ffc 718 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
8626230a
AP
719 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
720 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
721 sll $t1,16
a2a54ffc
AP
722 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
723 sllg $s0,$i3,24
a2a54ffc
AP
724 sll $t2,8
725
a2a54ffc
AP
726 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
727 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
728 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
a2a54ffc 729 sll $i1,24
8626230a 730 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
a2a54ffc
AP
731 sll $i2,16
732 sll $i3,8
733 or $s0,$s1
734 or $t1,$i1
735 or $t2,$i2
736 or $t3,$i3
737
738 srlg $i1,$s2,8 # i0
739 srlg $i2,$s2,24
740 srlg $i3,$s2,16
741 nr $s2,$mask # i1
742 nr $i1,$mask
743 nr $i3,$mask
744 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
745 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
746 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
747 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
748 sll $i1,8
749 sll $i2,24
a2a54ffc 750 or $s0,$i1
8626230a 751 sll $i3,16
a2a54ffc
AP
752 or $t2,$i2
753 or $t3,$i3
754
755 srlg $i1,$s3,16 # i0
756 srlg $i2,$s3,8 # i1
757 srlg $i3,$s3,24
758 nr $s3,$mask # i2
759 nr $i1,$mask
760 nr $i2,$mask
8626230a 761
0ab8fd58 762 l${g} $ra,15*$SIZE_T($sp)
8626230a
AP
763 or $s1,$t1
764 l $t0,16($key)
765 l $t1,20($key)
766
a2a54ffc
AP
767 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
768 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
8626230a 769 sll $i1,16
a2a54ffc
AP
770 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
771 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
a2a54ffc
AP
772 sll $i2,8
773 sll $s3,24
774 or $s0,$i1
775 or $s1,$i2
776 or $s2,$t2
777 or $s3,$t3
778
8626230a
AP
779 xr $s0,$t0
780 xr $s1,$t1
a2a54ffc
AP
781 x $s2,24($key)
782 x $s3,28($key)
783
609b0852 784 br $ra
a2a54ffc 785.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
8626230a 786___
76c828c6 787
8626230a 788$code.=<<___;
76c828c6
AP
789# void AES_set_encrypt_key(const unsigned char *in, int bits,
790# AES_KEY *key) {
791.globl AES_set_encrypt_key
792.type AES_set_encrypt_key,\@function
793.align 16
794AES_set_encrypt_key:
bc9583ef 795_s390x_AES_set_encrypt_key:
8626230a 796 lghi $t0,0
e822c756 797 cl${g}r $inp,$t0
76c828c6 798 je .Lminus1
e822c756 799 cl${g}r $key,$t0
76c828c6
AP
800 je .Lminus1
801
8626230a
AP
802 lghi $t0,128
803 clr $bits,$t0
804 je .Lproceed
805 lghi $t0,192
806 clr $bits,$t0
807 je .Lproceed
808 lghi $t0,256
809 clr $bits,$t0
810 je .Lproceed
76c828c6
AP
811 lghi %r2,-2
812 br %r14
813
8626230a
AP
814.align 16
815.Lproceed:
a61710b8
AP
816___
817$code.=<<___ if (!$softonly);
af1d6387 818 # convert bits to km(c) code, [128,192,256]->[18,19,20]
8626230a
AP
819 lhi %r5,-128
820 lhi %r0,18
821 ar %r5,$bits
822 srl %r5,6
823 ar %r5,%r0
824
91fdacb2 825 larl %r1,OPENSSL_s390xcap_P
670ad0fb
AP
826 llihh %r0,0x8000
827 srlg %r0,%r0,0(%r5)
bc4e831c
PS
828 ng %r0,S390X_KM(%r1) # check availability of both km...
829 ng %r0,S390X_KMC(%r1) # ...and kmc support for given key length
76c828c6
AP
830 jz .Lekey_internal
831
8626230a
AP
832 lmg %r0,%r1,0($inp) # just copy 128 bits...
833 stmg %r0,%r1,0($key)
834 lhi %r0,192
835 cr $bits,%r0
836 jl 1f
837 lg %r1,16($inp)
838 stg %r1,16($key)
839 je 1f
840 lg %r1,24($inp)
841 stg %r1,24($key)
b1fd0ccb
AP
8421: st $bits,236($key) # save bits [for debugging purposes]
843 lgr $t0,%r5
af1d6387 844 st %r5,240($key) # save km(c) code
76c828c6
AP
845 lghi %r2,0
846 br %r14
a61710b8
AP
847___
848$code.=<<___;
76c828c6
AP
849.align 16
850.Lekey_internal:
b1fd0ccb 851 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
76c828c6 852
a61710b8 853 larl $tbl,AES_Te+2048
76c828c6
AP
854
855 llgf $s0,0($inp)
856 llgf $s1,4($inp)
857 llgf $s2,8($inp)
858 llgf $s3,12($inp)
859 st $s0,0($key)
860 st $s1,4($key)
861 st $s2,8($key)
862 st $s3,12($key)
8626230a
AP
863 lghi $t0,128
864 cr $bits,$t0
76c828c6
AP
865 jne .Lnot128
866
867 llill $mask,0xff
868 lghi $t3,0 # i=0
869 lghi $rounds,10
76c828c6
AP
870 st $rounds,240($key)
871
76c828c6
AP
872 llgfr $t2,$s3 # temp=rk[3]
873 srlg $i1,$s3,8
874 srlg $i2,$s3,16
875 srlg $i3,$s3,24
876 nr $t2,$mask
877 nr $i1,$mask
878 nr $i2,$mask
8626230a
AP
879
880.align 16
881.L128_loop:
76c828c6
AP
882 la $t2,0($t2,$tbl)
883 la $i1,0($i1,$tbl)
884 la $i2,0($i2,$tbl)
885 la $i3,0($i3,$tbl)
886 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
887 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
888 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
889 icm $t2,1,0($i3) # Te4[rk[3]>>24]
890 x $t2,256($t3,$tbl) # rcon[i]
891 xr $s0,$t2 # rk[4]=rk[0]^...
892 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
893 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
894 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
8626230a
AP
895
896 llgfr $t2,$s3 # temp=rk[3]
897 srlg $i1,$s3,8
898 srlg $i2,$s3,16
899 nr $t2,$mask
900 nr $i1,$mask
901 srlg $i3,$s3,24
902 nr $i2,$mask
903
76c828c6
AP
904 st $s0,16($key)
905 st $s1,20($key)
906 st $s2,24($key)
907 st $s3,28($key)
908 la $key,16($key) # key+=4
909 la $t3,4($t3) # i++
910 brct $rounds,.L128_loop
b1fd0ccb 911 lghi $t0,10
76c828c6 912 lghi %r2,0
b1fd0ccb 913 lm${g} %r4,%r13,4*$SIZE_T($sp)
76c828c6
AP
914 br $ra
915
8626230a 916.align 16
76c828c6 917.Lnot128:
8626230a
AP
918 llgf $t0,16($inp)
919 llgf $t1,20($inp)
920 st $t0,16($key)
921 st $t1,20($key)
922 lghi $t0,192
923 cr $bits,$t0
76c828c6
AP
924 jne .Lnot192
925
926 llill $mask,0xff
927 lghi $t3,0 # i=0
928 lghi $rounds,12
929 st $rounds,240($key)
930 lghi $rounds,8
931
8626230a
AP
932 srlg $i1,$t1,8
933 srlg $i2,$t1,16
934 srlg $i3,$t1,24
935 nr $t1,$mask
76c828c6
AP
936 nr $i1,$mask
937 nr $i2,$mask
8626230a
AP
938
939.align 16
940.L192_loop:
941 la $t1,0($t1,$tbl)
76c828c6
AP
942 la $i1,0($i1,$tbl)
943 la $i2,0($i2,$tbl)
944 la $i3,0($i3,$tbl)
8626230a
AP
945 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
946 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
947 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
948 icm $t1,1,0($i3) # Te4[rk[5]>>24]
949 x $t1,256($t3,$tbl) # rcon[i]
950 xr $s0,$t1 # rk[6]=rk[0]^...
76c828c6
AP
951 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
952 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
953 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
8626230a 954
76c828c6
AP
955 st $s0,24($key)
956 st $s1,28($key)
957 st $s2,32($key)
958 st $s3,36($key)
959 brct $rounds,.L192_continue
b1fd0ccb 960 lghi $t0,12
76c828c6 961 lghi %r2,0
b1fd0ccb 962 lm${g} %r4,%r13,4*$SIZE_T($sp)
76c828c6 963 br $ra
8626230a
AP
964
965.align 16
76c828c6 966.L192_continue:
8626230a
AP
967 lgr $t1,$s3
968 x $t1,16($key) # rk[10]=rk[4]^rk[9]
969 st $t1,40($key)
970 x $t1,20($key) # rk[11]=rk[5]^rk[10]
971 st $t1,44($key)
972
973 srlg $i1,$t1,8
974 srlg $i2,$t1,16
975 srlg $i3,$t1,24
976 nr $t1,$mask
977 nr $i1,$mask
978 nr $i2,$mask
979
76c828c6
AP
980 la $key,24($key) # key+=6
981 la $t3,4($t3) # i++
982 j .L192_loop
983
8626230a 984.align 16
76c828c6 985.Lnot192:
8626230a
AP
986 llgf $t0,24($inp)
987 llgf $t1,28($inp)
988 st $t0,24($key)
989 st $t1,28($key)
76c828c6
AP
990 llill $mask,0xff
991 lghi $t3,0 # i=0
992 lghi $rounds,14
993 st $rounds,240($key)
994 lghi $rounds,7
995
8626230a
AP
996 srlg $i1,$t1,8
997 srlg $i2,$t1,16
998 srlg $i3,$t1,24
999 nr $t1,$mask
76c828c6
AP
1000 nr $i1,$mask
1001 nr $i2,$mask
8626230a
AP
1002
1003.align 16
1004.L256_loop:
1005 la $t1,0($t1,$tbl)
76c828c6
AP
1006 la $i1,0($i1,$tbl)
1007 la $i2,0($i2,$tbl)
1008 la $i3,0($i3,$tbl)
8626230a
AP
1009 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1010 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1011 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1012 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1013 x $t1,256($t3,$tbl) # rcon[i]
1014 xr $s0,$t1 # rk[8]=rk[0]^...
76c828c6
AP
1015 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1016 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1017 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1018 st $s0,32($key)
1019 st $s1,36($key)
1020 st $s2,40($key)
1021 st $s3,44($key)
1022 brct $rounds,.L256_continue
b1fd0ccb 1023 lghi $t0,14
76c828c6 1024 lghi %r2,0
b1fd0ccb 1025 lm${g} %r4,%r13,4*$SIZE_T($sp)
76c828c6 1026 br $ra
8626230a
AP
1027
1028.align 16
76c828c6 1029.L256_continue:
8626230a 1030 lgr $t1,$s3 # temp=rk[11]
76c828c6
AP
1031 srlg $i1,$s3,8
1032 srlg $i2,$s3,16
1033 srlg $i3,$s3,24
8626230a 1034 nr $t1,$mask
76c828c6
AP
1035 nr $i1,$mask
1036 nr $i2,$mask
8626230a 1037 la $t1,0($t1,$tbl)
76c828c6
AP
1038 la $i1,0($i1,$tbl)
1039 la $i2,0($i2,$tbl)
1040 la $i3,0($i3,$tbl)
8626230a
AP
1041 llgc $t1,0($t1) # Te4[rk[11]>>0]
1042 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1043 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1044 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1045 x $t1,16($key) # rk[12]=rk[4]^...
1046 st $t1,48($key)
1047 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1048 st $t1,52($key)
1049 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1050 st $t1,56($key)
1051 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1052 st $t1,60($key)
1053
1054 srlg $i1,$t1,8
1055 srlg $i2,$t1,16
1056 srlg $i3,$t1,24
1057 nr $t1,$mask
1058 nr $i1,$mask
1059 nr $i2,$mask
76c828c6
AP
1060
1061 la $key,32($key) # key+=8
1062 la $t3,4($t3) # i++
1063 j .L256_loop
8626230a 1064
76c828c6
AP
1065.Lminus1:
1066 lghi %r2,-1
8626230a 1067 br $ra
76c828c6
AP
1068.size AES_set_encrypt_key,.-AES_set_encrypt_key
1069
1070# void AES_set_decrypt_key(const unsigned char *in, int bits,
1071# AES_KEY *key) {
1072.globl AES_set_decrypt_key
1073.type AES_set_decrypt_key,\@function
1074.align 16
1075AES_set_decrypt_key:
b1fd0ccb
AP
1076 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1077 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
bc9583ef 1078 bras $ra,_s390x_AES_set_encrypt_key
b1fd0ccb 1079 #l${g} $key,4*$SIZE_T($sp)
e822c756 1080 l${g} $ra,14*$SIZE_T($sp)
76c828c6
AP
1081 ltgr %r2,%r2
1082 bnzr $ra
a61710b8
AP
1083___
1084$code.=<<___ if (!$softonly);
b1fd0ccb 1085 #l $t0,240($key)
8626230a
AP
1086 lhi $t1,16
1087 cr $t0,$t1
1088 jl .Lgo
e21a8430 1089 oill $t0,S390X_DECRYPT # set "decrypt" bit
8626230a 1090 st $t0,240($key)
76c828c6 1091 br $ra
a61710b8
AP
1092___
1093$code.=<<___;
b1fd0ccb
AP
1094.align 16
1095.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
96b0f6c1 1096 la $i1,0($key) # front cursor: first round key
76c828c6 1097 sllg $i2,$rounds,4 # rounds*16 = byte offset of the last round key
96b0f6c1 1098 la $i2,0($i2,$key) # back cursor: last round key
76c828c6 1099 srl $rounds,1 # rounds/2 swaps; the middle key stays in place
8626230a 1100 lghi $t1,-16 # step used to move the back cursor down
76c828c6 1101
8626230a 1102.align 16
96b0f6c1
AP
1103.Linv: lmg $s0,$s1,0($i1) # 16 bytes at the front cursor
1104 lmg $s2,$s3,0($i2) # 16 bytes at the back cursor
1105 stmg $s0,$s1,0($i2) # exchange the two round keys
1106 stmg $s2,$s3,0($i1)
8626230a
AP
1107 la $i1,16($i1) # front cursor forward by one round key
1108 la $i2,0($t1,$i2) # back cursor backward by one round key
76c828c6
AP
1109 brct $rounds,.Linv # loop until the swap counter hits zero
1110___
1111$mask80=$i1;
1112$mask1b=$i2;
1113$maskfe=$i3;
1114$code.=<<___;
1115 llgf $rounds,240($key)
1116 aghi $rounds,-1
1117 sll $rounds,2 # (rounds-1)*4
1118 llilh $mask80,0x8080
76c828c6 1119 llilh $mask1b,0x1b1b
76c828c6 1120 llilh $maskfe,0xfefe
8626230a
AP
1121 oill $mask80,0x8080
1122 oill $mask1b,0x1b1b
76c828c6
AP
1123 oill $maskfe,0xfefe
1124
8626230a 1125.align 16
76c828c6
AP
1126.Lmix: l $s0,16($key) # tp1
1127 lr $s1,$s0
1128 ngr $s1,$mask80
1129 srlg $t1,$s1,7
1130 slr $s1,$t1
1131 nr $s1,$mask1b
1132 sllg $t1,$s0,1
1133 nr $t1,$maskfe
1134 xr $s1,$t1 # tp2
1135
1136 lr $s2,$s1
1137 ngr $s2,$mask80
1138 srlg $t1,$s2,7
1139 slr $s2,$t1
1140 nr $s2,$mask1b
1141 sllg $t1,$s1,1
1142 nr $t1,$maskfe
1143 xr $s2,$t1 # tp4
1144
1145 lr $s3,$s2
1146 ngr $s3,$mask80
1147 srlg $t1,$s3,7
1148 slr $s3,$t1
1149 nr $s3,$mask1b
1150 sllg $t1,$s2,1
1151 nr $t1,$maskfe
1152 xr $s3,$t1 # tp8
1153
1154 xr $s1,$s0 # tp2^tp1
1155 xr $s2,$s0 # tp4^tp1
1156 rll $s0,$s0,24 # = ROTATE(tp1,8)
8626230a 1157 xr $s2,$s3 # ^=tp8
76c828c6 1158 xr $s0,$s1 # ^=tp2^tp1
76c828c6 1159 xr $s1,$s3 # tp2^tp1^tp8
8626230a 1160 xr $s0,$s2 # ^=tp4^tp1^tp8
76c828c6 1161 rll $s1,$s1,8
76c828c6 1162 rll $s2,$s2,16
8626230a 1163 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
76c828c6 1164 rll $s3,$s3,24
8626230a 1165 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
76c828c6
AP
1166 xr $s0,$s3 # ^= ROTATE(tp8,8)
1167
1168 st $s0,16($key)
1169 la $key,4($key)
1170 brct $rounds,.Lmix
1171
e822c756 1172 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
76c828c6
AP
1173 lghi %r2,0
1174 br $ra
1175.size AES_set_decrypt_key,.-AES_set_decrypt_key
8626230a
AP
1176___
1177
0ab8fd58
AP
1178########################################################################
1179# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
8626230a
AP
1180# size_t length, const AES_KEY *key,
1181# unsigned char *ivec, const int enc)
1182{
1183my $inp="%r2";
1184my $out="%r4"; # length and out are swapped
1185my $len="%r3";
1186my $key="%r5";
1187my $ivp="%r6";
1188
1189$code.=<<___;
1190.globl AES_cbc_encrypt
1191.type AES_cbc_encrypt,\@function
1192.align 16
1193AES_cbc_encrypt:
1194 xgr %r3,%r4 # flip %r3 and %r4, out and len
1195 xgr %r4,%r3
1196 xgr %r3,%r4
1197___
1198$code.=<<___ if (!$softonly);
1199 lhi %r0,16
1200 cl %r0,240($key)
1201 jh .Lcbc_software
1202
1203 lg %r0,0($ivp) # copy ivec
1204 lg %r1,8($ivp)
1205 stmg %r0,%r1,16($sp)
1206 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1207 stmg %r0,%r1,32($sp)
1208 lmg %r0,%r1,16($key)
1209 stmg %r0,%r1,48($sp)
1210 l %r0,240($key) # load kmc code
1211 lghi $key,15 # res=len%16, len-=res;
1212 ngr $key,$len
e822c756 1213 sl${g}r $len,$key
8626230a
AP
1214 la %r1,16($sp) # parameter block - ivec || key
1215 jz .Lkmc_truncated
1216 .long 0xb92f0042 # kmc %r4,%r2
1217 brc 1,.-4 # pay attention to "partial completion"
1218 ltr $key,$key
1219 jnz .Lkmc_truncated
1220.Lkmc_done:
1221 lmg %r0,%r1,16($sp) # copy ivec to caller
1222 stg %r0,0($ivp)
1223 stg %r1,8($ivp)
1224 br $ra
1225.align 16
1226.Lkmc_truncated:
1227 ahi $key,-1 # it's the way it's encoded in mvc
e21a8430 1228 tmll %r0,S390X_DECRYPT
8626230a
AP
1229 jnz .Lkmc_truncated_dec
1230 lghi %r1,0
e822c756
AP
1231 stg %r1,16*$SIZE_T($sp)
1232 stg %r1,16*$SIZE_T+8($sp)
8626230a 1233 bras %r1,1f
e822c756 1234 mvc 16*$SIZE_T(1,$sp),0($inp)
8626230a
AP
12351: ex $key,0(%r1)
1236 la %r1,16($sp) # restore parameter block
e822c756 1237 la $inp,16*$SIZE_T($sp)
8626230a
AP
1238 lghi $len,16
1239 .long 0xb92f0042 # kmc %r4,%r2
1240 j .Lkmc_done
1241.align 16
1242.Lkmc_truncated_dec:
e822c756
AP
1243 st${g} $out,4*$SIZE_T($sp)
1244 la $out,16*$SIZE_T($sp)
8626230a
AP
1245 lghi $len,16
1246 .long 0xb92f0042 # kmc %r4,%r2
e822c756 1247 l${g} $out,4*$SIZE_T($sp)
8626230a 1248 bras %r1,2f
e822c756 1249 mvc 0(1,$out),16*$SIZE_T($sp)
8626230a
AP
12502: ex $key,0(%r1)
1251 j .Lkmc_done
1252.align 16
1253.Lcbc_software:
1254___
1255$code.=<<___;
e822c756 1256 stm${g} $key,$ra,5*$SIZE_T($sp)
8626230a 1257 lhi %r0,0
e822c756 1258 cl %r0,`$stdframe+$SIZE_T-4`($sp)
8626230a
AP
1259 je .Lcbc_decrypt
1260
1261 larl $tbl,AES_Te
1262
1263 llgf $s0,0($ivp)
1264 llgf $s1,4($ivp)
1265 llgf $s2,8($ivp)
1266 llgf $s3,12($ivp)
1267
1268 lghi $t0,16
e822c756 1269 sl${g}r $len,$t0
8626230a
AP
1270 brc 4,.Lcbc_enc_tail # if borrow
1271.Lcbc_enc_loop:
e822c756 1272 stm${g} $inp,$out,2*$SIZE_T($sp)
8626230a
AP
1273 x $s0,0($inp)
1274 x $s1,4($inp)
1275 x $s2,8($inp)
1276 x $s3,12($inp)
1277 lgr %r4,$key
1278
1279 bras $ra,_s390x_AES_encrypt
1280
e822c756 1281 lm${g} $inp,$key,2*$SIZE_T($sp)
8626230a
AP
1282 st $s0,0($out)
1283 st $s1,4($out)
1284 st $s2,8($out)
1285 st $s3,12($out)
1286
1287 la $inp,16($inp)
1288 la $out,16($out)
1289 lghi $t0,16
e822c756 1290 lt${g}r $len,$len
8626230a 1291 jz .Lcbc_enc_done
e822c756 1292 sl${g}r $len,$t0
8626230a
AP
1293 brc 4,.Lcbc_enc_tail # if borrow
1294 j .Lcbc_enc_loop
1295.align 16
1296.Lcbc_enc_done:
e822c756 1297 l${g} $ivp,6*$SIZE_T($sp)
8626230a 1298 st $s0,0($ivp)
609b0852 1299 st $s1,4($ivp)
8626230a
AP
1300 st $s2,8($ivp)
1301 st $s3,12($ivp)
1302
e822c756 1303 lm${g} %r7,$ra,7*$SIZE_T($sp)
8626230a
AP
1304 br $ra
1305
1306.align 16
1307.Lcbc_enc_tail:
1308 aghi $len,15
1309 lghi $t0,0
e822c756
AP
1310 stg $t0,16*$SIZE_T($sp)
1311 stg $t0,16*$SIZE_T+8($sp)
8626230a 1312 bras $t1,3f
e822c756 1313 mvc 16*$SIZE_T(1,$sp),0($inp)
8626230a
AP
13143: ex $len,0($t1)
1315 lghi $len,0
e822c756 1316 la $inp,16*$SIZE_T($sp)
8626230a
AP
1317 j .Lcbc_enc_loop
1318
1319.align 16
1320.Lcbc_decrypt:
1321 larl $tbl,AES_Td
1322
1323 lg $t0,0($ivp)
1324 lg $t1,8($ivp)
e822c756 1325 stmg $t0,$t1,16*$SIZE_T($sp)
8626230a
AP
1326
1327.Lcbc_dec_loop:
e822c756 1328 stm${g} $inp,$out,2*$SIZE_T($sp)
8626230a
AP
1329 llgf $s0,0($inp)
1330 llgf $s1,4($inp)
1331 llgf $s2,8($inp)
1332 llgf $s3,12($inp)
1333 lgr %r4,$key
1334
1335 bras $ra,_s390x_AES_decrypt
1336
e822c756 1337 lm${g} $inp,$key,2*$SIZE_T($sp)
8626230a
AP
1338 sllg $s0,$s0,32
1339 sllg $s2,$s2,32
1340 lr $s0,$s1
1341 lr $s2,$s3
1342
1343 lg $t0,0($inp)
1344 lg $t1,8($inp)
e822c756
AP
1345 xg $s0,16*$SIZE_T($sp)
1346 xg $s2,16*$SIZE_T+8($sp)
8626230a 1347 lghi $s1,16
e822c756 1348 sl${g}r $len,$s1
8626230a
AP
1349 brc 4,.Lcbc_dec_tail # if borrow
1350 brc 2,.Lcbc_dec_done # if zero
1351 stg $s0,0($out)
1352 stg $s2,8($out)
e822c756 1353 stmg $t0,$t1,16*$SIZE_T($sp)
8626230a
AP
1354
1355 la $inp,16($inp)
1356 la $out,16($out)
1357 j .Lcbc_dec_loop
1358
1359.Lcbc_dec_done:
1360 stg $s0,0($out)
1361 stg $s2,8($out)
1362.Lcbc_dec_exit:
e822c756 1363 lm${g} %r6,$ra,6*$SIZE_T($sp)
8626230a
AP
1364 stmg $t0,$t1,0($ivp)
1365
1366 br $ra
1367
1368.align 16
1369.Lcbc_dec_tail:
1370 aghi $len,15
e822c756
AP
1371 stg $s0,16*$SIZE_T($sp)
1372 stg $s2,16*$SIZE_T+8($sp)
8626230a 1373 bras $s1,4f
e822c756 1374 mvc 0(1,$out),16*$SIZE_T($sp)
8626230a
AP
13754: ex $len,0($s1)
1376 j .Lcbc_dec_exit
1377.size AES_cbc_encrypt,.-AES_cbc_encrypt
874a3757
AP
1378___
1379}
0ab8fd58
AP
1380########################################################################
1381# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
874a3757
AP
1382# size_t blocks, const AES_KEY *key,
1383# const unsigned char *ivec)
1384{
1385my $inp="%r2";
0ab8fd58
AP
1386my $out="%r4"; # blocks and out are swapped
1387my $len="%r3";
874a3757
AP
1388my $key="%r5"; my $iv0="%r5";
1389my $ivp="%r6";
1390my $fp ="%r7";
1391
1392$code.=<<___;
1393.globl AES_ctr32_encrypt
1394.type AES_ctr32_encrypt,\@function
1395.align 16
1396AES_ctr32_encrypt:
0ab8fd58
AP
1397 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1398 xgr %r4,%r3
1399 xgr %r3,%r4
e822c756 1400 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
874a3757
AP
1401___
1402$code.=<<___ if (!$softonly);
1403 l %r0,240($key)
1404 lhi %r1,16
1405 clr %r0,%r1
1406 jl .Lctr32_software
1407
1c3a23e4
PS
1408 st${g} $s2,10*$SIZE_T($sp)
1409 st${g} $s3,11*$SIZE_T($sp)
1410
1411 clr $len,%r1 # does work even in 64-bit mode
1412 jle .Lctr32_nokma # kma is slower for <= 16 blocks
1413
1414 larl %r1,OPENSSL_s390xcap_P
1415 lr $s2,%r0
1416 llihh $s3,0x8000
1417 srlg $s3,$s3,0($s2)
1418 ng $s3,S390X_KMA(%r1) # check kma capability vector
1419 jz .Lctr32_nokma
1420
1421 l${g}hi %r1,-$stdframe-112
1422 l${g}r $s3,$sp
1423 la $sp,0(%r1,$sp) # prepare parameter block
1424
1425 lhi %r1,0x0600
1426 sllg $len,$len,4
1427 or %r0,%r1 # set HS and LAAD flags
1428
1429 st${g} $s3,0($sp) # backchain
1430 la %r1,$stdframe($sp)
1431
1432 lmg $s2,$s3,0($key) # copy key
1433 stg $s2,$stdframe+80($sp)
1434 stg $s3,$stdframe+88($sp)
1435 lmg $s2,$s3,16($key)
1436 stg $s2,$stdframe+96($sp)
1437 stg $s3,$stdframe+104($sp)
1438
1439 lmg $s2,$s3,0($ivp) # copy iv
1440 stg $s2,$stdframe+64($sp)
1441 ahi $s3,-1 # kma requires counter-1
1442 stg $s3,$stdframe+72($sp)
1443 st $s3,$stdframe+12($sp) # copy counter
1444
1445 lghi $s2,0 # no AAD
1446 lghi $s3,0
1447
1448 .long 0xb929a042 # kma $out,$s2,$inp
1449 brc 1,.-4 # pay attention to "partial completion"
1450
1451 stg %r0,$stdframe+80($sp) # wipe key
1452 stg %r0,$stdframe+88($sp)
1453 stg %r0,$stdframe+96($sp)
1454 stg %r0,$stdframe+104($sp)
1455 la $sp,$stdframe+112($sp)
1456
1457 lm${g} $s2,$s3,10*$SIZE_T($sp)
1458 br $ra
1459
1460.align 16
1461.Lctr32_nokma:
1462 stm${g} %r6,$s1,6*$SIZE_T($sp)
874a3757
AP
1463
1464 slgr $out,$inp
1465 la %r1,0($key) # %r1 is permanent copy of $key
1466 lg $iv0,0($ivp) # load ivec
1467 lg $ivp,8($ivp)
1468
26064d7f
AP
1469 # prepare and allocate stack frame at the top of 4K page
1470 # with 1K reserved for eventual signal handling
1471 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
874a3757 1472 lghi $s1,-4096
874a3757 1473 algr $s0,$sp
26064d7f 1474 lgr $fp,$sp
874a3757 1475 ngr $s0,$s1 # align at page boundary
26064d7f
AP
1476 slgr $fp,$s0 # total buffer size
1477 lgr $s2,$sp
1478 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1479 slgr $fp,$s1 # deduct reservation to get usable buffer size
1480 # buffer size is at least 256 and at most 3072+256-16
1481
1482 la $sp,1024($s0) # alloca
1483 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
e822c756
AP
1484 st${g} $s2,0($sp) # back-chain
1485 st${g} $fp,$SIZE_T($sp)
874a3757
AP
1486
1487 slgr $len,$fp
0ab8fd58 1488 brc 1,.Lctr32_hw_switch # not zero, no borrow
26064d7f 1489 algr $fp,$len # input is shorter than allocated buffer
874a3757 1490 lghi $len,0
e822c756 1491 st${g} $fp,$SIZE_T($sp)
874a3757 1492
0ab8fd58
AP
1493.Lctr32_hw_switch:
1494___
4c5100ce 1495$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
0ab8fd58
AP
1496 llgfr $s0,%r0
1497 lgr $s1,%r1
670ad0fb 1498 larl %r1,OPENSSL_s390xcap_P
0ab8fd58
AP
1499 llihh %r0,0x8000 # check if kmctr supports the function code
1500 srlg %r0,%r0,0($s0)
bc4e831c 1501 ng %r0,S390X_KMCTR(%r1) # check kmctr capability vector
0ab8fd58
AP
1502 lgr %r0,$s0
1503 lgr %r1,$s1
1504 jz .Lctr32_km_loop
1505
1506####### kmctr code
1507 algr $out,$inp # restore $out
1508 lgr $s1,$len # $s1 undertakes $len
1509 j .Lctr32_kmctr_loop
1510.align 16
1511.Lctr32_kmctr_loop:
1512 la $s2,16($sp)
1513 lgr $s3,$fp
1514.Lctr32_kmctr_prepare:
1515 stg $iv0,0($s2)
1516 stg $ivp,8($s2)
1517 la $s2,16($s2)
1518 ahi $ivp,1 # 32-bit increment, preserves upper half
1519 brct $s3,.Lctr32_kmctr_prepare
1520
1521 #la $inp,0($inp) # inp
1522 sllg $len,$fp,4 # len
1523 #la $out,0($out) # out
1524 la $s2,16($sp) # iv
1525 .long 0xb92da042 # kmctr $out,$s2,$inp
1526 brc 1,.-4 # pay attention to "partial completion"
1527
1528 slgr $s1,$fp
1529 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1530 algr $fp,$s1
1531 lghi $s1,0
1532 brc 4+1,.Lctr32_kmctr_loop # not zero
1533
1534 l${g} $sp,0($sp)
1535 lm${g} %r6,$s3,6*$SIZE_T($sp)
1536 br $ra
1537.align 16
1538___
4c5100ce 1539$code.=<<___ if (!$softonly);
0ab8fd58 1540.Lctr32_km_loop:
874a3757
AP
1541 la $s2,16($sp)
1542 lgr $s3,$fp
0ab8fd58 1543.Lctr32_km_prepare:
874a3757
AP
1544 stg $iv0,0($s2)
1545 stg $ivp,8($s2)
1546 la $s2,16($s2)
1547 ahi $ivp,1 # 32-bit increment, preserves upper half
0ab8fd58 1548 brct $s3,.Lctr32_km_prepare
874a3757
AP
1549
1550 la $s0,16($sp) # inp
1551 sllg $s1,$fp,4 # len
1552 la $s2,16($sp) # out
1553 .long 0xb92e00a8 # km %r10,%r8
1554 brc 1,.-4 # pay attention to "partial completion"
1555
1556 la $s2,16($sp)
1557 lgr $s3,$fp
1558 slgr $s2,$inp
0ab8fd58 1559.Lctr32_km_xor:
874a3757
AP
1560 lg $s0,0($inp)
1561 lg $s1,8($inp)
1562 xg $s0,0($s2,$inp)
1563 xg $s1,8($s2,$inp)
1564 stg $s0,0($out,$inp)
1565 stg $s1,8($out,$inp)
1566 la $inp,16($inp)
0ab8fd58 1567 brct $s3,.Lctr32_km_xor
874a3757
AP
1568
1569 slgr $len,$fp
0ab8fd58 1570 brc 1,.Lctr32_km_loop # not zero, no borrow
874a3757
AP
1571 algr $fp,$len
1572 lghi $len,0
0ab8fd58 1573 brc 4+1,.Lctr32_km_loop # not zero
874a3757 1574
e822c756
AP
1575 l${g} $s0,0($sp)
1576 l${g} $s1,$SIZE_T($sp)
874a3757 1577 la $s2,16($sp)
0ab8fd58 1578.Lctr32_km_zap:
874a3757
AP
1579 stg $s0,0($s2)
1580 stg $s0,8($s2)
1581 la $s2,16($s2)
0ab8fd58 1582 brct $s1,.Lctr32_km_zap
874a3757
AP
1583
1584 la $sp,0($s0)
e822c756 1585 lm${g} %r6,$s3,6*$SIZE_T($sp)
874a3757
AP
1586 br $ra
1587.align 16
1588.Lctr32_software:
1589___
1590$code.=<<___;
e822c756 1591 stm${g} $key,$ra,5*$SIZE_T($sp)
0ab8fd58 1592 sl${g}r $inp,$out
874a3757
AP
1593 larl $tbl,AES_Te
1594 llgf $t1,12($ivp)
1595
1596.Lctr32_loop:
0ab8fd58 1597 stm${g} $inp,$out,2*$SIZE_T($sp)
874a3757
AP
1598 llgf $s0,0($ivp)
1599 llgf $s1,4($ivp)
1600 llgf $s2,8($ivp)
1601 lgr $s3,$t1
e822c756 1602 st $t1,16*$SIZE_T($sp)
874a3757
AP
1603 lgr %r4,$key
1604
1605 bras $ra,_s390x_AES_encrypt
1606
e822c756
AP
1607 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1608 llgf $t1,16*$SIZE_T($sp)
0ab8fd58
AP
1609 x $s0,0($inp,$out)
1610 x $s1,4($inp,$out)
1611 x $s2,8($inp,$out)
1612 x $s3,12($inp,$out)
1613 stm $s0,$s3,0($out)
1614
1615 la $out,16($out)
1616 ahi $t1,1 # 32-bit increment
1617 brct $len,.Lctr32_loop
1618
1619 lm${g} %r6,$ra,6*$SIZE_T($sp)
1620 br $ra
1621.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1622___
1623}
1624
1625########################################################################
96cce820
PS
1626# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1627# size_t len, const AES_KEY *key1, const AES_KEY *key2,
0c237e42 1628# const unsigned char iv[16]);
0ab8fd58
AP
1629#
1630{
1631my $inp="%r2";
1632my $out="%r4"; # len and out are swapped
1633my $len="%r3";
1634my $key1="%r5"; # $i1
1635my $key2="%r6"; # $i2
1636my $fp="%r7"; # $i3
1637my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1638
1639$code.=<<___;
1640.type _s390x_xts_km,\@function
1641.align 16
1642_s390x_xts_km:
1643___
0c237e42 1644$code.=<<___ if(1);
0ab8fd58
AP
1645 llgfr $s0,%r0 # put aside the function code
1646 lghi $s1,0x7f
1647 nr $s1,%r0
670ad0fb
AP
1648 larl %r1,OPENSSL_s390xcap_P
1649 llihh %r0,0x8000
1650 srlg %r0,%r0,32($s1) # check for 32+function code
bc4e831c 1651 ng %r0,S390X_KM(%r1) # check km capability vector
0ab8fd58
AP
1652 lgr %r0,$s0 # restore the function code
1653 la %r1,0($key1) # restore $key1
1654 jz .Lxts_km_vanilla
1655
1656 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1657 algr $out,$inp
1658
1659 oill %r0,32 # switch to xts function code
1660 aghi $s1,-18 #
1661 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1662 la %r1,$tweak-16($sp)
1663 slgr %r1,$s1 # parameter block position
1664 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1665 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1666 # yes, it contains junk and overlaps
1667 # with the tweak in 128-bit case.
1668 # it's done to avoid conditional
1669 # branch.
1670 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1671
1672 .long 0xb92e0042 # km %r4,%r2
1673 brc 1,.-4 # pay attention to "partial completion"
1674
1675 lrvg $s0,$tweak+0($sp) # load the last tweak
1676 lrvg $s1,$tweak+8($sp)
8df400cf 1677 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
0ab8fd58
AP
1678
1679 nill %r0,0xffdf # switch back to original function code
1680 la %r1,0($key1) # restore pointer to $key1
1681 slgr $out,$inp
1682
1683 llgc $len,2*$SIZE_T-1($sp)
1684 nill $len,0x0f # $len%=16
1685 br $ra
609b0852 1686
0ab8fd58
AP
1687.align 16
1688.Lxts_km_vanilla:
1689___
1690$code.=<<___;
1691 # prepare and allocate stack frame at the top of 4K page
1692 # with 1K reserved for eventual signal handling
1693 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1694 lghi $s1,-4096
1695 algr $s0,$sp
1696 lgr $fp,$sp
1697 ngr $s0,$s1 # align at page boundary
1698 slgr $fp,$s0 # total buffer size
1699 lgr $s2,$sp
1700 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1701 slgr $fp,$s1 # deduct reservation to get usable buffer size
1702 # buffer size is at least 256 and at most 3072+256-16
1703
1704 la $sp,1024($s0) # alloca
1705 nill $fp,0xfff0 # round to 16*n
1706 st${g} $s2,0($sp) # back-chain
1707 nill $len,0xfff0 # redundant
1708 st${g} $fp,$SIZE_T($sp)
1709
1710 slgr $len,$fp
1711 brc 1,.Lxts_km_go # not zero, no borrow
1712 algr $fp,$len # input is shorter than allocated buffer
1713 lghi $len,0
1714 st${g} $fp,$SIZE_T($sp)
1715
1716.Lxts_km_go:
1717 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1718 lrvg $s1,$tweak+8($s2)
1719
1720 la $s2,16($sp) # vector of ascending tweak values
1721 slgr $s2,$inp
1722 srlg $s3,$fp,4
1723 j .Lxts_km_start
1724
1725.Lxts_km_loop:
1726 la $s2,16($sp)
1727 slgr $s2,$inp
1728 srlg $s3,$fp,4
1729.Lxts_km_prepare:
1730 lghi $i1,0x87
1731 srag $i2,$s1,63 # broadcast upper bit
1732 ngr $i1,$i2 # rem
c3cddeae
AP
1733 algr $s0,$s0
1734 alcgr $s1,$s1
0ab8fd58 1735 xgr $s0,$i1
0ab8fd58
AP
1736.Lxts_km_start:
1737 lrvgr $i1,$s0 # flip byte order
1738 lrvgr $i2,$s1
1739 stg $i1,0($s2,$inp)
1740 stg $i2,8($s2,$inp)
1741 xg $i1,0($inp)
1742 xg $i2,8($inp)
1743 stg $i1,0($out,$inp)
1744 stg $i2,8($out,$inp)
1745 la $inp,16($inp)
1746 brct $s3,.Lxts_km_prepare
1747
1748 slgr $inp,$fp # rewind $inp
1749 la $s2,0($out,$inp)
1750 lgr $s3,$fp
1751 .long 0xb92e00aa # km $s2,$s2
1752 brc 1,.-4 # pay attention to "partial completion"
1753
1754 la $s2,16($sp)
1755 slgr $s2,$inp
1756 srlg $s3,$fp,4
1757.Lxts_km_xor:
1758 lg $i1,0($out,$inp)
1759 lg $i2,8($out,$inp)
1760 xg $i1,0($s2,$inp)
1761 xg $i2,8($s2,$inp)
1762 stg $i1,0($out,$inp)
1763 stg $i2,8($out,$inp)
1764 la $inp,16($inp)
1765 brct $s3,.Lxts_km_xor
1766
1767 slgr $len,$fp
1768 brc 1,.Lxts_km_loop # not zero, no borrow
1769 algr $fp,$len
1770 lghi $len,0
1771 brc 4+1,.Lxts_km_loop # not zero
1772
1773 l${g} $i1,0($sp) # back-chain
1774 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1775 la $i2,16($sp)
1776 srlg $fp,$fp,4
1777.Lxts_km_zap:
1778 stg $i1,0($i2)
1779 stg $i1,8($i2)
1780 la $i2,16($i2)
1781 brct $fp,.Lxts_km_zap
1782
1783 la $sp,0($i1)
1784 llgc $len,2*$SIZE_T-1($i1)
1785 nill $len,0x0f # $len%=16
1786 bzr $ra
1787
1788 # generate one more tweak...
1789 lghi $i1,0x87
1790 srag $i2,$s1,63 # broadcast upper bit
1791 ngr $i1,$i2 # rem
c3cddeae
AP
1792 algr $s0,$s0
1793 alcgr $s1,$s1
0ab8fd58 1794 xgr $s0,$i1
0ab8fd58
AP
1795
1796 ltr $len,$len # clear zero flag
1797 br $ra
1798.size _s390x_xts_km,.-_s390x_xts_km
1799
1800.globl AES_xts_encrypt
1801.type AES_xts_encrypt,\@function
1802.align 16
1803AES_xts_encrypt:
1804 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1805 xgr %r4,%r3
1806 xgr %r3,%r4
1807___
1808$code.=<<___ if ($SIZE_T==4);
1809 llgfr $len,$len
1810___
1811$code.=<<___;
1812 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1813 srag $len,$len,4 # formally wrong, because it expands
1814 # sign byte, but who can afford asking
1815 # to process more than 2^63-1 bytes?
1816 # I use it, because it sets condition
1817 # code...
1818 bcr 8,$ra # abort if zero (i.e. less than 16)
1819___
1820$code.=<<___ if (!$softonly);
1821 llgf %r0,240($key2)
1822 lhi %r1,16
1823 clr %r0,%r1
1824 jl .Lxts_enc_software
1825
8df400cf 1826 st${g} $ra,5*$SIZE_T($sp)
0ab8fd58 1827 stm${g} %r6,$s3,6*$SIZE_T($sp)
0ab8fd58
AP
1828
1829 sllg $len,$len,4 # $len&=~15
1830 slgr $out,$inp
1831
0c237e42
AP
1832 # generate the tweak value
1833 l${g} $s3,$stdframe($sp) # pointer to iv
0ab8fd58 1834 la $s2,$tweak($sp)
0c237e42 1835 lmg $s0,$s1,0($s3)
0ab8fd58
AP
1836 lghi $s3,16
1837 stmg $s0,$s1,0($s2)
1838 la %r1,0($key2) # $key2 is not needed anymore
1839 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1840 brc 1,.-4 # can this happen?
1841
1842 l %r0,240($key1)
1843 la %r1,0($key1) # $key1 is not needed anymore
1844 bras $ra,_s390x_xts_km
1845 jz .Lxts_enc_km_done
1846
1847 aghi $inp,-16 # take one step back
1848 la $i3,0($out,$inp) # put aside real $out
1849.Lxts_enc_km_steal:
1850 llgc $i1,16($inp)
1851 llgc $i2,0($out,$inp)
1852 stc $i1,0($out,$inp)
1853 stc $i2,16($out,$inp)
1854 la $inp,1($inp)
1855 brct $len,.Lxts_enc_km_steal
1856
1857 la $s2,0($i3)
1858 lghi $s3,16
1859 lrvgr $i1,$s0 # flip byte order
1860 lrvgr $i2,$s1
1861 xg $i1,0($s2)
1862 xg $i2,8($s2)
1863 stg $i1,0($s2)
1864 stg $i2,8($s2)
1865 .long 0xb92e00aa # km $s2,$s2
1866 brc 1,.-4 # can this happen?
1867 lrvgr $i1,$s0 # flip byte order
1868 lrvgr $i2,$s1
1869 xg $i1,0($i3)
1870 xg $i2,8($i3)
1871 stg $i1,0($i3)
1872 stg $i2,8($i3)
1873
1874.Lxts_enc_km_done:
8df400cf
AP
1875 stg $sp,$tweak+0($sp) # wipe tweak
1876 stg $sp,$tweak+8($sp)
1877 l${g} $ra,5*$SIZE_T($sp)
0ab8fd58
AP
1878 lm${g} %r6,$s3,6*$SIZE_T($sp)
1879 br $ra
1880.align 16
1881.Lxts_enc_software:
1882___
1883$code.=<<___;
1884 stm${g} %r6,$ra,6*$SIZE_T($sp)
1885
1886 slgr $out,$inp
1887
c3cddeae
AP
1888 l${g} $s3,$stdframe($sp) # ivp
1889 llgf $s0,0($s3) # load iv
1890 llgf $s1,4($s3)
1891 llgf $s2,8($s3)
1892 llgf $s3,12($s3)
0ab8fd58
AP
1893 stm${g} %r2,%r5,2*$SIZE_T($sp)
1894 la $key,0($key2)
1895 larl $tbl,AES_Te
1896 bras $ra,_s390x_AES_encrypt # generate the tweak
1897 lm${g} %r2,%r5,2*$SIZE_T($sp)
1898 stm $s0,$s3,$tweak($sp) # save the tweak
1899 j .Lxts_enc_enter
1900
1901.align 16
1902.Lxts_enc_loop:
1903 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1904 lrvg $s3,$tweak+8($sp)
1905 lghi %r1,0x87
1906 srag %r0,$s3,63 # broadcast upper bit
1907 ngr %r1,%r0 # rem
c3cddeae
AP
1908 algr $s1,$s1
1909 alcgr $s3,$s3
0ab8fd58 1910 xgr $s1,%r1
0ab8fd58
AP
1911 lrvgr $s1,$s1 # flip byte order
1912 lrvgr $s3,$s3
609b0852 1913 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
0ab8fd58
AP
1914 stg $s1,$tweak+0($sp) # save the tweak
1915 llgfr $s1,$s1
1916 srlg $s2,$s3,32
1917 stg $s3,$tweak+8($sp)
1918 llgfr $s3,$s3
1919 la $inp,16($inp) # $inp+=16
1920.Lxts_enc_enter:
1921 x $s0,0($inp) # ^=*($inp)
874a3757
AP
1922 x $s1,4($inp)
1923 x $s2,8($inp)
1924 x $s3,12($inp)
0ab8fd58
AP
1925 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1926 la $key,0($key1)
1927 bras $ra,_s390x_AES_encrypt
1928 lm${g} %r2,%r5,2*$SIZE_T($sp)
1929 x $s0,$tweak+0($sp) # ^=tweak
1930 x $s1,$tweak+4($sp)
1931 x $s2,$tweak+8($sp)
1932 x $s3,$tweak+12($sp)
874a3757
AP
1933 st $s0,0($out,$inp)
1934 st $s1,4($out,$inp)
1935 st $s2,8($out,$inp)
1936 st $s3,12($out,$inp)
0ab8fd58
AP
1937 brct${g} $len,.Lxts_enc_loop
1938
1939 llgc $len,`2*$SIZE_T-1`($sp)
1940 nill $len,0x0f # $len%16
1941 jz .Lxts_enc_done
1942
1943 la $i3,0($inp,$out) # put aside real $out
1944.Lxts_enc_steal: # ciphertext stealing: one byte per iteration, len%16 iterations
1945 llgc %r0,16($inp) # next byte of the partial input tail
1946 llgc %r1,0($out,$inp) # matching byte of the last full output block
1947 stc %r0,0($out,$inp) # tail byte takes its place in that block
1948 stc %r1,16($out,$inp) # displaced byte becomes the short final output
1949 la $inp,1($inp)
1950 brct $len,.Lxts_enc_steal
1951 la $out,0($i3) # restore real $out
1952
1953 # generate last tweak...
1954 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1955 lrvg $s3,$tweak+8($sp)
1956 lghi %r1,0x87
1957 srag %r0,$s3,63 # broadcast upper bit
1958 ngr %r1,%r0 # rem
c3cddeae
AP
1959 algr $s1,$s1 # shift the 128-bit tweak left by one: low half first
1960 alcgr $s3,$s3 # high half, taking the carry out of the low half
0ab8fd58 1961 xgr $s1,%r1 # fold 0x87 in when the top bit was set before the shift
0ab8fd58
AP
1962 lrvgr $s1,$s1 # flip byte order
1963 lrvgr $s3,$s3
609b0852 1964 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
0ab8fd58
AP
1965 stg $s1,$tweak+0($sp) # save the tweak
1966 llgfr $s1,$s1
1967 srlg $s2,$s3,32
1968 stg $s3,$tweak+8($sp)
1969 llgfr $s3,$s3
1970
1971 x $s0,0($out) # ^=*(inp)|stolen cipher-text
1972 x $s1,4($out)
1973 x $s2,8($out)
1974 x $s3,12($out)
1975 st${g} $out,4*$SIZE_T($sp) # NOTE(review): presumably the key register aliases the out register, hence this save - confirm
1976 la $key,0($key1)
1977 bras $ra,_s390x_AES_encrypt
1978 l${g} $out,4*$SIZE_T($sp) # bring back the output pointer saved above
1979 x $s0,`$tweak+0`($sp) # ^=tweak
1980 x $s1,`$tweak+4`($sp)
1981 x $s2,`$tweak+8`($sp)
1982 x $s3,`$tweak+12`($sp)
1983 st $s0,0($out) # write the final full block over the patched one
1984 st $s1,4($out)
1985 st $s2,8($out)
1986 st $s3,12($out)
1987
1988.Lxts_enc_done: # common exit of the software path
1989 stg $sp,$tweak+0($sp) # wipe tweak
1990 stg $sp,$tweak+8($sp) # second half too; the stored value is just the stack pointer
1991 lm${g} %r6,$ra,6*$SIZE_T($sp) # reload the registers saved at .Lxts_enc_software
1992 br $ra
1993.size AES_xts_encrypt,.-AES_xts_encrypt
1994___
96cce820
PS
1995# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
1996# size_t len, const AES_KEY *key1, const AES_KEY *key2,
c3cddeae 1997# const unsigned char iv[16]);
0ab8fd58
AP
1998#
1999$code.=<<___;
2000.globl AES_xts_decrypt
2001.type AES_xts_decrypt,\@function
2002.align 16
2003AES_xts_decrypt:
2004 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
2005 xgr %r4,%r3
2006 xgr %r3,%r4
2007___
2008$code.=<<___ if ($SIZE_T==4);
2009 llgfr $len,$len
2010___
2011$code.=<<___;
2012 st${g} $len,1*$SIZE_T($sp) # save copy of $len
2013 aghi $len,-16
2014 bcr 4,$ra # abort if less than zero. formally
2015 # wrong, because $len is unsigned,
2016 # but who can afford asking to
2017 # process more than 2^63-1 bytes?
2018 tmll $len,0x0f
2019 jnz .Lxts_dec_proceed
2020 aghi $len,16
2021.Lxts_dec_proceed:
2022___
2023$code.=<<___ if (!$softonly);
2024 llgf %r0,240($key2)
2025 lhi %r1,16
2026 clr %r0,%r1
2027 jl .Lxts_dec_software
874a3757 2028
8df400cf 2029 st${g} $ra,5*$SIZE_T($sp)
0ab8fd58 2030 stm${g} %r6,$s3,6*$SIZE_T($sp)
0ab8fd58
AP
2031
2032 nill $len,0xfff0 # $len&=~15
2033 slgr $out,$inp
2034
2035 # generate the tweak value
0c237e42 2036 l${g} $s3,$stdframe($sp) # pointer to iv
0ab8fd58 2037 la $s2,$tweak($sp)
0c237e42 2038 lmg $s0,$s1,0($s3)
0ab8fd58 2039 lghi $s3,16
0c237e42 2040 stmg $s0,$s1,0($s2)
0ab8fd58
AP
2041 la %r1,0($key2) # $key2 is not needed past this point
2042 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2043 brc 1,.-4 # can this happen?
2044
2045 l %r0,240($key1)
2046 la %r1,0($key1) # $key1 is not needed anymore
2047
2048 ltgr $len,$len
2049 jz .Lxts_dec_km_short
2050 bras $ra,_s390x_xts_km
2051 jz .Lxts_dec_km_done
2052
2053 lrvgr $s2,$s0 # make copy in reverse byte order
2054 lrvgr $s3,$s1
2055 j .Lxts_dec_km_2ndtweak
2056
2057.Lxts_dec_km_short:
2058 llgc $len,`2*$SIZE_T-1`($sp)
2059 nill $len,0x0f # $len%=16
2060 lrvg $s0,$tweak+0($sp) # load the tweak
2061 lrvg $s1,$tweak+8($sp)
2062 lrvgr $s2,$s0 # make copy in reverse byte order
2063 lrvgr $s3,$s1
2064
2065.Lxts_dec_km_2ndtweak:
2066 lghi $i1,0x87
2067 srag $i2,$s1,63 # broadcast upper bit
2068 ngr $i1,$i2 # rem
c3cddeae
AP
2069 algr $s0,$s0
2070 alcgr $s1,$s1
0ab8fd58 2071 xgr $s0,$i1
0ab8fd58
AP
2072 lrvgr $i1,$s0 # flip byte order
2073 lrvgr $i2,$s1
2074
2075 xg $i1,0($inp)
2076 xg $i2,8($inp)
2077 stg $i1,0($out,$inp)
2078 stg $i2,8($out,$inp)
2079 la $i2,0($out,$inp)
2080 lghi $i3,16
2081 .long 0xb92e0066 # km $i2,$i2
2082 brc 1,.-4 # can this happen?
2083 lrvgr $i1,$s0
2084 lrvgr $i2,$s1
2085 xg $i1,0($out,$inp)
2086 xg $i2,8($out,$inp)
2087 stg $i1,0($out,$inp)
2088 stg $i2,8($out,$inp)
2089
2090 la $i3,0($out,$inp) # put aside real $out
2091.Lxts_dec_km_steal:
2092 llgc $i1,16($inp)
2093 llgc $i2,0($out,$inp)
2094 stc $i1,0($out,$inp)
2095 stc $i2,16($out,$inp)
2096 la $inp,1($inp)
2097 brct $len,.Lxts_dec_km_steal
2098
2099 lgr $s0,$s2
2100 lgr $s1,$s3
2101 xg $s0,0($i3)
2102 xg $s1,8($i3)
2103 stg $s0,0($i3)
2104 stg $s1,8($i3)
2105 la $s0,0($i3)
2106 lghi $s1,16
2107 .long 0xb92e0088 # km $s0,$s0
2108 brc 1,.-4 # can this happen?
2109 xg $s2,0($i3)
2110 xg $s3,8($i3)
2111 stg $s2,0($i3)
2112 stg $s3,8($i3)
2113.Lxts_dec_km_done:
8df400cf
AP
2114 stg $sp,$tweak+0($sp) # wipe tweak
2115 stg $sp,$tweak+8($sp)
2116 l${g} $ra,5*$SIZE_T($sp)
0ab8fd58
AP
2117 lm${g} %r6,$s3,6*$SIZE_T($sp)
2118 br $ra
2119.align 16
2120.Lxts_dec_software:
2121___
2122$code.=<<___;
2123 stm${g} %r6,$ra,6*$SIZE_T($sp)
2124
2125 srlg $len,$len,4
2126 slgr $out,$inp
2127
c3cddeae
AP
2128 l${g} $s3,$stdframe($sp) # ivp
2129 llgf $s0,0($s3) # load iv
2130 llgf $s1,4($s3)
2131 llgf $s2,8($s3)
2132 llgf $s3,12($s3)
0ab8fd58
AP
2133 stm${g} %r2,%r5,2*$SIZE_T($sp)
2134 la $key,0($key2)
2135 larl $tbl,AES_Te
2136 bras $ra,_s390x_AES_encrypt # generate the tweak
2137 lm${g} %r2,%r5,2*$SIZE_T($sp)
2138 larl $tbl,AES_Td
2139 lt${g}r $len,$len
2140 stm $s0,$s3,$tweak($sp) # save the tweak
2141 jz .Lxts_dec_short
2142 j .Lxts_dec_enter
2143
2144.align 16
2145.Lxts_dec_loop:
2146 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2147 lrvg $s3,$tweak+8($sp)
2148 lghi %r1,0x87
2149 srag %r0,$s3,63 # broadcast upper bit
2150 ngr %r1,%r0 # rem
c3cddeae
AP
2151 algr $s1,$s1
2152 alcgr $s3,$s3
0ab8fd58 2153 xgr $s1,%r1
0ab8fd58
AP
2154 lrvgr $s1,$s1 # flip byte order
2155 lrvgr $s3,$s3
609b0852 2156 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
0ab8fd58
AP
2157 stg $s1,$tweak+0($sp) # save the tweak
2158 llgfr $s1,$s1
2159 srlg $s2,$s3,32
2160 stg $s3,$tweak+8($sp)
2161 llgfr $s3,$s3
2162.Lxts_dec_enter:
2163 x $s0,0($inp) # tweak^=*(inp)
2164 x $s1,4($inp)
2165 x $s2,8($inp)
2166 x $s3,12($inp)
2167 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2168 la $key,0($key1)
2169 bras $ra,_s390x_AES_decrypt
2170 lm${g} %r2,%r5,2*$SIZE_T($sp)
2171 x $s0,$tweak+0($sp) # ^=tweak
2172 x $s1,$tweak+4($sp)
2173 x $s2,$tweak+8($sp)
2174 x $s3,$tweak+12($sp)
2175 st $s0,0($out,$inp)
2176 st $s1,4($out,$inp)
2177 st $s2,8($out,$inp)
2178 st $s3,12($out,$inp)
874a3757 2179 la $inp,16($inp)
0ab8fd58
AP
2180 brct${g} $len,.Lxts_dec_loop
2181
2182 llgc $len,`2*$SIZE_T-1`($sp)
2183 nill $len,0x0f # $len%16
2184 jz .Lxts_dec_done
2185
2186 # generate pair of tweaks...
2187 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2188 lrvg $s3,$tweak+8($sp)
2189 lghi %r1,0x87
2190 srag %r0,$s3,63 # broadcast upper bit
2191 ngr %r1,%r0 # rem
c3cddeae
AP
2192 algr $s1,$s1
2193 alcgr $s3,$s3
0ab8fd58 2194 xgr $s1,%r1
0ab8fd58
AP
2195 lrvgr $i2,$s1 # flip byte order
2196 lrvgr $i3,$s3
2197 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2198 j .Lxts_dec_2ndtweak
2199
2200.align 16
2201.Lxts_dec_short:
2202 llgc $len,`2*$SIZE_T-1`($sp)
2203 nill $len,0x0f # $len%16
2204 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2205 lrvg $s3,$tweak+8($sp)
2206.Lxts_dec_2ndtweak:
2207 lghi %r1,0x87
2208 srag %r0,$s3,63 # broadcast upper bit
2209 ngr %r1,%r0 # rem
c3cddeae
AP
2210 algr $s1,$s1
2211 alcgr $s3,$s3
0ab8fd58 2212 xgr $s1,%r1
0ab8fd58
AP
2213 lrvgr $s1,$s1 # flip byte order
2214 lrvgr $s3,$s3
2215 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2216 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2217 llgfr $s1,$s1
2218 srlg $s2,$s3,32
2219 stg $s3,$tweak-16+8($sp)
2220 llgfr $s3,$s3
2221
2222 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2223 x $s1,4($inp)
2224 x $s2,8($inp)
2225 x $s3,12($inp)
2226 stm${g} %r2,%r3,2*$SIZE_T($sp)
2227 la $key,0($key1)
2228 bras $ra,_s390x_AES_decrypt
2229 lm${g} %r2,%r5,2*$SIZE_T($sp)
2230 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2231 x $s1,$tweak-16+4($sp)
2232 x $s2,$tweak-16+8($sp)
2233 x $s3,$tweak-16+12($sp)
2234 st $s0,0($out,$inp)
2235 st $s1,4($out,$inp)
2236 st $s2,8($out,$inp)
2237 st $s3,12($out,$inp)
874a3757 2238
0ab8fd58
AP
2239 la $i3,0($out,$inp) # put aside real $out
2240.Lxts_dec_steal:
2241 llgc %r0,16($inp)
2242 llgc %r1,0($out,$inp)
2243 stc %r0,0($out,$inp)
2244 stc %r1,16($out,$inp)
2245 la $inp,1($inp)
2246 brct $len,.Lxts_dec_steal
2247 la $out,0($i3) # restore real $out
2248
2249 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2250 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2251 x $s1,4($out)
2252 x $s2,8($out)
2253 x $s3,12($out)
2254 st${g} $out,4*$SIZE_T($sp)
2255 la $key,0($key1)
2256 bras $ra,_s390x_AES_decrypt
2257 l${g} $out,4*$SIZE_T($sp)
2258 x $s0,$tweak+0($sp) # ^=tweak
2259 x $s1,$tweak+4($sp)
2260 x $s2,$tweak+8($sp)
2261 x $s3,$tweak+12($sp)
2262 st $s0,0($out)
2263 st $s1,4($out)
2264 st $s2,8($out)
2265 st $s3,12($out)
2266 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2267 stg $sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)		# wipe tweak, bytes 0..7
	stg	$sp,$tweak+8($sp)		# wipe tweak, bytes 8..15
	lm${g}	%r6,$ra,6*$SIZE_T($sp)		# reload registers saved on entry
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
8626230a
AP
2274___
2275}
# Trailing identification string appended to the generated text.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every backtick-quoted span in the accumulated text with the
# result of eval-ing it as Perl, so that expressions such as
# 2*$SIZE_T-1 are reduced to their value before the text is printed.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors (full disk, closed pipe) only surface when the
# handle is closed, so a failed close must abort the script instead of
# silently leaving truncated output behind.
close STDOUT or die "error closing STDOUT: $!";	# force flush