]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/keccak1600-s390x.pl
Following the license change, modify the boilerplates in crypto/sha/
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-s390x.pl
1 #!/usr/bin/env perl
2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for s390x.
17 #
18 # June 2017.
19 #
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # In the nutshell it's transliteration of x86_64 module, because both
26 # architectures have similar capabilities/limitations. Performance
27 # measurement is problematic as I don't have access to an idle system.
28 # It looks like z13 processes one byte [out of long message] in ~14
29 # cycles. At least the result is consistent with estimate based on
30 # amount of instruction and assumed instruction issue rate. It's ~2.5x
31 # faster than compiler-generated code.
32
33 $flavour = shift;
34
35 if ($flavour =~ /3[12]/) {
36 $SIZE_T=4;
37 $g="";
38 } else {
39 $SIZE_T=8;
40 $g="g";
41 }
42
43 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
44 open STDOUT,">$output";
45
46 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
47
48 my @C = map("%r$_",(0,1,5..7));
49 my @D = map("%r$_",(8..12));
50 my @T = map("%r$_",(13..14));
51 my ($src,$dst,$iotas) = map("%r$_",(2..4));
52 my $sp = "%r15";
53
54 $stdframe=16*$SIZE_T+4*8;
55 $frame=$stdframe+25*8;
56
57 my @rhotates = ([ 0, 1, 62, 28, 27 ],
58 [ 36, 44, 6, 55, 20 ],
59 [ 3, 10, 43, 25, 39 ],
60 [ 41, 45, 15, 21, 8 ],
61 [ 18, 2, 61, 56, 14 ]);
62
63 { my @C = @C; # copy, because we mess them up...
64 my @D = @D;
65
66 $code.=<<___;
67 .text
68
69 .type __KeccakF1600,\@function
70 .align 32
71 __KeccakF1600:
72 st${g} %r14,$SIZE_T*14($sp)
73 lg @C[0],$A[4][0]($src)
74 lg @C[1],$A[4][1]($src)
75 lg @C[2],$A[4][2]($src)
76 lg @C[3],$A[4][3]($src)
77 lg @C[4],$A[4][4]($src)
78 larl $iotas,iotas
79 j .Loop
80
81 .align 16
82 .Loop:
83 lg @D[0],$A[0][0]($src)
84 lg @D[1],$A[1][1]($src)
85 lg @D[2],$A[2][2]($src)
86 lg @D[3],$A[3][3]($src)
87
88 xgr @C[0],@D[0]
89 xg @C[1],$A[0][1]($src)
90 xg @C[2],$A[0][2]($src)
91 xg @C[3],$A[0][3]($src)
92 lgr @D[4],@C[4]
93 xg @C[4],$A[0][4]($src)
94
95 xg @C[0],$A[1][0]($src)
96 xgr @C[1],@D[1]
97 xg @C[2],$A[1][2]($src)
98 xg @C[3],$A[1][3]($src)
99 xg @C[4],$A[1][4]($src)
100
101 xg @C[0],$A[2][0]($src)
102 xg @C[1],$A[2][1]($src)
103 xgr @C[2],@D[2]
104 xg @C[3],$A[2][3]($src)
105 xg @C[4],$A[2][4]($src)
106
107 xg @C[0],$A[3][0]($src)
108 xg @C[1],$A[3][1]($src)
109 xg @C[2],$A[3][2]($src)
110 xgr @C[3],@D[3]
111 xg @C[4],$A[3][4]($src)
112
113 lgr @T[0],@C[2]
114 rllg @C[2],@C[2],1
115 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
116
117 rllg @C[0],@C[0],1
118 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
119
120 rllg @C[3],@C[3],1
121 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
122
123 rllg @C[1],@C[1],1
124 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
125
126 rllg @C[4],@C[4],1
127 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
128 ___
129 (@D[0..4], @C) = (@C[1..4,0], @D);
130 $code.=<<___;
131 xgr @C[1],@D[1]
132 xgr @C[2],@D[2]
133 xgr @C[3],@D[3]
134 rllg @C[1],@C[1],$rhotates[1][1]
135 xgr @C[4],@D[4]
136 rllg @C[2],@C[2],$rhotates[2][2]
137 xgr @C[0],@D[0]
138
139 lgr @T[0],@C[1]
140 ogr @C[1],@C[2]
141 rllg @C[3],@C[3],$rhotates[3][3]
142 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
143 rllg @C[4],@C[4],$rhotates[4][4]
144 xg @C[1],0($iotas)
145 la $iotas,8($iotas)
146 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
147
148 lgr @T[1],@C[4]
149 ngr @C[4],@C[3]
150 lghi @C[1],-1 # no 'not' instruction :-(
151 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
152 xgr @C[2],@C[1] # not @C[2]
153 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
154 ogr @C[2],@C[3]
155 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
156
157 ngr @T[0],@C[0]
158 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
159 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
160 ogr @T[1],@C[0]
161 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
162 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
163 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
164
165
166 lg @C[0],$A[0][3]($src)
167 lg @C[4],$A[4][2]($src)
168 lg @C[3],$A[3][1]($src)
169 lg @C[1],$A[1][4]($src)
170 lg @C[2],$A[2][0]($src)
171
172 xgr @C[0],@D[3]
173 xgr @C[4],@D[2]
174 rllg @C[0],@C[0],$rhotates[0][3]
175 xgr @C[3],@D[1]
176 rllg @C[4],@C[4],$rhotates[4][2]
177 xgr @C[1],@D[4]
178 rllg @C[3],@C[3],$rhotates[3][1]
179 xgr @C[2],@D[0]
180
181 lgr @T[0],@C[0]
182 ogr @C[0],@C[4]
183 rllg @C[1],@C[1],$rhotates[1][4]
184 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
185 rllg @C[2],@C[2],$rhotates[2][0]
186 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
187
188 lgr @T[1],@C[1]
189 ngr @C[1],@T[0]
190 lghi @C[0],-1 # no 'not' instruction :-(
191 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
192 xgr @C[4],@C[0] # not @C[4]
193 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
194
195 ogr @C[4],@C[3]
196 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
197
198 ngr @C[3],@C[2]
199 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
200 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
201 ogr @T[1],@C[2]
202 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
203 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
204 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
205
206
207 lg @C[2],$A[2][3]($src)
208 lg @C[3],$A[3][4]($src)
209 lg @C[1],$A[1][2]($src)
210 lg @C[4],$A[4][0]($src)
211 lg @C[0],$A[0][1]($src)
212
213 xgr @C[2],@D[3]
214 xgr @C[3],@D[4]
215 rllg @C[2],@C[2],$rhotates[2][3]
216 xgr @C[1],@D[2]
217 rllg @C[3],@C[3],$rhotates[3][4]
218 xgr @C[4],@D[0]
219 rllg @C[1],@C[1],$rhotates[1][2]
220 xgr @C[0],@D[1]
221
222 lgr @T[0],@C[2]
223 ngr @C[2],@C[3]
224 rllg @C[4],@C[4],$rhotates[4][0]
225 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
226 lghi @T[1],-1 # no 'not' instruction :-(
227 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
228
229 xgr @C[3],@T[1] # not @C[3]
230 lgr @T[1],@C[4]
231 ngr @C[4],@C[3]
232 rllg @C[0],@C[0],$rhotates[0][1]
233 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
234 ogr @T[0],@C[1]
235 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
236 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
237
238 ngr @C[1],@C[0]
239 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
240 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
241 ogr @C[0],@T[1]
242 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
243 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
244 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
245
246
247 lg @C[2],$A[2][1]($src)
248 lg @C[3],$A[3][2]($src)
249 lg @C[1],$A[1][0]($src)
250 lg @C[4],$A[4][3]($src)
251 lg @C[0],$A[0][4]($src)
252
253 xgr @C[2],@D[1]
254 xgr @C[3],@D[2]
255 rllg @C[2],@C[2],$rhotates[2][1]
256 xgr @C[1],@D[0]
257 rllg @C[3],@C[3],$rhotates[3][2]
258 xgr @C[4],@D[3]
259 rllg @C[1],@C[1],$rhotates[1][0]
260 xgr @C[0],@D[4]
261 rllg @C[4],@C[4],$rhotates[4][3]
262
263 lgr @T[0],@C[2]
264 ogr @C[2],@C[3]
265 lghi @T[1],-1 # no 'not' instruction :-(
266 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
267 xgr @C[3],@T[1] # not @C[3]
268 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
269
270 lgr @T[1],@C[4]
271 ogr @C[4],@C[3]
272 rllg @C[0],@C[0],$rhotates[0][4]
273 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
274 ngr @T[0],@C[1]
275 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
276 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
277
278 ogr @C[1],@C[0]
279 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
280 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
281 ngr @C[0],@T[1]
282 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
283 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
284 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
285
286
287 xg @D[2],$A[0][2]($src)
288 xg @D[3],$A[1][3]($src)
289 xg @D[1],$A[4][1]($src)
290 xg @D[4],$A[2][4]($src)
291 xgr $dst,$src # xchg $dst,$src
292 rllg @D[2],@D[2],$rhotates[0][2]
293 xg @D[0],$A[3][0]($src)
294 rllg @D[3],@D[3],$rhotates[1][3]
295 xgr $src,$dst
296 rllg @D[1],@D[1],$rhotates[4][1]
297 xgr $dst,$src
298 rllg @D[4],@D[4],$rhotates[2][4]
299 ___
300 @C = @D[2..4,0,1];
301 $code.=<<___;
302 lgr @T[0],@C[0]
303 ngr @C[0],@C[1]
304 lghi @T[1],-1 # no 'not' instruction :-(
305 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
306 xgr @C[1],@T[1] # not @C[1]
307 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
308
309 lgr @T[1],@C[2]
310 ngr @C[2],@C[1]
311 rllg @D[0],@D[0],$rhotates[3][0]
312 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
313 ogr @T[0],@C[4]
314 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
315 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
316
317 ngr @C[4],@C[3]
318 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
319 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
320 ogr @C[3],@T[1]
321 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
322 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
323
324 lgr @C[1],@C[0] # harmonize with the loop top
325 lgr @C[0],@T[0]
326 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
327
328 tmll $iotas,255
329 jnz .Loop
330
331 l${g} %r14,$SIZE_T*14($sp)
332 br %r14
333 .size __KeccakF1600,.-__KeccakF1600
334 ___
335 }
336 {
337 $code.=<<___;
338 .type KeccakF1600,\@function
339 .align 32
340 KeccakF1600:
341 .LKeccakF1600:
342 lghi %r1,-$frame
343 stm${g} %r6,%r15,$SIZE_T*6($sp)
344 lgr %r0,$sp
345 la $sp,0(%r1,$sp)
346 st${g} %r0,0($sp)
347
348 lghi @D[0],-1 # no 'not' instruction :-(
349 lghi @D[1],-1
350 lghi @D[2],-1
351 lghi @D[3],-1
352 lghi @D[4],-1
353 lghi @T[0],-1
354 xg @D[0],$A[0][1]($src)
355 xg @D[1],$A[0][2]($src)
356 xg @D[2],$A[1][3]($src)
357 xg @D[3],$A[2][2]($src)
358 xg @D[4],$A[3][2]($src)
359 xg @T[0],$A[4][0]($src)
360 stmg @D[0],@D[1],$A[0][1]($src)
361 stg @D[2],$A[1][3]($src)
362 stg @D[3],$A[2][2]($src)
363 stg @D[4],$A[3][2]($src)
364 stg @T[0],$A[4][0]($src)
365
366 la $dst,$stdframe($sp)
367
368 bras %r14,__KeccakF1600
369
370 lghi @D[0],-1 # no 'not' instruction :-(
371 lghi @D[1],-1
372 lghi @D[2],-1
373 lghi @D[3],-1
374 lghi @D[4],-1
375 lghi @T[0],-1
376 xg @D[0],$A[0][1]($src)
377 xg @D[1],$A[0][2]($src)
378 xg @D[2],$A[1][3]($src)
379 xg @D[3],$A[2][2]($src)
380 xg @D[4],$A[3][2]($src)
381 xg @T[0],$A[4][0]($src)
382 stmg @D[0],@D[1],$A[0][1]($src)
383 stg @D[2],$A[1][3]($src)
384 stg @D[3],$A[2][2]($src)
385 stg @D[4],$A[3][2]($src)
386 stg @T[0],$A[4][0]($src)
387
388 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
389 br %r14
390 .size KeccakF1600,.-KeccakF1600
391 ___
392 }
393 { my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
394
395 $code.=<<___;
396 .globl SHA3_absorb
397 .type SHA3_absorb,\@function
398 .align 32
399 SHA3_absorb:
400 lghi %r1,-$frame
401 stm${g} %r5,%r15,$SIZE_T*5($sp)
402 lgr %r0,$sp
403 la $sp,0(%r1,$sp)
404 st${g} %r0,0($sp)
405
406 lghi @D[0],-1 # no 'not' instruction :-(
407 lghi @D[1],-1
408 lghi @D[2],-1
409 lghi @D[3],-1
410 lghi @D[4],-1
411 lghi @T[0],-1
412 xg @D[0],$A[0][1]($src)
413 xg @D[1],$A[0][2]($src)
414 xg @D[2],$A[1][3]($src)
415 xg @D[3],$A[2][2]($src)
416 xg @D[4],$A[3][2]($src)
417 xg @T[0],$A[4][0]($src)
418 stmg @D[0],@D[1],$A[0][1]($src)
419 stg @D[2],$A[1][3]($src)
420 stg @D[3],$A[2][2]($src)
421 stg @D[4],$A[3][2]($src)
422 stg @T[0],$A[4][0]($src)
423
424 .Loop_absorb:
425 cl${g}r $len,$bsz
426 jl .Ldone_absorb
427
428 srl${g} $bsz,3
429 la %r1,0($A_flat)
430
431 .Lblock_absorb:
432 lrvg %r0,0($inp)
433 la $inp,8($inp)
434 xg %r0,0(%r1)
435 a${g}hi $len,-8
436 stg %r0,0(%r1)
437 la %r1,8(%r1)
438 brct $bsz,.Lblock_absorb
439
440 stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
441 la $dst,$stdframe($sp)
442 bras %r14,__KeccakF1600
443 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
444 j .Loop_absorb
445
446 .align 16
447 .Ldone_absorb:
448 lghi @D[0],-1 # no 'not' instruction :-(
449 lghi @D[1],-1
450 lghi @D[2],-1
451 lghi @D[3],-1
452 lghi @D[4],-1
453 lghi @T[0],-1
454 xg @D[0],$A[0][1]($src)
455 xg @D[1],$A[0][2]($src)
456 xg @D[2],$A[1][3]($src)
457 xg @D[3],$A[2][2]($src)
458 xg @D[4],$A[3][2]($src)
459 xg @T[0],$A[4][0]($src)
460 stmg @D[0],@D[1],$A[0][1]($src)
461 stg @D[2],$A[1][3]($src)
462 stg @D[3],$A[2][2]($src)
463 stg @D[4],$A[3][2]($src)
464 stg @T[0],$A[4][0]($src)
465
466 lgr %r2,$len # return value
467
468 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
469 br %r14
470 .size SHA3_absorb,.-SHA3_absorb
471 ___
472 }
473 { my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
474
475 $code.=<<___;
476 .globl SHA3_squeeze
477 .type SHA3_squeeze,\@function
478 .align 32
479 SHA3_squeeze:
480 srl${g} $bsz,3
481 st${g} %r14,2*$SIZE_T($sp)
482 lghi %r14,8
483 st${g} $bsz,5*$SIZE_T($sp)
484 la %r1,0($A_flat)
485
486 j .Loop_squeeze
487
488 .align 16
489 .Loop_squeeze:
490 cl${g}r $len,%r14
491 jl .Ltail_squeeze
492
493 lrvg %r0,0(%r1)
494 la %r1,8(%r1)
495 stg %r0,0($out)
496 la $out,8($out)
497 a${g}hi $len,-8 # len -= 8
498 jz .Ldone_squeeze
499
500 brct $bsz,.Loop_squeeze # bsz--
501
502 stm${g} $out,$len,3*$SIZE_T($sp)
503 bras %r14,.LKeccakF1600
504 lm${g} $out,$bsz,3*$SIZE_T($sp)
505 lghi %r14,8
506 la %r1,0($A_flat)
507 j .Loop_squeeze
508
509 .Ltail_squeeze:
510 lg %r0,0(%r1)
511 .Loop_tail_squeeze:
512 stc %r0,0($out)
513 la $out,1($out)
514 srlg %r0,8
515 brct $len,.Loop_tail_squeeze
516
517 .Ldone_squeeze:
518 l${g} %r14,2*$SIZE_T($sp)
519 br %r14
520 .size SHA3_squeeze,.-SHA3_squeeze
521 ___
522 }
523 $code.=<<___;
524 .align 256
525 .quad 0,0,0,0,0,0,0,0
526 .type iotas,\@object
527 iotas:
528 .quad 0x0000000000000001
529 .quad 0x0000000000008082
530 .quad 0x800000000000808a
531 .quad 0x8000000080008000
532 .quad 0x000000000000808b
533 .quad 0x0000000080000001
534 .quad 0x8000000080008081
535 .quad 0x8000000000008009
536 .quad 0x000000000000008a
537 .quad 0x0000000000000088
538 .quad 0x0000000080008009
539 .quad 0x000000008000000a
540 .quad 0x000000008000808b
541 .quad 0x800000000000008b
542 .quad 0x8000000000008089
543 .quad 0x8000000000008003
544 .quad 0x8000000000008002
545 .quad 0x8000000000000080
546 .quad 0x000000000000800a
547 .quad 0x800000008000000a
548 .quad 0x8000000080008081
549 .quad 0x8000000000008080
550 .quad 0x0000000080000001
551 .quad 0x8000000080008008
552 .size iotas,.-iotas
553 .asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
554 ___
555
556 # unlike 32-bit shift 64-bit one takes three arguments
557 $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
558
559 print $code;
560 close STDOUT;