#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Keccak-1600 for x86_64.
#
# June 2017.
#
# Below code is a [lane-complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
# instead of actually unrolling the loop pair-wise I simply flip the
# pointers to T[][] and A[][] at the end of each round. Since the number
# of rounds is even, the last round writes to A[][] and everything works
# out. How does it compare to the x86_64 assembly module in the Keccak
# Code Package? Depending on processor it's either as fast, or faster by
# up to 15%...
#
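# Roughly, in the notation of sha/keccak1600.c, the schedule is
#
#	for (i = 0; i < 24; i += 2) {
#		Round(T, A, i);		# read A[][], write T[][]
#		Round(A, T, i+1);	# read T[][], write A[][]
#	}
#
# except that both calls are the same loop body with %rsi and %rdi
# exchanged at the end of each round (a sketch of the idea, not the
# literal C interface).
#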
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in
#	comparison to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can
#	be improved by 14% by replacing rotates with a double-precision
#	shift with the same register as source and destination.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

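# Illustrative cross-check (disabled): the table above is the standard
# Keccak rho schedule, where the nonzero offset at row y, column x is
# (t+1)*(t+2)/2 mod 64 at step t of the trajectory (x,y) <- (y, 2x+3y
# mod 5) starting from (1,0). A sanity sketch, not part of the module;
# flip the guard to 1 to verify at build time.
if (0) {
	my @r; $r[0][0] = 0;
	my ($x,$y) = (1,0);
	for my $t (0..23) {
		$r[$y][$x] = ($t+1)*($t+2)/2 % 64;
		($x,$y) = ($y, (2*$x+3*$y) % 5);
	}
	for my $i (0..4) { for my $j (0..4) {
		die "rho mismatch at [$i][$j]" if $r[$i][$j] != $rhotates[$i][$j];
	} }
}
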
$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],@C[0]
	xor	$A[0][1](%rdi),@C[1]
	xor	$A[1][2](%rdi),@C[2]
	xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],@C[2]
	xor	$A[2][0](%rdi),@C[0]
	xor	$A[1][3](%rdi),@C[3]
	xor	@D[1],@C[1]
	xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	xor	$A[2][3](%rdi),@C[3]
	xor	$A[2][1](%rdi),@C[1]
	xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	xor	@D[3],@C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
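# Theta is now complete: C[x] = A[x][0]^A[x][1]^A[x][2]^A[x][3]^A[x][4]
# and D[x] = ROL64(C[x+1],1) ^ C[x-1] (indices mod 5), as the per-line
# comments above spell out. The rename below relabels the registers so
# that @D[] holds the five D[] values just computed, while @C[] picks up
# the diagonal lanes A[x][x] that were loaded at the top of the loop.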
(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	or	@C[2],@C[1]
	xor	@C[0],@C[1]		# C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	xor	($iotas),@C[1]
	lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		# C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		# C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		# C[4] ^ ( C[1] & C[0])
	mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		# C[3] ^ ( C[4] | C[0])
	mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	mov	@C[0],@T[0]
	or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		# C[3] ^ (C[0] | C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] | C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		# C[4] ^ (C[1] & C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] & C[0])

	or	@C[3],@C[4]
	mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		# C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		# C[1] ^ (C[3] & C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] & C[2])

	or	@C[2],@T[1]
	mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		# C[0] ^ (C[1] | C[2])
	mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] | C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	mov	@C[2],@T[0]
	and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		# C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] = C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		# C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] = C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		# C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] = C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		# C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		# ~C[3] ^ ( C[0] | C[4])
	mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	mov	@C[2],@T[0]
	or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		# C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] = C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		# C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] = C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		# C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] = C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		# C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] = C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
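# The fifth output row was accumulated in @D[], so the rename below
# relabels @D[2..4,0,1] as @C[0..4] and the same chi pattern is reused;
# note that %rsi and %rdi were exchanged above, so this row's stores
# (and the next round's reads) target the buffer written this round.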
@C = @D[2..4,0,1];
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		# C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		# C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		# C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		# C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		# ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.size	__KeccakF1600,.-__KeccakF1600

.globl	KeccakF1600
.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___
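# The C-side view of this entry point (cf. sha/keccak1600.c) is,
# roughly, void KeccakF1600(uint64_t A[5][5]). The wrapper complements
# the six lanes A[0][1], A[0][2], A[1][3], A[2][2], A[3][2], A[4][0] on
# entry and again on exit, so the caller always sees the plain state
# while __KeccakF1600 operates on the lane-complemented representation.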

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
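# C-side view (cf. sha/keccak1600.c), roughly:
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t r);
#
# Whole r-byte blocks are XORed into the state, each followed by a
# permutation call; the return value is the leftover 0 <= len < r that
# didn't fill a block, which the caller carries over to the next call.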
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8,%rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
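# C-side view (cf. sha/keccak1600.c), roughly:
#
#	void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#	                  size_t len, size_t r);
#
# Output is copied a lane at a time, with KeccakF1600 invoked whenever
# a whole r-byte block has been emitted and more output is wanted; a
# sub-lane tail is handled by the rep movsb at .Ltail_squeeze.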
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
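# Illustrative cross-check (disabled): the 24 iota constants above come
# from the Keccak LFSR x^8+x^6+x^5+x^4+1, with round constant i taking
# bit 2^j-1 from LFSR output rc(7*i+j) for j=0..6. A sketch assuming a
# 64-bit perl; flip the guard to 1 to spot-check against the table.
if (0) {
	my @rc = (1);			# rc(0) = 1
	my $R = 1;
	for (1..7*24) {
		$R <<= 1;
		$R ^= 0x171 if ($R & 0x100);	# reduce modulo x^8+x^6+x^5+x^4+1
		push @rc, $R & 1;
	}
	my @iota;
	for my $i (0..23) {
		$iota[$i] = 0;
		$iota[$i] |= $rc[7*$i+$_] << ((1<<$_)-1) for (0..6);
	}
	die "iota mismatch" if $iota[0] != 0x0000000000000001
			    or $iota[23] != 0x8000000080008008;
}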

foreach (split("\n",$code)) {
	# The replacement below results in 11.2 on Sandy Bridge and 9.4
	# on Haswell, but it hurts other processors by up to 2-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# The replacement below results in 9.3 on Haswell [as well as
	# on Ryzen, i.e. it *hurts* Ryzen]...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT;