]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/keccak1600-ppc64.pl
Update copyright year
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-ppc64.pl
1 #!/usr/bin/env perl
2 # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for PPC64.
17 #
18 # June 2017.
19 #
20 # This is a straightforward KECCAK_1X_ALT implementation that works on
21 # *any* PPC64. PowerISA 2.07 adds a 2x64-bit vector rotate, which makes
22 # it possible to achieve better performance than shown below, but that
23 # is naturally an option only for POWER8 and its successors...
24 #
25 ######################################################################
26 # Numbers are cycles per processed byte.
27 #
28 # r=1088(*)
29 #
30 # PPC970/G5 14.0/+130%
31 # POWER7 9.7/+110%
32 # POWER8 10.6/+100%
33 # POWER9 8.2/+66%
34 #
35 # (*) Corresponds to SHA3-256. The percentage after the slash is the
36 # improvement over gcc-4.x-generated KECCAK_1X_ALT code. Newer
37 # compilers do much better (but watch out for them generating code
38 # specific to the processor they execute on).
39
# Command line: [flavour] [output]
#   $output  - popped off the end of @ARGV when the last argument ends in
#              a dot followed by word characters (it has an extension);
#   $flavour - shifted off the front of @ARGV when the first argument
#              contains no dot at all.
# Either one stays undefined when its pattern does not match.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# Only flavours with "64" in their name are accepted.
die "nonsense $flavour" unless ($flavour =~ /64/);

$SIZE_T = 8;			# bytes per stack slot
$LRSAVE = 2*$SIZE_T;		# LR slot, addressed as $FRAME+$LRSAVE below
($UCMP, $STU, $POP, $PUSH) = ("cmpld", "stdu", "ld", "std");

# Directory part of the script path, including the trailing separator;
# undefined when the script was invoked without any directory component.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;

# Look for the translator next to this script first, then in the shared
# perlasm directory two levels up.
my $located = 0;
foreach my $path ("${dir}ppc-xlate.pl", "${dir}../../perlasm/ppc-xlate.pl") {
    $xlate = $path;
    if (-f $xlate) {
        $located = 1;
        last;
    }
}
die "can't locate ppc-xlate.pl" unless $located;

# From here on everything printed to STDOUT is piped through the translator.
open(STDOUT, "| $^X $xlate $flavour \"$output\"")
    || die "can't call $xlate: $!";
61
# Stack frame used by KeccakF1600 and SHA3_absorb, as byte offsets from
# the stack pointer once the frame has been pushed:
#   [0, $LOCALS)         not touched by this module (presumably the ABI
#                        linkage area - TODO confirm)
#   [$LOCALS, $TEMP)     six $SIZE_T-sized slots: 0 = state pointer,
#                        1 = inp, 2 = len, 3 = bsz, 4 = round-constant cursor
#   [$TEMP, $TEMP+32)    spill area for A[0..3][4] during Theta
#   top 18*$SIZE_T       save area for r14..r31
$LOCALS = 6*$SIZE_T;
$TEMP   = $LOCALS + 6*$SIZE_T;
$FRAME  = 24*$SIZE_T + 6*$SIZE_T + 32;

my $sp = "r1";

# The 5x5 state is kept in 25 registers, five consecutive registers per
# row starting at r7, r12, r17, r22 and r27.
my @A;
foreach my $row (0..4) {
    my $first = 7 + 5*$row;
    push @A, [ map { "r" . ($first + $_) } (0..4) ];
}
$A[1][1] = "r6";	# r13 is reserved

# Scratch registers.
my @C = ("r0", "r3", "r4", "r5");

# Rotation amounts referenced by the Rho+Pi step, indexed [row][column].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
79
# KeccakF1600_int -- the round loop of the permutation.  CTR is loaded with
# 24 and the body between .Loop and bdnz runs that many times.
#
# The whole 5x5 state stays in the registers named by @A for the duration
# of the loop; @C supplies four scratch registers.  The routine pushes no
# stack frame of its own and never touches LR: it addresses $TEMP and
# $LOCALS relative to the $sp it was entered with, so the caller's frame
# has to provide both areas.
#
# Contract visible in the code below:
#   - the doubleword at $LOCALS+4*$SIZE_T holds a pointer that is advanced
#     by 8 with ldu before every round-constant fetch, so on entry it must
#     point 8 bytes before the constant of the first round; the advanced
#     pointer is stored back each round and has therefore moved on by
#     24*8 bytes on return;
#   - r0, r3, r4, r5 (@C) and CTR are clobbered.
#
# Each iteration is laid out as Theta, then Rho+Pi, then Chi+Iota, with
# the few memory accesses interleaved into the ALU stream.
80 $code.=<<___;
81 .text
82
83 .type KeccakF1600_int,\@function
84 .align 5
85 KeccakF1600_int:
86 li r0,24
87 mtctr r0
88 b .Loop
89 .align 4
90 .Loop:
91 xor $C[0],$A[0][0],$A[1][0] ; Theta
92 std $A[0][4],`$TEMP+0`($sp)
93 xor $C[1],$A[0][1],$A[1][1]
94 std $A[1][4],`$TEMP+8`($sp)
95 xor $C[2],$A[0][2],$A[1][2]
96 std $A[2][4],`$TEMP+16`($sp)
97 xor $C[3],$A[0][3],$A[1][3]
98 std $A[3][4],`$TEMP+24`($sp)
99 ___
# The four stores above parked A[0..3][4] in $TEMP, so until they are
# reloaded their registers double as four extra scratch registers,
# C[4..7].  A[4][4] is not spilled and stays live in its register.
100 $C[4]=$A[0][4];
101 $C[5]=$A[1][4];
102 $C[6]=$A[2][4];
103 $C[7]=$A[3][4];
104 $code.=<<___;
105 xor $C[4],$A[0][4],$A[1][4]
106 xor $C[0],$C[0],$A[2][0]
107 xor $C[1],$C[1],$A[2][1]
108 xor $C[2],$C[2],$A[2][2]
109 xor $C[3],$C[3],$A[2][3]
110 xor $C[4],$C[4],$A[2][4]
111 xor $C[0],$C[0],$A[3][0]
112 xor $C[1],$C[1],$A[3][1]
113 xor $C[2],$C[2],$A[3][2]
114 xor $C[3],$C[3],$A[3][3]
115 xor $C[4],$C[4],$A[3][4]
116 xor $C[0],$C[0],$A[4][0]
117 xor $C[2],$C[2],$A[4][2]
118 xor $C[1],$C[1],$A[4][1]
119 xor $C[3],$C[3],$A[4][3]
120 rotldi $C[5],$C[2],1
121 xor $C[4],$C[4],$A[4][4]
122 rotldi $C[6],$C[3],1
123 xor $C[5],$C[5],$C[0]
124 rotldi $C[7],$C[4],1
125
126 xor $A[0][1],$A[0][1],$C[5]
127 xor $A[1][1],$A[1][1],$C[5]
128 xor $A[2][1],$A[2][1],$C[5]
129 xor $A[3][1],$A[3][1],$C[5]
130 xor $A[4][1],$A[4][1],$C[5]
131
132 rotldi $C[5],$C[0],1
133 xor $C[6],$C[6],$C[1]
134 xor $C[2],$C[2],$C[7]
135 rotldi $C[7],$C[1],1
136 xor $C[3],$C[3],$C[5]
137 xor $C[4],$C[4],$C[7]
138
139 xor $C[1], $A[0][2],$C[6] ;mr $C[1],$A[0][2]
140 xor $A[1][2],$A[1][2],$C[6]
141 xor $A[2][2],$A[2][2],$C[6]
142 xor $A[3][2],$A[3][2],$C[6]
143 xor $A[4][2],$A[4][2],$C[6]
144
145 xor $A[0][0],$A[0][0],$C[4]
146 xor $A[1][0],$A[1][0],$C[4]
147 xor $A[2][0],$A[2][0],$C[4]
148 xor $A[3][0],$A[3][0],$C[4]
149 xor $A[4][0],$A[4][0],$C[4]
150 ___
# The next chunk opens by reloading A[0..3][4] from $TEMP, which ends the
# life of the C[4..7] aliases; they are dropped here and are not referenced
# again in the rest of the routine.
151 $C[4]=undef;
152 $C[5]=undef;
153 $C[6]=undef;
154 $C[7]=undef;
155 $code.=<<___;
156 ld $A[0][4],`$TEMP+0`($sp)
157 xor $C[0], $A[0][3],$C[2] ;mr $C[0],$A[0][3]
158 ld $A[1][4],`$TEMP+8`($sp)
159 xor $A[1][3],$A[1][3],$C[2]
160 ld $A[2][4],`$TEMP+16`($sp)
161 xor $A[2][3],$A[2][3],$C[2]
162 ld $A[3][4],`$TEMP+24`($sp)
163 xor $A[3][3],$A[3][3],$C[2]
164 xor $A[4][3],$A[4][3],$C[2]
165
166 xor $C[2], $A[0][4],$C[3] ;mr $C[2],$A[0][4]
167 xor $A[1][4],$A[1][4],$C[3]
168 xor $A[2][4],$A[2][4],$C[3]
169 xor $A[3][4],$A[3][4],$C[3]
170 xor $A[4][4],$A[4][4],$C[3]
171
172 mr $C[3],$A[0][1] ; Rho+Pi
173 rotldi $A[0][1],$A[1][1],$rhotates[1][1]
174 ;mr $C[1],$A[0][2]
175 rotldi $A[0][2],$A[2][2],$rhotates[2][2]
176 ;mr $C[0],$A[0][3]
177 rotldi $A[0][3],$A[3][3],$rhotates[3][3]
178 ;mr $C[2],$A[0][4]
179 rotldi $A[0][4],$A[4][4],$rhotates[4][4]
180
181 rotldi $A[1][1],$A[1][4],$rhotates[1][4]
182 rotldi $A[2][2],$A[2][3],$rhotates[2][3]
183 rotldi $A[3][3],$A[3][2],$rhotates[3][2]
184 rotldi $A[4][4],$A[4][1],$rhotates[4][1]
185
186 rotldi $A[1][4],$A[4][2],$rhotates[4][2]
187 rotldi $A[2][3],$A[3][4],$rhotates[3][4]
188 rotldi $A[3][2],$A[2][1],$rhotates[2][1]
189 rotldi $A[4][1],$A[1][3],$rhotates[1][3]
190
191 rotldi $A[4][2],$A[2][4],$rhotates[2][4]
192 rotldi $A[3][4],$A[4][3],$rhotates[4][3]
193 rotldi $A[2][1],$A[1][2],$rhotates[1][2]
194 rotldi $A[1][3],$A[3][1],$rhotates[3][1]
195
196 rotldi $A[2][4],$A[4][0],$rhotates[4][0]
197 rotldi $A[4][3],$A[3][0],$rhotates[3][0]
198 rotldi $A[1][2],$A[2][0],$rhotates[2][0]
199 rotldi $A[3][1],$A[1][0],$rhotates[1][0]
200
201 rotldi $A[1][0],$C[0],$rhotates[0][3]
202 rotldi $A[2][0],$C[3],$rhotates[0][1]
203 rotldi $A[3][0],$C[2],$rhotates[0][4]
204 rotldi $A[4][0],$C[1],$rhotates[0][2]
205
206 andc $C[0],$A[0][2],$A[0][1] ; Chi+Iota
207 andc $C[1],$A[0][3],$A[0][2]
208 andc $C[2],$A[0][0],$A[0][4]
209 andc $C[3],$A[0][1],$A[0][0]
210 xor $A[0][0],$A[0][0],$C[0]
211 andc $C[0],$A[0][4],$A[0][3]
212 xor $A[0][1],$A[0][1],$C[1]
213 ld $C[1],`$LOCALS+4*$SIZE_T`($sp)
214 xor $A[0][3],$A[0][3],$C[2]
215 xor $A[0][4],$A[0][4],$C[3]
216 xor $A[0][2],$A[0][2],$C[0]
217 ldu $C[3],8($C[1]) ; Iota[i++]
218
219 andc $C[0],$A[1][2],$A[1][1]
220 std $C[1],`$LOCALS+4*$SIZE_T`($sp)
221 andc $C[1],$A[1][3],$A[1][2]
222 andc $C[2],$A[1][0],$A[1][4]
223 xor $A[0][0],$A[0][0],$C[3] ; A[0][0] ^= Iota
224 andc $C[3],$A[1][1],$A[1][0]
225 xor $A[1][0],$A[1][0],$C[0]
226 andc $C[0],$A[1][4],$A[1][3]
227 xor $A[1][1],$A[1][1],$C[1]
228 xor $A[1][3],$A[1][3],$C[2]
229 xor $A[1][4],$A[1][4],$C[3]
230 xor $A[1][2],$A[1][2],$C[0]
231
232 andc $C[0],$A[2][2],$A[2][1]
233 andc $C[1],$A[2][3],$A[2][2]
234 andc $C[2],$A[2][0],$A[2][4]
235 andc $C[3],$A[2][1],$A[2][0]
236 xor $A[2][0],$A[2][0],$C[0]
237 andc $C[0],$A[2][4],$A[2][3]
238 xor $A[2][1],$A[2][1],$C[1]
239 xor $A[2][3],$A[2][3],$C[2]
240 xor $A[2][4],$A[2][4],$C[3]
241 xor $A[2][2],$A[2][2],$C[0]
242
243 andc $C[0],$A[3][2],$A[3][1]
244 andc $C[1],$A[3][3],$A[3][2]
245 andc $C[2],$A[3][0],$A[3][4]
246 andc $C[3],$A[3][1],$A[3][0]
247 xor $A[3][0],$A[3][0],$C[0]
248 andc $C[0],$A[3][4],$A[3][3]
249 xor $A[3][1],$A[3][1],$C[1]
250 xor $A[3][3],$A[3][3],$C[2]
251 xor $A[3][4],$A[3][4],$C[3]
252 xor $A[3][2],$A[3][2],$C[0]
253
254 andc $C[0],$A[4][2],$A[4][1]
255 andc $C[1],$A[4][3],$A[4][2]
256 andc $C[2],$A[4][0],$A[4][4]
257 andc $C[3],$A[4][1],$A[4][0]
258 xor $A[4][0],$A[4][0],$C[0]
259 andc $C[0],$A[4][4],$A[4][3]
260 xor $A[4][1],$A[4][1],$C[1]
261 xor $A[4][3],$A[4][3],$C[2]
262 xor $A[4][4],$A[4][4],$C[3]
263 xor $A[4][2],$A[4][2],$C[0]
264
265 bdnz .Loop
266
267 blr
268 .long 0
269 .byte 0,12,0x14,0,0,0,0,0
270 .size KeccakF1600_int,.-KeccakF1600_int
271
___
# KeccakF1600 -- run KeccakF1600_int on a state that lives in memory.
#
#   in:  r3 = address of the 25 64-bit lanes, stored row by row starting
#             with A[0][0]
#   out: the same 25 lanes, overwritten in place
#
# The routine pushes a $FRAME-byte frame, preserves r14-r31 and LR, keeps
# the state pointer in $LOCALS slot 0 (r3 is one of the scratch registers
# of the round code) and the round-constant cursor in slot 4 (read and
# updated by KeccakF1600_int), loads the state into the @A registers,
# runs the rounds and stores the registers back.
#
# The repetitive parts (register save/restore, lane load/store) are
# generated by the loops below; the emitted instructions and their order
# are the same as a hand-written listing would give.
$code.=<<___;
.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
___
# r14 goes to $FRAME-18*$SIZE_T, ..., r31 to $FRAME-1*$SIZE_T
$code.="\t$PUSH\tr$_,`$FRAME-$SIZE_T*(32-$_)`($sp)\n" foreach (14..31);
$code.=<<___;
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

___
# load A[5][5]: lane [i][j] sits at byte offset 8*(5*i+j)
foreach my $i (0..4) {
    foreach my $j (0..4) {
        $code.="\tld\t$A[$i][$j],`8*(5*$i+$j)`(r3)\n";
    }
}
$code.=<<___;

	bl	KeccakF1600_int

	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
___
# return A[5][5]
foreach my $i (0..4) {
    foreach my $j (0..4) {
        $code.="\tstd\t$A[$i][$j],`8*(5*$i+$j)`(r3)\n";
    }
}
$code.=<<___;

	$POP	r0,`$FRAME+$LRSAVE`($sp)
___
$code.="\t$POP\tr$_,`$FRAME-$SIZE_T*(32-$_)`($sp)\n" foreach (14..31);
$code.=<<___;
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
387
___
# dword_le_load -- assemble eight consecutive bytes into one 64-bit value,
# first byte least significant, using byte loads only.
#
#   in:  r3 = address one byte *before* the data (the bytes are read from
#             offsets 1..8)
#   out: r0 = the assembled value
#        r3 advanced by 8 (the last load is an lbzu)
#   clobbers r4 and r5
#
# Byte 0 is loaded straight into r0.  Bytes 1..7 alternate between r4
# (odd) and r5 (even) and are inserted with "insrdi r0,rX,8,56-8*byte".
# Loads run two bytes ahead of the inserts, so every pass of the loop
# below emits the insert for one byte followed by the load for the byte
# two positions later into the register that has just been freed.
$code.=<<___;
.type	dword_le_load,\@function
.align	5
dword_le_load:
	lbz	r0,1(r3)
	lbz	r4,2(r3)
	lbz	r5,3(r3)
___
foreach my $byte (1..5) {
    my $tmp  = ($byte & 1)  ? "r4"   : "r5";
    my $load = ($byte == 5) ? "lbzu" : "lbz";	# final load also bumps r3
    $code.="\tinsrdi\tr0,$tmp,8,".(56-8*$byte)."\n";
    $code.="\t$load\t$tmp,".($byte+3)."(r3)\n";
}
$code.=<<___;
	insrdi	r0,r5,8,8
	insrdi	r0,r4,8,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	dword_le_load,.-dword_le_load
411
___
# SHA3_absorb -- xor whole blocks of input into the state and permute.
#
#   in:  r3 = address of the 25-lane state (A[][])
#        r4 = inp, r5 = len, r6 = bsz
#   out: r3 = number of input bytes left over (the loop stops as soon as
#             len < bsz); the state is written back to memory
#
# While len >= bsz: bsz/8 is put into CTR, one little-endian doubleword
# per lane is fetched with dword_le_load and xor-ed into A[0][0], A[0][1],
# ... in row order until CTR runs out, then KeccakF1600_int is run.
# NOTE(review): bsz is presumably a multiple of 8 no larger than 200 (the
# ladder offers 25 lanes) - confirm against the callers.
#
# Register pressure explains the shuffling around the lane loads:
#   - PICmeup hands back the constant table in r12 and bsz arrives in r6,
#     but both registers belong to @A, so they are saved to $LOCALS
#     (and bsz copied to r0) before the state is loaded;
#   - dword_le_load clobbers r4/r5 and KeccakF1600_int clobbers r0, r3-r5,
#     hence len, bsz and inp are parked in $LOCALS slots 2, 3 and 1 and
#     pulled back after every block;
#   - KeccakF1600_int leaves the constant cursor 24 entries further on,
#     so it is rewound by 8*24 bytes before the next block.  addic is
#     used for that, presumably because addi would read r0 as constant 0.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
___
# r14 goes to $FRAME-18*$SIZE_T, ..., r31 to $FRAME-1*$SIZE_T
$code.="\t$PUSH\tr$_,`$FRAME-$SIZE_T*(32-$_)`($sp)\n" foreach (14..31);
$code.=<<___;
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r4,r4,1				; prepare for lbzu
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
	mr	r0,r6
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

___
# load A[5][5]: lane [i][j] sits at byte offset 8*(5*i+j)
foreach my $i (0..4) {
    foreach my $j (0..4) {
        $code.="\tld\t$A[$i][$j],`8*(5*$i+$j)`(r3)\n";
    }
}
$code.=<<___;

	mr	r3,r4
	mr	r4,r5
	mr	r5,r0

	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	r4,r5				; len < bsz?
	blt	.Labsorbed

	sub	r4,r4,r5			; len -= bsz
	srwi	r5,r5,3
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
	mtctr	r5
___
# One rung per lane, in row order.  Every rung but the last is followed
# by a bdz that leaves the ladder once bsz/8 lanes have been absorbed;
# after the 25th lane control simply falls through.
foreach my $lane (0..24) {
    my $reg = $A[int($lane/5)][$lane%5];
    $code.="\tbl\tdword_le_load\t\t\t; *inp++\n";
    $code.="\txor\t$reg,$reg,r0\n";
    $code.="\tbdz\t.Lprocess_block\n" if ($lane < 24);
}
$code.=<<___;

.Lprocess_block:
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp

	bl	KeccakF1600_int

	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
	addic	r0,r0,`-8*24`			; rewind iotas
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)

	b	.Loop_absorb

.align	4
.Labsorbed:
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
___
# return A[5][5]
foreach my $i (0..4) {
    foreach my $j (0..4) {
        $code.="\tstd\t$A[$i][$j],`8*(5*$i+$j)`(r3)\n";
    }
}
$code.=<<___;

	mr	r3,r4				; return value
	$POP	r0,`$FRAME+$LRSAVE`($sp)
___
$code.="\t$POP\tr$_,`$FRAME-$SIZE_T*(32-$_)`($sp)\n" foreach (14..31);
$code.=<<___;
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze -- copy bytes of the state to the output buffer, running
# KeccakF1600 every time a whole block has been handed out.
#
#   in:  r3 = address of the 25-lane state
#        r4 = out   destination buffer
#        r5 = len   number of bytes to produce
#        r6 = bsz   number of state bytes to hand out between permutations
#
# The four arguments are kept in r28-r31 across the KeccakF1600 calls;
# r3 walks over the lanes (ldu) and r6 counts down the bytes left in the
# current block.  Each lane is written least-significant byte first.
# A final partial lane (len < 8) is finished by the byte-wise tail loop.
#
# A zero len returns immediately without storing anything.  Without that
# check control would reach the tail loop with CTR = 0, and since bdnz
# decrements before it tests, the loop would not stop until CTR had
# wrapped all the way around, storing a byte on every pass.
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	$STU	$sp,`-10*$SIZE_T`($sp)
	mflr	r0
	$PUSH	r28,`6*$SIZE_T`($sp)
	$PUSH	r29,`7*$SIZE_T`($sp)
	$PUSH	r30,`8*$SIZE_T`($sp)
	$PUSH	r31,`9*$SIZE_T`($sp)
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)

	mr	$A_flat,r3
	subi	r3,r3,8			; prepare for ldu
	subi	$out,r4,1		; prepare for stbu
	mr	$len,r5
	mr	$bsz,r6
	${UCMP}i	$len,0			; nothing requested?
	beq	.Lsqueeze_done		; leave before the tail loop sees CTR=0
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldu	r0,8(r3)
	${UCMP}i	$len,8
	blt	.Lsqueeze_tail

	stb	r0,1($out)
	srdi	r0,r0,8
	stb	r0,2($out)
	srdi	r0,r0,8
	stb	r0,3($out)
	srdi	r0,r0,8
	stb	r0,4($out)
	srdi	r0,r0,8
	stb	r0,5($out)
	srdi	r0,r0,8
	stb	r0,6($out)
	srdi	r0,r0,8
	stb	r0,7($out)
	srdi	r0,r0,8
	stbu	r0,8($out)

	subic.	$len,$len,8
	beq	.Lsqueeze_done

	subic.	r6,r6,8
	bgt	.Loop_squeeze

	mr	r3,$A_flat
	bl	KeccakF1600
	subi	r3,$A_flat,8		; prepare for ldu
	mr	r6,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
.Loop_tail:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail

.Lsqueeze_done:
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
	$POP	r28,`6*$SIZE_T`($sp)
	$POP	r29,`7*$SIZE_T`($sp)
	$POP	r30,`8*$SIZE_T`($sp)
	$POP	r31,`9*$SIZE_T`($sp)
	mtlr	r0
	addi	$sp,$sp,`10*$SIZE_T`
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
715
# Ugly hack here, because PPC assembler syntax seems to vary too much
# from platform to platform...
#
# PICmeup returns the run-time address of the iotas table in r12 without
# any relocation: bcl 20,31,$+4 puts the address of the following mflr
# into LR, and the table is placed at a fixed distance from there.  The
# routine is aligned to 64 bytes, the code plus the trailing tag words
# take 9*4 bytes and .space pads that to 64, so the table starts exactly
# 64-8 bytes past the address captured by the second mflr.  The caller's
# LR is carried through r0, so r0 and r12 are the only registers changed.
my @iotas = qw(
    0x0000000000000001 0x0000000000008082
    0x800000000000808a 0x8000000080008000
    0x000000000000808b 0x0000000080000001
    0x8000000080008081 0x8000000000008009
    0x000000000000008a 0x0000000000000088
    0x0000000080008009 0x000000008000000a
    0x000000008000808b 0x800000000000008b
    0x8000000000008089 0x8000000000008003
    0x8000000000008002 0x8000000000000080
    0x000000000000800a 0x800000008000000a
    0x8000000080008081 0x8000000000008080
    0x0000000080000001 0x8000000080008008
);

$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	iotas,\@object
iotas:
___
# one 64-bit round constant per round, in round order
$code.="\t.quad\t$_\n" foreach (@iotas);
$code.=<<___;
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
759
# Whatever is still enclosed in backticks in the generated text is a Perl
# expression (offset arithmetic); substitute each one with its value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# STDOUT was reopened as a pipe into the translator earlier in the script,
# so the result of close is checked rather than ignored.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";