]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/keccak1600-ppc64.pl
PPC assembly pack: correct POWER9 results.
[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-ppc64.pl
1 #!/usr/bin/env perl
2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for PPC64.
17 #
18 # June 2017.
19 #
20 # This is a straightforward KECCAK_1X_ALT implementation that works on
21 # *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and
22 # it's possible to achieve performance better than below, but that is
23 # naturally an option only for POWER8 and successors...
24 #
25 ######################################################################
26 # Numbers are cycles per processed byte.
27 #
28 # r=1088(*)
29 #
30 # PPC970/G5 14.6/+120%
31 # POWER7 10.3/+100%
32 # POWER8 11.5/+85%
33 # POWER9 9.4/+45%
34 #
35 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
36 # over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
37 # much better (but watch out for them generating code specific
38 # to the processor they execute on).
39
40 $flavour = shift; # first command-line argument: target flavour, also forwarded to ppc-xlate.pl below
41 # Only flavours matching /64/ are accepted; anything else aborts generation.
42 if ($flavour =~ /64/) {
43 $SIZE_T =8; # bytes per stack slot; scales every frame offset computed below
44 $LRSAVE =2*$SIZE_T; # offset, from the caller's stack pointer, of the slot where the link register is saved
45 $UCMP ="cmpld"; # compare mnemonic used in SHA3_absorb (and with an "i" suffix in SHA3_squeeze)
46 $STU ="stdu"; # store-with-update mnemonic used to allocate the stack frame
47 $POP ="ld"; # load mnemonic used to restore saved registers and locals
48 $PUSH ="std"; # store mnemonic used to save registers and locals
49 } else { die "nonsense $flavour"; }
50
51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path, including the trailing separator
52 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or # look for the translator next to this script first...
53 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or # ...then in ../../perlasm/ relative to it
54 die "can't locate ppc-xlate.pl";
55 # Pipe everything printed to STDOUT through ppc-xlate.pl; the next command-line argument is passed on to it.
56 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!"; # "or", not "||": "||" bound to shift, so a failed open was never reported
57
58 $FRAME=24*$SIZE_T+6*$SIZE_T+32; # stack frame size in bytes allocated by KeccakF1600 and SHA3_absorb
59 $LOCALS=6*$SIZE_T; # base offset of the local slots, in $SIZE_T units: +0 A[][] pointer, +1 inp, +2 len, +3 bsz, +4 iotas cursor
60 $TEMP=$LOCALS+6*$SIZE_T; # base offset of the four-doubleword scratch area where Theta parks A[0..3][4]
61
62 my $sp ="r1"; # register used as the stack pointer in every prologue/epilogue below
63 # Keccak state lanes are kept in registers: A[0]=r7..r11, A[1]=r12..r16, A[2]=r17..r21, A[3]=r22..r26, A[4]=r27..r31.
64 my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
65 (7, 12, 17, 22, 27));
66 $A[1][1] = "r6"; # r13 is reserved, so lane A[1][1] is held in r6 instead
67 # Scratch registers; Theta temporarily extends this list with the registers of spilled A[0..3][4].
68 my @C = map("r$_", (0,3,4,5));
69 # Rotation counts for the rotldi instructions in Rho+Pi, indexed by the source lane's [row][column].
70 my @rhotates = ([ 0, 1, 62, 28, 27 ],
71 [ 36, 44, 6, 55, 20 ],
72 [ 3, 10, 43, 25, 39 ],
73 [ 41, 45, 15, 21, 8 ],
74 [ 18, 2, 61, 56, 14 ]);
75
76 $code.=<<___; # KeccakF1600_int: 24 iterations (counted in CTR) over the register-resident state; this chunk starts Theta
77 .text
78
79 .type KeccakF1600_int,\@function
80 .align 5
81 KeccakF1600_int:
82 li r0,24
83 mtctr r0
84 b .Loop
85 .align 4
86 .Loop:
87 xor $C[0],$A[0][0],$A[1][0] ; Theta
88 std $A[0][4],`$TEMP+0`($sp)
89 xor $C[1],$A[0][1],$A[1][1]
90 std $A[1][4],`$TEMP+8`($sp)
91 xor $C[2],$A[0][2],$A[1][2]
92 std $A[2][4],`$TEMP+16`($sp)
93 xor $C[3],$A[0][3],$A[1][3]
94 std $A[3][4],`$TEMP+24`($sp)
95 ___
96 $C[4]=$A[0][4]; # A[0..3][4] were just stored to the $TEMP area, so the
97 $C[5]=$A[1][4]; # registers holding them are given second names C[4..7]
98 $C[6]=$A[2][4]; # and serve as extra scratch in the next template chunk
99 $C[7]=$A[3][4];
100 $code.=<<___;
101 xor $C[4],$A[0][4],$A[1][4]
102 xor $C[0],$C[0],$A[2][0]
103 xor $C[1],$C[1],$A[2][1]
104 xor $C[2],$C[2],$A[2][2]
105 xor $C[3],$C[3],$A[2][3]
106 xor $C[4],$C[4],$A[2][4]
107 xor $C[0],$C[0],$A[3][0]
108 xor $C[1],$C[1],$A[3][1]
109 xor $C[2],$C[2],$A[3][2]
110 xor $C[3],$C[3],$A[3][3]
111 xor $C[4],$C[4],$A[3][4]
112 xor $C[0],$C[0],$A[4][0]
113 xor $C[2],$C[2],$A[4][2]
114 xor $C[1],$C[1],$A[4][1]
115 xor $C[3],$C[3],$A[4][3]
116 rotldi $C[5],$C[2],1
117 xor $C[4],$C[4],$A[4][4]
118 rotldi $C[6],$C[3],1
119 xor $C[5],$C[5],$C[0]
120 rotldi $C[7],$C[4],1
121
122 xor $A[0][1],$A[0][1],$C[5]
123 xor $A[1][1],$A[1][1],$C[5]
124 xor $A[2][1],$A[2][1],$C[5]
125 xor $A[3][1],$A[3][1],$C[5]
126 xor $A[4][1],$A[4][1],$C[5]
127
128 rotldi $C[5],$C[0],1
129 xor $C[6],$C[6],$C[1]
130 xor $C[2],$C[2],$C[7]
131 rotldi $C[7],$C[1],1
132 xor $C[3],$C[3],$C[5]
133 xor $C[4],$C[4],$C[7]
134
135 xor $C[1], $A[0][2],$C[6] ;mr $C[1],$A[0][2]
136 xor $A[1][2],$A[1][2],$C[6]
137 xor $A[2][2],$A[2][2],$C[6]
138 xor $A[3][2],$A[3][2],$C[6]
139 xor $A[4][2],$A[4][2],$C[6]
140
141 xor $A[0][0],$A[0][0],$C[4]
142 xor $A[1][0],$A[1][0],$C[4]
143 xor $A[2][0],$A[2][0],$C[4]
144 xor $A[3][0],$A[3][0],$C[4]
145 xor $A[4][0],$A[4][0],$C[4]
146 ___
147 $C[4]=undef; # retire the borrowed scratch names: the next template chunk
148 $C[5]=undef; # reloads A[0..3][4] from the $TEMP area into these registers,
149 $C[6]=undef; # so any later use of C[4..7] would interpolate as an empty
150 $C[7]=undef; # string instead of silently naming a live state register
151 $code.=<<___;
152 ld $A[0][4],`$TEMP+0`($sp)
153 xor $C[0], $A[0][3],$C[2] ;mr $C[0],$A[0][3]
154 ld $A[1][4],`$TEMP+8`($sp)
155 xor $A[1][3],$A[1][3],$C[2]
156 ld $A[2][4],`$TEMP+16`($sp)
157 xor $A[2][3],$A[2][3],$C[2]
158 ld $A[3][4],`$TEMP+24`($sp)
159 xor $A[3][3],$A[3][3],$C[2]
160 xor $A[4][3],$A[4][3],$C[2]
161
162 xor $C[2], $A[0][4],$C[3] ;mr $C[2],$A[0][4]
163 xor $A[1][4],$A[1][4],$C[3]
164 xor $A[2][4],$A[2][4],$C[3]
165 xor $A[3][4],$A[3][4],$C[3]
166 xor $A[4][4],$A[4][4],$C[3]
167
168 mr $C[3],$A[0][1] ; Rho+Pi
169 rotldi $A[0][1],$A[1][1],$rhotates[1][1]
170 ;mr $C[1],$A[0][2]
171 rotldi $A[0][2],$A[2][2],$rhotates[2][2]
172 ;mr $C[0],$A[0][3]
173 rotldi $A[0][3],$A[3][3],$rhotates[3][3]
174 ;mr $C[2],$A[0][4]
175 rotldi $A[0][4],$A[4][4],$rhotates[4][4]
176
177 rotldi $A[1][1],$A[1][4],$rhotates[1][4]
178 rotldi $A[2][2],$A[2][3],$rhotates[2][3]
179 rotldi $A[3][3],$A[3][2],$rhotates[3][2]
180 rotldi $A[4][4],$A[4][1],$rhotates[4][1]
181
182 rotldi $A[1][4],$A[4][2],$rhotates[4][2]
183 rotldi $A[2][3],$A[3][4],$rhotates[3][4]
184 rotldi $A[3][2],$A[2][1],$rhotates[2][1]
185 rotldi $A[4][1],$A[1][3],$rhotates[1][3]
186
187 rotldi $A[4][2],$A[2][4],$rhotates[2][4]
188 rotldi $A[3][4],$A[4][3],$rhotates[4][3]
189 rotldi $A[2][1],$A[1][2],$rhotates[1][2]
190 rotldi $A[1][3],$A[3][1],$rhotates[3][1]
191
192 rotldi $A[2][4],$A[4][0],$rhotates[4][0]
193 rotldi $A[4][3],$A[3][0],$rhotates[3][0]
194 rotldi $A[1][2],$A[2][0],$rhotates[2][0]
195 rotldi $A[3][1],$A[1][0],$rhotates[1][0]
196
197 rotldi $A[1][0],$C[0],$rhotates[0][3]
198 rotldi $A[2][0],$C[3],$rhotates[0][1]
199 rotldi $A[3][0],$C[2],$rhotates[0][4]
200 rotldi $A[4][0],$C[1],$rhotates[0][2]
201
202 andc $C[0],$A[0][2],$A[0][1] ; Chi+Iota
203 andc $C[1],$A[0][3],$A[0][2]
204 andc $C[2],$A[0][0],$A[0][4]
205 andc $C[3],$A[0][1],$A[0][0]
206 xor $A[0][0],$A[0][0],$C[0]
207 andc $C[0],$A[0][4],$A[0][3]
208 xor $A[0][1],$A[0][1],$C[1]
209 ld $C[1],`$LOCALS+4*$SIZE_T`($sp)
210 xor $A[0][3],$A[0][3],$C[2]
211 xor $A[0][4],$A[0][4],$C[3]
212 xor $A[0][2],$A[0][2],$C[0]
213 ldu $C[3],8($C[1]) ; Iota[i++]
214
215 andc $C[0],$A[1][2],$A[1][1]
216 std $C[1],`$LOCALS+4*$SIZE_T`($sp)
217 andc $C[1],$A[1][3],$A[1][2]
218 andc $C[2],$A[1][0],$A[1][4]
219 xor $A[0][0],$A[0][0],$C[3] ; A[0][0] ^= Iota
220 andc $C[3],$A[1][1],$A[1][0]
221 xor $A[1][0],$A[1][0],$C[0]
222 andc $C[0],$A[1][4],$A[1][3]
223 xor $A[1][1],$A[1][1],$C[1]
224 xor $A[1][3],$A[1][3],$C[2]
225 xor $A[1][4],$A[1][4],$C[3]
226 xor $A[1][2],$A[1][2],$C[0]
227
228 andc $C[0],$A[2][2],$A[2][1]
229 andc $C[1],$A[2][3],$A[2][2]
230 andc $C[2],$A[2][0],$A[2][4]
231 andc $C[3],$A[2][1],$A[2][0]
232 xor $A[2][0],$A[2][0],$C[0]
233 andc $C[0],$A[2][4],$A[2][3]
234 xor $A[2][1],$A[2][1],$C[1]
235 xor $A[2][3],$A[2][3],$C[2]
236 xor $A[2][4],$A[2][4],$C[3]
237 xor $A[2][2],$A[2][2],$C[0]
238
239 andc $C[0],$A[3][2],$A[3][1]
240 andc $C[1],$A[3][3],$A[3][2]
241 andc $C[2],$A[3][0],$A[3][4]
242 andc $C[3],$A[3][1],$A[3][0]
243 xor $A[3][0],$A[3][0],$C[0]
244 andc $C[0],$A[3][4],$A[3][3]
245 xor $A[3][1],$A[3][1],$C[1]
246 xor $A[3][3],$A[3][3],$C[2]
247 xor $A[3][4],$A[3][4],$C[3]
248 xor $A[3][2],$A[3][2],$C[0]
249
250 andc $C[0],$A[4][2],$A[4][1]
251 andc $C[1],$A[4][3],$A[4][2]
252 andc $C[2],$A[4][0],$A[4][4]
253 andc $C[3],$A[4][1],$A[4][0]
254 xor $A[4][0],$A[4][0],$C[0]
255 andc $C[0],$A[4][4],$A[4][3]
256 xor $A[4][1],$A[4][1],$C[1]
257 xor $A[4][3],$A[4][3],$C[2]
258 xor $A[4][4],$A[4][4],$C[3]
259 xor $A[4][2],$A[4][2],$C[0]
260
261 bdnz .Loop
262
263 blr
264 .long 0
265 .byte 0,12,0x14,0,0,0,0,0
266 .size KeccakF1600_int,.-KeccakF1600_int
267
268 .type KeccakF1600,\@function
269 .align 5
270 KeccakF1600:
271 $STU $sp,-$FRAME($sp)
272 mflr r0
273 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
274 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
275 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
276 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
277 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
278 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
279 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
280 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
281 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
282 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
283 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
284 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
285 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
286 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
287 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
288 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
289 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
290 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
291 $PUSH r0,`$FRAME+$LRSAVE`($sp)
292
293 bl PICmeup
294 subi r12,r12,8 ; prepare for ldu
295
296 $PUSH r3,`$LOCALS+0*$SIZE_T`($sp)
297 ;$PUSH r4,`$LOCALS+1*$SIZE_T`($sp)
298 ;$PUSH r5,`$LOCALS+2*$SIZE_T`($sp)
299 ;$PUSH r6,`$LOCALS+3*$SIZE_T`($sp)
300 $PUSH r12,`$LOCALS+4*$SIZE_T`($sp)
301
302 ld $A[0][0],`8*0`(r3) ; load A[5][5]
303 ld $A[0][1],`8*1`(r3)
304 ld $A[0][2],`8*2`(r3)
305 ld $A[0][3],`8*3`(r3)
306 ld $A[0][4],`8*4`(r3)
307 ld $A[1][0],`8*5`(r3)
308 ld $A[1][1],`8*6`(r3)
309 ld $A[1][2],`8*7`(r3)
310 ld $A[1][3],`8*8`(r3)
311 ld $A[1][4],`8*9`(r3)
312 ld $A[2][0],`8*10`(r3)
313 ld $A[2][1],`8*11`(r3)
314 ld $A[2][2],`8*12`(r3)
315 ld $A[2][3],`8*13`(r3)
316 ld $A[2][4],`8*14`(r3)
317 ld $A[3][0],`8*15`(r3)
318 ld $A[3][1],`8*16`(r3)
319 ld $A[3][2],`8*17`(r3)
320 ld $A[3][3],`8*18`(r3)
321 ld $A[3][4],`8*19`(r3)
322 ld $A[4][0],`8*20`(r3)
323 ld $A[4][1],`8*21`(r3)
324 ld $A[4][2],`8*22`(r3)
325 ld $A[4][3],`8*23`(r3)
326 ld $A[4][4],`8*24`(r3)
327
328 bl KeccakF1600_int
329
330 $POP r3,`$LOCALS+0*$SIZE_T`($sp)
331 std $A[0][0],`8*0`(r3) ; return A[5][5]
332 std $A[0][1],`8*1`(r3)
333 std $A[0][2],`8*2`(r3)
334 std $A[0][3],`8*3`(r3)
335 std $A[0][4],`8*4`(r3)
336 std $A[1][0],`8*5`(r3)
337 std $A[1][1],`8*6`(r3)
338 std $A[1][2],`8*7`(r3)
339 std $A[1][3],`8*8`(r3)
340 std $A[1][4],`8*9`(r3)
341 std $A[2][0],`8*10`(r3)
342 std $A[2][1],`8*11`(r3)
343 std $A[2][2],`8*12`(r3)
344 std $A[2][3],`8*13`(r3)
345 std $A[2][4],`8*14`(r3)
346 std $A[3][0],`8*15`(r3)
347 std $A[3][1],`8*16`(r3)
348 std $A[3][2],`8*17`(r3)
349 std $A[3][3],`8*18`(r3)
350 std $A[3][4],`8*19`(r3)
351 std $A[4][0],`8*20`(r3)
352 std $A[4][1],`8*21`(r3)
353 std $A[4][2],`8*22`(r3)
354 std $A[4][3],`8*23`(r3)
355 std $A[4][4],`8*24`(r3)
356
357 $POP r0,`$FRAME+$LRSAVE`($sp)
358 $POP r14,`$FRAME-$SIZE_T*18`($sp)
359 $POP r15,`$FRAME-$SIZE_T*17`($sp)
360 $POP r16,`$FRAME-$SIZE_T*16`($sp)
361 $POP r17,`$FRAME-$SIZE_T*15`($sp)
362 $POP r18,`$FRAME-$SIZE_T*14`($sp)
363 $POP r19,`$FRAME-$SIZE_T*13`($sp)
364 $POP r20,`$FRAME-$SIZE_T*12`($sp)
365 $POP r21,`$FRAME-$SIZE_T*11`($sp)
366 $POP r22,`$FRAME-$SIZE_T*10`($sp)
367 $POP r23,`$FRAME-$SIZE_T*9`($sp)
368 $POP r24,`$FRAME-$SIZE_T*8`($sp)
369 $POP r25,`$FRAME-$SIZE_T*7`($sp)
370 $POP r26,`$FRAME-$SIZE_T*6`($sp)
371 $POP r27,`$FRAME-$SIZE_T*5`($sp)
372 $POP r28,`$FRAME-$SIZE_T*4`($sp)
373 $POP r29,`$FRAME-$SIZE_T*3`($sp)
374 $POP r30,`$FRAME-$SIZE_T*2`($sp)
375 $POP r31,`$FRAME-$SIZE_T*1`($sp)
376 mtlr r0
377 addi $sp,$sp,$FRAME
378 blr
379 .long 0
380 .byte 0,12,4,1,0x80,18,1,0
381 .long 0
382 .size KeccakF1600,.-KeccakF1600
383
384 .type dword_le_load,\@function
385 .align 5
386 dword_le_load:
387 lbzu r0,1(r3)
388 lbzu r4,1(r3)
389 lbzu r5,1(r3)
390 insrdi r0,r4,8,48
391 lbzu r4,1(r3)
392 insrdi r0,r5,8,40
393 lbzu r5,1(r3)
394 insrdi r0,r4,8,32
395 lbzu r4,1(r3)
396 insrdi r0,r5,8,24
397 lbzu r5,1(r3)
398 insrdi r0,r4,8,16
399 lbzu r4,1(r3)
400 insrdi r0,r5,8,8
401 insrdi r0,r4,8,0
402 blr
403 .long 0
404 .byte 0,12,0x14,0,0,0,1,0
405 .long 0
406 .size dword_le_load,.-dword_le_load
407
408 .globl SHA3_absorb
409 .type SHA3_absorb,\@function
410 .align 5
411 SHA3_absorb:
412 $STU $sp,-$FRAME($sp)
413 mflr r0
414 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
415 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
416 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
417 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
418 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
419 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
420 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
421 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
422 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
423 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
424 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
425 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
426 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
427 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
428 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
429 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
430 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
431 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
432 $PUSH r0,`$FRAME+$LRSAVE`($sp)
433
434 bl PICmeup
435 subi r4,r4,1 ; prepare for lbzu
436 subi r12,r12,8 ; prepare for ldu
437
438 $PUSH r3,`$LOCALS+0*$SIZE_T`($sp) ; save A[][]
439 $PUSH r4,`$LOCALS+1*$SIZE_T`($sp) ; save inp
440 $PUSH r5,`$LOCALS+2*$SIZE_T`($sp) ; save len
441 $PUSH r6,`$LOCALS+3*$SIZE_T`($sp) ; save bsz
442 mr r0,r6
443 $PUSH r12,`$LOCALS+4*$SIZE_T`($sp)
444
445 ld $A[0][0],`8*0`(r3) ; load A[5][5]
446 ld $A[0][1],`8*1`(r3)
447 ld $A[0][2],`8*2`(r3)
448 ld $A[0][3],`8*3`(r3)
449 ld $A[0][4],`8*4`(r3)
450 ld $A[1][0],`8*5`(r3)
451 ld $A[1][1],`8*6`(r3)
452 ld $A[1][2],`8*7`(r3)
453 ld $A[1][3],`8*8`(r3)
454 ld $A[1][4],`8*9`(r3)
455 ld $A[2][0],`8*10`(r3)
456 ld $A[2][1],`8*11`(r3)
457 ld $A[2][2],`8*12`(r3)
458 ld $A[2][3],`8*13`(r3)
459 ld $A[2][4],`8*14`(r3)
460 ld $A[3][0],`8*15`(r3)
461 ld $A[3][1],`8*16`(r3)
462 ld $A[3][2],`8*17`(r3)
463 ld $A[3][3],`8*18`(r3)
464 ld $A[3][4],`8*19`(r3)
465 ld $A[4][0],`8*20`(r3)
466 ld $A[4][1],`8*21`(r3)
467 ld $A[4][2],`8*22`(r3)
468 ld $A[4][3],`8*23`(r3)
469 ld $A[4][4],`8*24`(r3)
470
471 mr r3,r4
472 mr r4,r5
473 mr r5,r0
474
475 b .Loop_absorb
476
477 .align 4
478 .Loop_absorb:
479 $UCMP r4,r5 ; len < bsz?
480 blt .Labsorbed
481
482 sub r4,r4,r5 ; len -= bsz
483 srwi r5,r5,3
484 $PUSH r4,`$LOCALS+2*$SIZE_T`($sp) ; save len
485 mtctr r5
486 bl dword_le_load ; *inp++
487 xor $A[0][0],$A[0][0],r0
488 bdz .Lprocess_block
489 bl dword_le_load ; *inp++
490 xor $A[0][1],$A[0][1],r0
491 bdz .Lprocess_block
492 bl dword_le_load ; *inp++
493 xor $A[0][2],$A[0][2],r0
494 bdz .Lprocess_block
495 bl dword_le_load ; *inp++
496 xor $A[0][3],$A[0][3],r0
497 bdz .Lprocess_block
498 bl dword_le_load ; *inp++
499 xor $A[0][4],$A[0][4],r0
500 bdz .Lprocess_block
501 bl dword_le_load ; *inp++
502 xor $A[1][0],$A[1][0],r0
503 bdz .Lprocess_block
504 bl dword_le_load ; *inp++
505 xor $A[1][1],$A[1][1],r0
506 bdz .Lprocess_block
507 bl dword_le_load ; *inp++
508 xor $A[1][2],$A[1][2],r0
509 bdz .Lprocess_block
510 bl dword_le_load ; *inp++
511 xor $A[1][3],$A[1][3],r0
512 bdz .Lprocess_block
513 bl dword_le_load ; *inp++
514 xor $A[1][4],$A[1][4],r0
515 bdz .Lprocess_block
516 bl dword_le_load ; *inp++
517 xor $A[2][0],$A[2][0],r0
518 bdz .Lprocess_block
519 bl dword_le_load ; *inp++
520 xor $A[2][1],$A[2][1],r0
521 bdz .Lprocess_block
522 bl dword_le_load ; *inp++
523 xor $A[2][2],$A[2][2],r0
524 bdz .Lprocess_block
525 bl dword_le_load ; *inp++
526 xor $A[2][3],$A[2][3],r0
527 bdz .Lprocess_block
528 bl dword_le_load ; *inp++
529 xor $A[2][4],$A[2][4],r0
530 bdz .Lprocess_block
531 bl dword_le_load ; *inp++
532 xor $A[3][0],$A[3][0],r0
533 bdz .Lprocess_block
534 bl dword_le_load ; *inp++
535 xor $A[3][1],$A[3][1],r0
536 bdz .Lprocess_block
537 bl dword_le_load ; *inp++
538 xor $A[3][2],$A[3][2],r0
539 bdz .Lprocess_block
540 bl dword_le_load ; *inp++
541 xor $A[3][3],$A[3][3],r0
542 bdz .Lprocess_block
543 bl dword_le_load ; *inp++
544 xor $A[3][4],$A[3][4],r0
545 bdz .Lprocess_block
546 bl dword_le_load ; *inp++
547 xor $A[4][0],$A[4][0],r0
548 bdz .Lprocess_block
549 bl dword_le_load ; *inp++
550 xor $A[4][1],$A[4][1],r0
551 bdz .Lprocess_block
552 bl dword_le_load ; *inp++
553 xor $A[4][2],$A[4][2],r0
554 bdz .Lprocess_block
555 bl dword_le_load ; *inp++
556 xor $A[4][3],$A[4][3],r0
557 bdz .Lprocess_block
558 bl dword_le_load ; *inp++
559 xor $A[4][4],$A[4][4],r0
560
561 .Lprocess_block:
562 $PUSH r3,`$LOCALS+1*$SIZE_T`($sp) ; save inp
563
564 bl KeccakF1600_int
565
566 $POP r0,`$LOCALS+4*$SIZE_T`($sp) ; pull iotas[24]
567 $POP r5,`$LOCALS+3*$SIZE_T`($sp) ; restore bsz
568 $POP r4,`$LOCALS+2*$SIZE_T`($sp) ; restore len
569 $POP r3,`$LOCALS+1*$SIZE_T`($sp) ; restore inp
570 addic r0,r0,`-8*24` ; rewind iotas
571 $PUSH r0,`$LOCALS+4*$SIZE_T`($sp)
572
573 b .Loop_absorb
574
575 .align 4
576 .Labsorbed:
577 $POP r3,`$LOCALS+0*$SIZE_T`($sp)
578 std $A[0][0],`8*0`(r3) ; return A[5][5]
579 std $A[0][1],`8*1`(r3)
580 std $A[0][2],`8*2`(r3)
581 std $A[0][3],`8*3`(r3)
582 std $A[0][4],`8*4`(r3)
583 std $A[1][0],`8*5`(r3)
584 std $A[1][1],`8*6`(r3)
585 std $A[1][2],`8*7`(r3)
586 std $A[1][3],`8*8`(r3)
587 std $A[1][4],`8*9`(r3)
588 std $A[2][0],`8*10`(r3)
589 std $A[2][1],`8*11`(r3)
590 std $A[2][2],`8*12`(r3)
591 std $A[2][3],`8*13`(r3)
592 std $A[2][4],`8*14`(r3)
593 std $A[3][0],`8*15`(r3)
594 std $A[3][1],`8*16`(r3)
595 std $A[3][2],`8*17`(r3)
596 std $A[3][3],`8*18`(r3)
597 std $A[3][4],`8*19`(r3)
598 std $A[4][0],`8*20`(r3)
599 std $A[4][1],`8*21`(r3)
600 std $A[4][2],`8*22`(r3)
601 std $A[4][3],`8*23`(r3)
602 std $A[4][4],`8*24`(r3)
603
604 mr r3,r4 ; return value
605 $POP r0,`$FRAME+$LRSAVE`($sp)
606 $POP r14,`$FRAME-$SIZE_T*18`($sp)
607 $POP r15,`$FRAME-$SIZE_T*17`($sp)
608 $POP r16,`$FRAME-$SIZE_T*16`($sp)
609 $POP r17,`$FRAME-$SIZE_T*15`($sp)
610 $POP r18,`$FRAME-$SIZE_T*14`($sp)
611 $POP r19,`$FRAME-$SIZE_T*13`($sp)
612 $POP r20,`$FRAME-$SIZE_T*12`($sp)
613 $POP r21,`$FRAME-$SIZE_T*11`($sp)
614 $POP r22,`$FRAME-$SIZE_T*10`($sp)
615 $POP r23,`$FRAME-$SIZE_T*9`($sp)
616 $POP r24,`$FRAME-$SIZE_T*8`($sp)
617 $POP r25,`$FRAME-$SIZE_T*7`($sp)
618 $POP r26,`$FRAME-$SIZE_T*6`($sp)
619 $POP r27,`$FRAME-$SIZE_T*5`($sp)
620 $POP r28,`$FRAME-$SIZE_T*4`($sp)
621 $POP r29,`$FRAME-$SIZE_T*3`($sp)
622 $POP r30,`$FRAME-$SIZE_T*2`($sp)
623 $POP r31,`$FRAME-$SIZE_T*1`($sp)
624 mtlr r0
625 addi $sp,$sp,$FRAME
626 blr
627 .long 0
628 .byte 0,12,4,1,0x80,18,4,0
629 .long 0
630 .size SHA3_absorb,.-SHA3_absorb
631 ___
632 { # SHA3_squeeze: writes len bytes of the state to out, least significant byte of each lane first; calls KeccakF1600 after each bsz bytes
633 my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31)); # r28..r31 (saved/restored by the template) hold copies of the arguments r3..r6: state, out, len, bsz
634 $code.=<<___;
635 .globl SHA3_squeeze
636 .type SHA3_squeeze,\@function
637 .align 5
638 SHA3_squeeze:
639 $STU $sp,`-10*$SIZE_T`($sp)
640 mflr r0
641 $PUSH r28,`6*$SIZE_T`($sp)
642 $PUSH r29,`7*$SIZE_T`($sp)
643 $PUSH r30,`8*$SIZE_T`($sp)
644 $PUSH r31,`9*$SIZE_T`($sp)
645 $PUSH r0,`10*$SIZE_T+$LRSAVE`($sp)
646
647 mr $A_flat,r3
648 subi r3,r3,8 ; prepare for ldu
649 subi $out,r4,1 ; prepare for stbu
650 mr $len,r5
651 mr $bsz,r6
652 b .Loop_squeeze
653
654 .align 4
655 .Loop_squeeze:
656 ldu r0,8(r3)
657 ${UCMP}i $len,8
658 blt .Lsqueeze_tail
659
660 stbu r0,1($out)
661 srdi r0,r0,8
662 stbu r0,1($out)
663 srdi r0,r0,8
664 stbu r0,1($out)
665 srdi r0,r0,8
666 stbu r0,1($out)
667 srdi r0,r0,8
668 stbu r0,1($out)
669 srdi r0,r0,8
670 stbu r0,1($out)
671 srdi r0,r0,8
672 stbu r0,1($out)
673 srdi r0,r0,8
674 stbu r0,1($out)
675
676 subic. $len,$len,8
677 beq .Lsqueeze_done
678
679 subic. r6,r6,8
680 bgt .Loop_squeeze
681
682 mr r3,$A_flat
683 bl KeccakF1600
684 subi r3,$A_flat,8 ; prepare for ldu
685 mr r6,$bsz
686 b .Loop_squeeze
687
688 .align 4
689 .Lsqueeze_tail:
690 mtctr $len
691 .Loop_tail:
692 stbu r0,1($out)
693 srdi r0,r0,8
694 bdnz .Loop_tail
695
696 .Lsqueeze_done:
697 $POP r0,`10*$SIZE_T+$LRSAVE`($sp)
698 $POP r28,`6*$SIZE_T`($sp)
699 $POP r29,`7*$SIZE_T`($sp)
700 $POP r30,`8*$SIZE_T`($sp)
701 $POP r31,`9*$SIZE_T`($sp)
702 mtlr r0
703 addi $sp,$sp,`10*$SIZE_T`
704 blr
705 .long 0
706 .byte 0,12,4,1,0x80,4,4,0
707 .long 0
708 .size SHA3_squeeze,.-SHA3_squeeze
709 ___
710 }
711
712 # Ugly hack here, because PPC assembler syntax seems to vary too much from platform to platform:
713 # PICmeup is padded to exactly 64 bytes (9 words of code/traceback + .space) so iotas sits at (address of "mflr r12")+64-8, returned in r12.
714 $code.=<<___;
715 .align 6
716 PICmeup:
717 mflr r0
718 bcl 20,31,\$+4
719 mflr r12 ; vvvvvv "distance" between . and 1st data entry
720 addi r12,r12,`64-8`
721 mtlr r0
722 blr
723 .long 0
724 .byte 0,12,0x14,0,0,0,0,0
725 .space `64-9*4`
726 .type iotas,\@object
727 iotas:
728 .quad 0x0000000000000001
729 .quad 0x0000000000008082
730 .quad 0x800000000000808a
731 .quad 0x8000000080008000
732 .quad 0x000000000000808b
733 .quad 0x0000000080000001
734 .quad 0x8000000080008081
735 .quad 0x8000000000008009
736 .quad 0x000000000000008a
737 .quad 0x0000000000000088
738 .quad 0x0000000080008009
739 .quad 0x000000008000000a
740 .quad 0x000000008000808b
741 .quad 0x800000000000008b
742 .quad 0x8000000000008089
743 .quad 0x8000000000008003
744 .quad 0x8000000000008002
745 .quad 0x8000000000000080
746 .quad 0x000000000000800a
747 .quad 0x800000008000000a
748 .quad 0x8000000080008081
749 .quad 0x8000000000008080
750 .quad 0x0000000080000001
751 .quad 0x8000000080008008
752 .size iotas,.-iotas
753 .asciz "Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
754 ___
755 # Replace every `...` span in the template with the value of the Perl expression inside it, then emit the result.
756 $code =~ s/\`([^\`]*)\`/eval $1/gem;
757 print $code;
758 close STDOUT or die "error closing STDOUT: $!"; # closing the pipe waits for ppc-xlate.pl; fail loudly if it (or the write) failed