#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PowerISA 2.07.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT SIMD implementation, but with
# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
# A POWER8 processor spends 9.8 cycles per byte processed out of a large
# buffer for r=1088, which matches SHA3-256. This is 17% better than the
# scalar PPC64 code. It probably should be noted that if POWER8's
# successor can achieve a higher scalar instruction issue rate, then
# this module will lose... And it does on POWER9, with 12.0 vs. 9.4.
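#
# For orientation only (this is not the literal KECCAK_1X_ALT source),
# one Keccak-f[1600] round in scalar pseudo-C, with A[y][x] indexed as
# in the comments below; the vector code computes the same permutation
# two 64-bit lanes per register and keeps Rho and Pi as separate passes:
#
#	for (x = 0; x < 5; x++)				/* Theta */
#		C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];
#	for (x = 0; x < 5; x++)
#		D[x] = C[(x + 4) % 5] ^ ROL64(C[(x + 1) % 5], 1);
#	for (y = 0; y < 5; y++)
#		for (x = 0; x < 5; x++)
#			A[y][x] ^= D[x];
#	/* Rho rotates each lane by rhotates[y][x], Pi permutes the  */
#	/* lanes to new positions, Chi computes                      */
#	/* A[y][x] ^= ~A[y][(x+1)%5] & A[y][(x+2)%5], and Iota XORs  */
#	/* the round constant iotas[round] into A[0][0].             */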

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp ="r1";

my $iotas = "r12";

########################################################################
# Register layout:
#
# v0		A[0][0] A[1][0]
# v1		A[0][1] A[1][1]
# v2		A[0][2] A[1][2]
# v3		A[0][3] A[1][3]
# v4		A[0][4] A[1][4]
#
# v5		A[2][0] A[3][0]
# v6		A[2][1] A[3][1]
# v7		A[2][2] A[3][2]
# v8		A[2][3] A[3][3]
# v9		A[2][4] A[3][4]
#
# v10		A[4][0] A[4][1]
# v11		A[4][2] A[4][3]
# v12		A[4][4] A[4][4]
#
# v13..25	rhotates[][]
# v26..31	volatile
#
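# Each 128-bit vector thus carries one 64-bit lane from each of two
# rows of the same column (rows 0-1 in v0..v4, rows 2-3 in v5..v9),
# while row 4 is packed pairwise by column into v10..v12; this is why
# the Theta and Chi code below operates on row pairs and treats the
# v10..v12 group separately.
#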
$code.=<<___;
.machine	"any"
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	li	r0,0
	b	.Loop

.align	4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor	v26,v0, v5		; A[0..1][0]^A[2..3][0]
	vxor	v27,v1, v6		; A[0..1][1]^A[2..3][1]
	vxor	v28,v2, v7		; A[0..1][2]^A[2..3][2]
	vxor	v29,v3, v8		; A[0..1][3]^A[2..3][3]
	vxor	v30,v4, v9		; A[0..1][4]^A[2..3][4]
	vpermdi	v31,v26,v27,0b00	; A[0][0..1]^A[2][0..1]
	vpermdi	v26,v26,v27,0b11	; A[1][0..1]^A[3][0..1]
	vpermdi	v27,v28,v29,0b00	; A[0][2..3]^A[2][2..3]
	vpermdi	v28,v28,v29,0b11	; A[1][2..3]^A[3][2..3]
	vpermdi	v29,v30,v30,0b10	; A[1..0][4]^A[3..2][4]
	vxor	v26,v26,v31		; C[0..1]
	vxor	v27,v27,v28		; C[2..3]
	vxor	v28,v29,v30		; C[4..4]
	vspltisb v31,1
	vxor	v26,v26,v10		; C[0..1] ^= A[4][0..1]
	vxor	v27,v27,v11		; C[2..3] ^= A[4][2..3]
	vxor	v28,v28,v12		; C[4..4] ^= A[4][4..4], low!

	vrld	v29,v26,v31		; ROL64(C[0..1],1)
	vrld	v30,v27,v31		; ROL64(C[2..3],1)
	vrld	v31,v28,v31		; ROL64(C[4..4],1)
	vpermdi	v31,v31,v29,0b10
	vxor	v26,v26,v30		; C[0..1] ^= ROL64(C[2..3],1)
	vxor	v27,v27,v31		; C[2..3] ^= ROL64(C[4..0],1)
	vxor	v28,v28,v29		; C[4..4] ^= ROL64(C[0..1],1), low!

	vpermdi	v29,v26,v26,0b00	; C[0..0]
	vpermdi	v30,v28,v26,0b10	; C[4..0]
	vpermdi	v31,v28,v28,0b11	; C[4..4]
	vxor	v1, v1, v29		; A[0..1][1] ^= C[0..0]
	vxor	v6, v6, v29		; A[2..3][1] ^= C[0..0]
	vxor	v10,v10,v30		; A[4][0..1] ^= C[4..0]
	vxor	v0, v0, v31		; A[0..1][0] ^= C[4..4]
	vxor	v5, v5, v31		; A[2..3][0] ^= C[4..4]

	vpermdi	v29,v27,v27,0b00	; C[2..2]
	vpermdi	v30,v26,v26,0b11	; C[1..1]
	vpermdi	v31,v26,v27,0b10	; C[1..2]
	vxor	v3, v3, v29		; A[0..1][3] ^= C[2..2]
	vxor	v8, v8, v29		; A[2..3][3] ^= C[2..2]
	vxor	v2, v2, v30		; A[0..1][2] ^= C[1..1]
	vxor	v7, v7, v30		; A[2..3][2] ^= C[1..1]
	vxor	v11,v11,v31		; A[4][2..3] ^= C[1..2]

	vpermdi	v29,v27,v27,0b11	; C[3..3]
	vxor	v4, v4, v29		; A[0..1][4] ^= C[3..3]
	vxor	v9, v9, v29		; A[2..3][4] ^= C[3..3]
	vxor	v12,v12,v29		; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	vrld	v26,v0, v13		; v0
	vrld	v1, v1, v14
	vrld	v27,v2, v15		; v2
	vrld	v28,v3, v16		; v3
	vrld	v4, v4, v17
	vrld	v5, v5, v18
	vrld	v6, v6, v19
	vrld	v29,v7, v20		; v7
	vrld	v8, v8, v21
	vrld	v9, v9, v22
	vrld	v10,v10,v23
	vrld	v30,v11,v24		; v11
	vrld	v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	vpermdi	v0, v26,v28,0b00	; [0][0] [1][0] < [0][0] [0][3]
	vpermdi	v2, v29,v5, 0b00	; [0][2] [1][2] < [2][2] [2][0]
	vpermdi	v11,v9, v5, 0b01	; [4][2] [4][3] < [2][4] [3][0]
	vpermdi	v5, v1, v4, 0b00	; [2][0] [3][0] < [0][1] [0][4]
	vpermdi	v1, v1, v4, 0b11	; [0][1] [1][1] < [1][1] [1][4]
	vpermdi	v3, v8, v6, 0b11	; [0][3] [1][3] < [3][3] [3][1]
	vpermdi	v4, v12,v30,0b10	; [0][4] [1][4] < [4][4] [4][2]
	vpermdi	v7, v8, v6, 0b00	; [2][2] [3][2] < [2][3] [2][1]
	vpermdi	v6, v27,v26,0b11	; [2][1] [3][1] < [1][2] [1][0]
	vpermdi	v8, v9, v29,0b11	; [2][3] [3][3] < [3][4] [3][2]
	vpermdi	v12,v10,v10,0b11	; [4][4] [4][4] < [4][1] [4][1]
	vpermdi	v9, v10,v30,0b01	; [2][4] [3][4] < [4][0] [4][3]
	vpermdi	v10,v27,v28,0b01	; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u	v31,$iotas,r0		; iotas[index]
	addic	r0,r0,16		; index++

	vandc	v26,v2, v1		; (~A[0..1][1] & A[0..1][2])
	vandc	v27,v3, v2		; (~A[0..1][2] & A[0..1][3])
	vandc	v28,v4, v3		; (~A[0..1][3] & A[0..1][4])
	vandc	v29,v0, v4		; (~A[0..1][4] & A[0..1][0])
	vandc	v30,v1, v0		; (~A[0..1][0] & A[0..1][1])
	vxor	v0, v0, v26		; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor	v1, v1, v27		; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor	v2, v2, v28		; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor	v3, v3, v29		; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor	v4, v4, v30		; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc	v26,v7, v6		; (~A[2..3][1] & A[2..3][2])
	vandc	v27,v8, v7		; (~A[2..3][2] & A[2..3][3])
	vandc	v28,v9, v8		; (~A[2..3][3] & A[2..3][4])
	vandc	v29,v5, v9		; (~A[2..3][4] & A[2..3][0])
	vandc	v30,v6, v5		; (~A[2..3][0] & A[2..3][1])
	vxor	v5, v5, v26		; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor	v6, v6, v27		; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor	v7, v7, v28		; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor	v8, v8, v29		; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor	v9, v9, v30		; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor	v0, v0, v31		; A[0][0] ^= iotas[index++]

	vpermdi	v26,v10,v11,0b10	; A[4][1..2]
	vpermdi	v27,v12,v10,0b00	; A[4][4..0]
	vpermdi	v28,v11,v12,0b10	; A[4][3..4]
	vpermdi	v29,v10,v10,0b10	; A[4][1..0]
	vandc	v26,v11,v26		; (~A[4][1..2] & A[4][2..3])
	vandc	v27,v27,v28		; (~A[4][3..4] & A[4][4..0])
	vandc	v28,v10,v29		; (~A[4][1..0] & A[4][0..1])
	vxor	v10,v10,v26		; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor	v11,v11,v27		; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor	v12,v12,v28		; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz	.Loop

	vpermdi	v12,v12,v12,0b11	; broadcast A[4][4]
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

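	; Stand-alone wrapper around KeccakF1600_int: it preserves
	; v20-v31 and vrsave, loads the packed A[5][5] state from r3
	; together with the rhotates and iotas tables, runs the 24
	; rounds and stores the state back to r3.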
.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,r3			; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,r3
	addi	r11,r11,32
	lvx_4w	v2,r10,r3
	addi	r10,r10,32
	lvx_4w	v3,r11,r3
	addi	r11,r11,32
	lvx_4w	v4,r10,r3
	addi	r10,r10,32
	lvx_4w	v5,r11,r3
	addi	r11,r11,32
	lvx_4w	v6,r10,r3
	addi	r10,r10,32
	lvx_4w	v7,r11,r3
	addi	r11,r11,32
	lvx_4w	v8,r10,r3
	addi	r10,r10,32
	lvx_4w	v9,r11,r3
	addi	r11,r11,32
	lvx_4w	v10,r10,r3
	addi	r10,r10,32
	lvx_4w	v11,r11,r3
	lvx_splt v12,r10,r3

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	addi	r12,r12,`16*16`		; points at iotas

	bl	KeccakF1600_int

	li	r11,16
	stvx_4w	v0,0,r3			; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,r3
	addi	r11,r11,32
	stvx_4w	v2,r10,r3
	addi	r10,r10,32
	stvx_4w	v3,r11,r3
	addi	r11,r11,32
	stvx_4w	v4,r10,r3
	addi	r10,r10,32
	stvx_4w	v5,r11,r3
	addi	r11,r11,32
	stvx_4w	v6,r10,r3
	addi	r10,r10,32
	stvx_4w	v7,r11,r3
	addi	r11,r11,32
	stvx_4w	v8,r10,r3
	addi	r10,r10,32
	stvx_4w	v9,r11,r3
	addi	r11,r11,32
	stvx_4w	v10,r10,r3
	addi	r10,r10,32
	stvx_4w	v11,r11,r3
	stvdx_u	v12,r10,r3

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
{
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,$A_jagged		; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v11,r11,$A_jagged
	lvx_splt v12,r10,$A_jagged

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	li	r10,-32
	li	r11,-16
	addi	r12,r12,`16*16`		; points at iotas
	b	.Loop_absorb

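	; each .Loop_absorb iteration absorbs one block of bsz bytes,
	; one 64-bit lane at a time in A[0][0], A[0][1], ... order;
	; v30/v31 are vperm masks that steer each loaded lane into the
	; correct half of its target vector (and byte-swap it on
	; big-endian)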
.align	4
.Loop_absorb:
	$UCMP	$len,$bsz		; len < bsz?
	blt	.Labsorbed

	sub	$len,$len,$bsz		; len -= bsz
	srwi	r0,$bsz,3
	mtctr	r0

	lvx_u	v30,r10,r12		; permutation masks
	lvx_u	v31,r11,r12
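	; instructions prefixed with ? below are emitted only for
	; big-endian flavours (see the ?-substitution in the output
	; loop at the bottom of this file); little-endian input needs
	; no byte swap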
	?vspltisb v27,7			; prepare masks for byte swap
	?vxor	v30,v30,v27		; on big-endian
	?vxor	v31,v31,v27

	vxor	v27,v27,v27		; zero
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v12, v12, v26

.Lprocess_block:
	bl	KeccakF1600_int

	b	.Loop_absorb

.align	4
.Labsorbed:
	li	r11,16
	stvx_4w	v0,0,$A_jagged		; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v11,r11,$A_jagged
	stvdx_u	v12,r10,$A_jagged

	mr	r3,$len			; return value
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}
{
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
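	; each 64-bit lane is read back as two 32-bit words and written
	; to out one byte at a time with stbu, so the output pointer
	; needs no particular alignment; once a full block of bsz bytes
	; has been emitted, KeccakF1600 is called to generate more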
	mflr	r9			; r9 is not touched by KeccakF1600
	subi	$out,$out,1		; prepare for stbu
	addi	r8,$A_jagged,4		; prepare volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze
.align	4
.Loop_squeeze:
	lwzx	r7,r11,r8		; lo
	lwzx	r0,r11,$A_jagged	; hi
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stbu	r7,1($out)		; write lo
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	stbu	r0,1($out)		; write hi
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)

	subic.	$len,$len,8
	beqlr				; return if done

	subic.	r10,r10,8
	ble	.Loutput_expand

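	; r11 steps through the lanes in canonical A[0][0], A[0][1], ...
	; order; because two rows share each 16-byte slot of the packed
	; state (see the register layout above), the offset sequence is
	; not linear, hence the compare-and-adjust ladder below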
	addi	r11,r11,16		; calculate jagged index
	cmplwi	r11,`16*5`
	blt	.Loop_squeeze
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	cmplwi	r11,`16*5+8`
	subi	r11,r11,8
	beq	.Loop_squeeze
	addi	r11,r11,8
	cmplwi	r11,`16*10`
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	blt	.Loop_squeeze
	subi	r11,r11,8
	b	.Loop_squeeze

.align	4
.Loutput_expand:
	bl	KeccakF1600
	mtlr	r9

	addi	r8,$A_jagged,4		; restore volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
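	; fewer than 8 bytes remain: emit up to 4 bytes from the low
	; word, then any leftover from the high word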
	mtctr	$len
	subic.	$len,$len,4
	ble	.Loop_tail_lo
	li	r8,4
	mtctr	r8
.Loop_tail_lo:
	stbu	r7,1($out)
	srdi	r7,r7,8
	bdnz	.Loop_tail_lo
	ble	.Lsqueeze_done
	mtctr	$len
.Loop_tail_hi:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail_hi

.Lsqueeze_done:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
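	; the rho rotation constants below are laid out to match the
	; vector register pairing above: the first five pairs feed
	; v13..v17 (rows 0-1), the next five v18..v22 (rows 2-3), and
	; the last three v23..v25 (row 4, paired by column, with
	; A[4][4] duplicated)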
.type	rhotates,\@object
.align	6
rhotates:
	.quad	0,  36
	.quad	1,  44
	.quad	62, 6
	.quad	28, 55
	.quad	27, 20
	.quad	3,  41
	.quad	10, 45
	.quad	43, 15
	.quad	25, 21
	.quad	39, 8
	.quad	18, 2
	.quad	61, 56
	.quad	14, 14
.size	rhotates,.-rhotates
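	; pad rhotates out to 16*16 bytes so that iotas lands at the
	; offset the code above adds to r12; the last two quads are the
	; vperm masks SHA3_absorb loads at iotas-32 and iotas-16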
	.quad	0,0
	.quad	0x0001020304050607,0x1011121314151617
	.quad	0x1011121314151617,0x0001020304050607
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001,0
	.quad	0x0000000000008082,0
	.quad	0x800000000000808a,0
	.quad	0x8000000080008000,0
	.quad	0x000000000000808b,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008009,0
	.quad	0x000000000000008a,0
	.quad	0x0000000000000088,0
	.quad	0x0000000080008009,0
	.quad	0x000000008000000a,0
	.quad	0x000000008000808b,0
	.quad	0x800000000000008b,0
	.quad	0x8000000000008089,0
	.quad	0x8000000000008003,0
	.quad	0x8000000000008002,0
	.quad	0x8000000000000080,0
	.quad	0x000000000000800a,0
	.quad	0x800000008000000a,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008080,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008008,0
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;	# expand constant expressions in backticks

	if ($flavour =~ /le$/) {	# little-endian
	    s/\?([a-z]+)/;$1/;		# comment out ?-prefixed instructions
	} else {			# big-endian
	    s/\?([a-z]+)/$1/;		# keep ?-prefixed instructions
	}

	print $_,"\n";
}

close STDOUT;