]>
Commit | Line | Data |
---|---|---|
53ddf7dd | 1 | #!/usr/bin/env perl |
83cf7abf | 2 | # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. |
53ddf7dd AP |
3 | # |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | # | |
9 | # ==================================================================== | |
10 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
11 | # project. The module is, however, dual licensed under OpenSSL and | |
12 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
13 | # details see http://www.openssl.org/~appro/cryptogams/. | |
14 | # ==================================================================== | |
15 | # | |
16 | # Keccak-1600 for PowerISA 2.07. | |
17 | # | |
18 | # June 2017. | |
19 | # | |
20 | # This is straightforward KECCAK_1X_ALT SIMD implementation, but with | |
21 | # disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral. | |
22 | # POWER8 processor spends 9.8 cycles to process byte out of large | |
23 | # buffer for r=1088, which matches SHA3-256. This is 17% better than | |
24 | # scalar PPC64 code. It probably should be noted that if POWER8's | |
25 | # successor can achieve higher scalar instruction issue rate, then | |
41013cd6 | 26 | # this module will lose... And it does on POWER9 with 12.0 vs. 9.4. |
53ddf7dd AP |
27 | |
28 | $flavour = shift; # 1st argument: target flavour, must mention 64 or 32 | |
29 | ||
30 | if ($flavour =~ /64/) { # 64-bit ABI | |
31 | $SIZE_T =8; | |
32 | $LRSAVE =2*$SIZE_T; | |
33 | $UCMP ="cmpld"; | |
34 | $STU ="stdu"; | |
35 | $POP ="ld"; | |
36 | $PUSH ="std"; | |
37 | } elsif ($flavour =~ /32/) { # 32-bit ABI | |
38 | $SIZE_T =4; | |
39 | $LRSAVE =$SIZE_T; | |
40 | $UCMP ="cmplw"; | |
41 | $STU ="stwu"; | |
42 | $POP ="lwz"; | |
43 | $PUSH ="stw"; | |
44 | } else { die "nonsense $flavour"; } | |
45 | # Look for ppc-xlate.pl next to this script, then in ../../perlasm/. | |
46 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
47 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
48 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
49 | die "can't locate ppc-xlate.pl"; | |
50 | # Pipe STDOUT through $xlate; "or" (not "||") so a failed open() dies. | |
51 | open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!"; | |
52 | ||
53 | $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload | |
54 | # Register aliases interpolated into the assembly templates below. | |
55 | my $sp ="r1"; | |
56 | ||
57 | my $iotas = "r12"; | |
58 | ||
59 | ######################################################################## | |
60 | # Register layout: | |
61 | # | |
62 | # v0 A[0][0] A[1][0] | |
63 | # v1 A[0][1] A[1][1] | |
64 | # v2 A[0][2] A[1][2] | |
65 | # v3 A[0][3] A[1][3] | |
66 | # v4 A[0][4] A[1][4] | |
67 | # | |
68 | # v5 A[2][0] A[3][0] | |
69 | # v6 A[2][1] A[3][1] | |
70 | # v7 A[2][2] A[3][2] | |
71 | # v8 A[2][3] A[3][3] | |
72 | # v9 A[2][4] A[3][4] | |
73 | # | |
74 | # v10 A[4][0] A[4][1] | |
75 | # v11 A[4][2] A[4][3] | |
76 | # v12 A[4][4] A[4][4] | |
77 | # | |
78 | # v13..25 rhotates[][] | |
79 | # v26..31 volatile | |
80 | # | |
81 | $code.=<<___; | |
82 | .machine "any" | |
83 | .text | |
84 | ||
85 | .type KeccakF1600_int,\@function | |
86 | .align 5 | |
87 | KeccakF1600_int: | |
88 | li r0,24 | |
89 | mtctr r0 | |
90 | li r0,0 | |
91 | b .Loop | |
92 | ||
93 | .align 4 | |
94 | .Loop: | |
95 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta | |
96 | vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0] | |
97 | vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1] | |
98 | vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2] | |
99 | vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3] | |
100 | vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4] | |
101 | vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1] | |
102 | vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1] | |
103 | vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3] | |
104 | vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3] | |
105 | vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4] | |
106 | vxor v26,v26,v31 ; C[0..1] | |
107 | vxor v27,v27,v28 ; C[2..3] | |
108 | vxor v28,v29,v30 ; C[4..4] | |
109 | vspltisb v31,1 | |
110 | vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1] | |
111 | vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3] | |
112 | vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low! | |
113 | ||
114 | vrld v29,v26,v31 ; ROL64(C[0..1],1) | |
115 | vrld v30,v27,v31 ; ROL64(C[2..3],1) | |
116 | vrld v31,v28,v31 ; ROL64(C[4..4],1) | |
117 | vpermdi v31,v31,v29,0b10 | |
118 | vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1) | |
119 | vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1) | |
120 | vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low! | |
121 | ||
122 | vpermdi v29,v26,v26,0b00 ; C[0..0] | |
123 | vpermdi v30,v28,v26,0b10 ; C[4..0] | |
124 | vpermdi v31,v28,v28,0b11 ; C[4..4] | |
125 | vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0] | |
126 | vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0] | |
127 | vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0] | |
128 | vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4] | |
129 | vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4] | |
130 | ||
131 | vpermdi v29,v27,v27,0b00 ; C[2..2] | |
132 | vpermdi v30,v26,v26,0b11 ; C[1..1] | |
133 | vpermdi v31,v26,v27,0b10 ; C[1..2] | |
134 | vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2] | |
135 | vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2] | |
136 | vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1] | |
137 | vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1] | |
138 | vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2] | |
139 | ||
140 | vpermdi v29,v27,v27,0b11 ; C[3..3] | |
141 | vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3] | |
142 | vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3] | |
143 | vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3] | |
144 | ||
145 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho | |
146 | vrld v26,v0, v13 ; v0 | |
147 | vrld v1, v1, v14 | |
148 | vrld v27,v2, v15 ; v2 | |
149 | vrld v28,v3, v16 ; v3 | |
150 | vrld v4, v4, v17 | |
151 | vrld v5, v5, v18 | |
152 | vrld v6, v6, v19 | |
153 | vrld v29,v7, v20 ; v7 | |
154 | vrld v8, v8, v21 | |
155 | vrld v9, v9, v22 | |
156 | vrld v10,v10,v23 | |
157 | vrld v30,v11,v24 ; v11 | |
158 | vrld v12,v12,v25 | |
159 | ||
160 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi | |
161 | vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3] | |
162 | vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0] | |
163 | vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0] | |
164 | vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4] | |
165 | vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4] | |
166 | vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1] | |
167 | vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2] | |
168 | vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1] | |
169 | vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0] | |
170 | vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2] | |
171 | vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1] | |
172 | vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3] | |
173 | vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3] | |
174 | ||
175 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota | |
176 | lvx_u v31,$iotas,r0 ; iotas[index] | |
177 | addic r0,r0,16 ; index++ | |
178 | ||
179 | vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2]) | |
180 | vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3]) | |
181 | vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4]) | |
182 | vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0]) | |
183 | vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1]) | |
184 | vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2]) | |
185 | vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3]) | |
186 | vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4]) | |
187 | vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0]) | |
188 | vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1]) | |
189 | ||
190 | vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2]) | |
191 | vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3]) | |
192 | vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4]) | |
193 | vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0]) | |
194 | vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1]) | |
195 | vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2]) | |
196 | vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3]) | |
197 | vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4]) | |
198 | vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0]) | |
199 | vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1]) | |
200 | ||
201 | vxor v0, v0, v31 ; A[0][0] ^= iotas[index++] | |
202 | ||
203 | vpermdi v26,v10,v11,0b10 ; A[4][1..2] | |
204 | vpermdi v27,v12,v10,0b00 ; A[4][4..0] | |
205 | vpermdi v28,v11,v12,0b10 ; A[4][3..4] | |
206 | vpermdi v29,v10,v10,0b10 ; A[4][1..0] | |
207 | vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3]) | |
208 | vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0]) | |
209 | vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1]) | |
210 | vxor v10,v10,v26 ; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3]) | |
211 | vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0]) | |
212 | vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0]) | |
213 | ||
214 | bdnz .Loop | |
215 | ||
216 | vpermdi v12,v12,v12,0b11 ; broadcast A[4][4] | |
217 | blr | |
218 | .long 0 | |
219 | .byte 0,12,0x14,0,0,0,0,0 | |
220 | .size KeccakF1600_int,.-KeccakF1600_int | |
221 | ||
222 | .type KeccakF1600,\@function | |
223 | .align 5 | |
224 | KeccakF1600: | |
225 | $STU $sp,-$FRAME($sp) | |
226 | li r10,`15+6*$SIZE_T` | |
227 | li r11,`31+6*$SIZE_T` | |
228 | mflr r8 | |
229 | mfspr r7, 256 ; save vrsave | |
230 | stvx v20,r10,$sp | |
231 | addi r10,r10,32 | |
232 | stvx v21,r11,$sp | |
233 | addi r11,r11,32 | |
234 | stvx v22,r10,$sp | |
235 | addi r10,r10,32 | |
236 | stvx v23,r11,$sp | |
237 | addi r11,r11,32 | |
238 | stvx v24,r10,$sp | |
239 | addi r10,r10,32 | |
240 | stvx v25,r11,$sp | |
241 | addi r11,r11,32 | |
242 | stvx v26,r10,$sp | |
243 | addi r10,r10,32 | |
244 | stvx v27,r11,$sp | |
245 | addi r11,r11,32 | |
246 | stvx v28,r10,$sp | |
247 | addi r10,r10,32 | |
248 | stvx v29,r11,$sp | |
249 | addi r11,r11,32 | |
250 | stvx v30,r10,$sp | |
251 | stvx v31,r11,$sp | |
252 | stw r7,`$FRAME-4`($sp) ; save vrsave | |
253 | li r0, -1 | |
254 | $PUSH r8,`$FRAME+$LRSAVE`($sp) | |
255 | mtspr 256, r0 ; preserve all AltiVec registers | |
256 | ||
257 | li r11,16 | |
258 | lvx_4w v0,0,r3 ; load A[5][5] | |
259 | li r10,32 | |
260 | lvx_4w v1,r11,r3 | |
261 | addi r11,r11,32 | |
262 | lvx_4w v2,r10,r3 | |
263 | addi r10,r10,32 | |
264 | lvx_4w v3,r11,r3 | |
265 | addi r11,r11,32 | |
266 | lvx_4w v4,r10,r3 | |
267 | addi r10,r10,32 | |
268 | lvx_4w v5,r11,r3 | |
269 | addi r11,r11,32 | |
270 | lvx_4w v6,r10,r3 | |
271 | addi r10,r10,32 | |
272 | lvx_4w v7,r11,r3 | |
273 | addi r11,r11,32 | |
274 | lvx_4w v8,r10,r3 | |
275 | addi r10,r10,32 | |
276 | lvx_4w v9,r11,r3 | |
277 | addi r11,r11,32 | |
278 | lvx_4w v10,r10,r3 | |
279 | addi r10,r10,32 | |
280 | lvx_4w v11,r11,r3 | |
281 | lvx_splt v12,r10,r3 | |
282 | ||
283 | bl PICmeup | |
284 | ||
285 | li r11,16 | |
286 | lvx_u v13,0,r12 ; load rhotates | |
287 | li r10,32 | |
288 | lvx_u v14,r11,r12 | |
289 | addi r11,r11,32 | |
290 | lvx_u v15,r10,r12 | |
291 | addi r10,r10,32 | |
292 | lvx_u v16,r11,r12 | |
293 | addi r11,r11,32 | |
294 | lvx_u v17,r10,r12 | |
295 | addi r10,r10,32 | |
296 | lvx_u v18,r11,r12 | |
297 | addi r11,r11,32 | |
298 | lvx_u v19,r10,r12 | |
299 | addi r10,r10,32 | |
300 | lvx_u v20,r11,r12 | |
301 | addi r11,r11,32 | |
302 | lvx_u v21,r10,r12 | |
303 | addi r10,r10,32 | |
304 | lvx_u v22,r11,r12 | |
305 | addi r11,r11,32 | |
306 | lvx_u v23,r10,r12 | |
307 | addi r10,r10,32 | |
308 | lvx_u v24,r11,r12 | |
309 | lvx_u v25,r10,r12 | |
310 | addi r12,r12,`16*16` ; points at iotas | |
311 | ||
312 | bl KeccakF1600_int | |
313 | ||
314 | li r11,16 | |
315 | stvx_4w v0,0,r3 ; return A[5][5] | |
316 | li r10,32 | |
317 | stvx_4w v1,r11,r3 | |
318 | addi r11,r11,32 | |
319 | stvx_4w v2,r10,r3 | |
320 | addi r10,r10,32 | |
321 | stvx_4w v3,r11,r3 | |
322 | addi r11,r11,32 | |
323 | stvx_4w v4,r10,r3 | |
324 | addi r10,r10,32 | |
325 | stvx_4w v5,r11,r3 | |
326 | addi r11,r11,32 | |
327 | stvx_4w v6,r10,r3 | |
328 | addi r10,r10,32 | |
329 | stvx_4w v7,r11,r3 | |
330 | addi r11,r11,32 | |
331 | stvx_4w v8,r10,r3 | |
332 | addi r10,r10,32 | |
333 | stvx_4w v9,r11,r3 | |
334 | addi r11,r11,32 | |
335 | stvx_4w v10,r10,r3 | |
336 | addi r10,r10,32 | |
337 | stvx_4w v11,r11,r3 | |
338 | stvdx_u v12,r10,r3 | |
339 | ||
340 | li r10,`15+6*$SIZE_T` | |
341 | li r11,`31+6*$SIZE_T` | |
342 | mtlr r8 | |
343 | mtspr 256, r7 ; restore vrsave | |
344 | lvx v20,r10,$sp | |
345 | addi r10,r10,32 | |
346 | lvx v21,r11,$sp | |
347 | addi r11,r11,32 | |
348 | lvx v22,r10,$sp | |
349 | addi r10,r10,32 | |
350 | lvx v23,r11,$sp | |
351 | addi r11,r11,32 | |
352 | lvx v24,r10,$sp | |
353 | addi r10,r10,32 | |
354 | lvx v25,r11,$sp | |
355 | addi r11,r11,32 | |
356 | lvx v26,r10,$sp | |
357 | addi r10,r10,32 | |
358 | lvx v27,r11,$sp | |
359 | addi r11,r11,32 | |
360 | lvx v28,r10,$sp | |
361 | addi r10,r10,32 | |
362 | lvx v29,r11,$sp | |
363 | addi r11,r11,32 | |
364 | lvx v30,r10,$sp | |
365 | lvx v31,r11,$sp | |
366 | addi $sp,$sp,$FRAME | |
367 | blr | |
368 | .long 0 | |
369 | .byte 0,12,0x04,1,0x80,0,1,0 | |
370 | .long 0 | |
371 | .size KeccakF1600,.-KeccakF1600 | |
372 | ___ | |
373 | { | |
374 | my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6)); | |
375 | ||
376 | $code.=<<___; | |
377 | .globl SHA3_absorb | |
378 | .type SHA3_absorb,\@function | |
379 | .align 5 | |
380 | SHA3_absorb: | |
381 | $STU $sp,-$FRAME($sp) | |
382 | li r10,`15+6*$SIZE_T` | |
383 | li r11,`31+6*$SIZE_T` | |
384 | mflr r8 | |
385 | mfspr r7, 256 ; save vrsave | |
386 | stvx v20,r10,$sp | |
387 | addi r10,r10,32 | |
388 | stvx v21,r11,$sp | |
389 | addi r11,r11,32 | |
390 | stvx v22,r10,$sp | |
391 | addi r10,r10,32 | |
392 | stvx v23,r11,$sp | |
393 | addi r11,r11,32 | |
394 | stvx v24,r10,$sp | |
395 | addi r10,r10,32 | |
396 | stvx v25,r11,$sp | |
397 | addi r11,r11,32 | |
398 | stvx v26,r10,$sp | |
399 | addi r10,r10,32 | |
400 | stvx v27,r11,$sp | |
401 | addi r11,r11,32 | |
402 | stvx v28,r10,$sp | |
403 | addi r10,r10,32 | |
404 | stvx v29,r11,$sp | |
405 | addi r11,r11,32 | |
406 | stvx v30,r10,$sp | |
407 | stvx v31,r11,$sp | |
408 | stw r7,`$FRAME-4`($sp) ; save vrsave | |
409 | li r0, -1 | |
410 | $PUSH r8,`$FRAME+$LRSAVE`($sp) | |
411 | mtspr 256, r0 ; preserve all AltiVec registers | |
412 | ||
413 | li r11,16 | |
414 | lvx_4w v0,0,$A_jagged ; load A[5][5] | |
415 | li r10,32 | |
416 | lvx_4w v1,r11,$A_jagged | |
417 | addi r11,r11,32 | |
418 | lvx_4w v2,r10,$A_jagged | |
419 | addi r10,r10,32 | |
420 | lvx_4w v3,r11,$A_jagged | |
421 | addi r11,r11,32 | |
422 | lvx_4w v4,r10,$A_jagged | |
423 | addi r10,r10,32 | |
424 | lvx_4w v5,r11,$A_jagged | |
425 | addi r11,r11,32 | |
426 | lvx_4w v6,r10,$A_jagged | |
427 | addi r10,r10,32 | |
428 | lvx_4w v7,r11,$A_jagged | |
429 | addi r11,r11,32 | |
430 | lvx_4w v8,r10,$A_jagged | |
431 | addi r10,r10,32 | |
432 | lvx_4w v9,r11,$A_jagged | |
433 | addi r11,r11,32 | |
434 | lvx_4w v10,r10,$A_jagged | |
435 | addi r10,r10,32 | |
436 | lvx_4w v11,r11,$A_jagged | |
437 | lvx_splt v12,r10,$A_jagged | |
438 | ||
439 | bl PICmeup | |
440 | ||
441 | li r11,16 | |
442 | lvx_u v13,0,r12 ; load rhotates | |
443 | li r10,32 | |
444 | lvx_u v14,r11,r12 | |
445 | addi r11,r11,32 | |
446 | lvx_u v15,r10,r12 | |
447 | addi r10,r10,32 | |
448 | lvx_u v16,r11,r12 | |
449 | addi r11,r11,32 | |
450 | lvx_u v17,r10,r12 | |
451 | addi r10,r10,32 | |
452 | lvx_u v18,r11,r12 | |
453 | addi r11,r11,32 | |
454 | lvx_u v19,r10,r12 | |
455 | addi r10,r10,32 | |
456 | lvx_u v20,r11,r12 | |
457 | addi r11,r11,32 | |
458 | lvx_u v21,r10,r12 | |
459 | addi r10,r10,32 | |
460 | lvx_u v22,r11,r12 | |
461 | addi r11,r11,32 | |
462 | lvx_u v23,r10,r12 | |
463 | addi r10,r10,32 | |
464 | lvx_u v24,r11,r12 | |
465 | lvx_u v25,r10,r12 | |
466 | li r10,-32 | |
467 | li r11,-16 | |
468 | addi r12,r12,`16*16` ; points at iotas | |
469 | b .Loop_absorb | |
470 | ||
471 | .align 4 | |
472 | .Loop_absorb: | |
473 | $UCMP $len,$bsz ; len < bsz? | |
474 | blt .Labsorbed | |
475 | ||
476 | sub $len,$len,$bsz ; len -= bsz | |
477 | srwi r0,$bsz,3 | |
478 | mtctr r0 | |
479 | ||
480 | lvx_u v30,r10,r12 ; permutation masks | |
481 | lvx_u v31,r11,r12 | |
482 | ?vspltisb v27,7 ; prepare masks for byte swap | |
483 | ?vxor v30,v30,v27 ; on big-endian | |
484 | ?vxor v31,v31,v27 | |
485 | ||
486 | vxor v27,v27,v27 ; zero | |
487 | lvdx_u v26,0,$inp | |
488 | addi $inp,$inp,8 | |
489 | vperm v26,v26,v27,v30 | |
490 | vxor v0, v0, v26 | |
491 | bdz .Lprocess_block | |
492 | lvdx_u v26,0,$inp | |
493 | addi $inp,$inp,8 | |
494 | vperm v26,v26,v27,v30 | |
495 | vxor v1, v1, v26 | |
496 | bdz .Lprocess_block | |
497 | lvdx_u v26,0,$inp | |
498 | addi $inp,$inp,8 | |
499 | vperm v26,v26,v27,v30 | |
500 | vxor v2, v2, v26 | |
501 | bdz .Lprocess_block | |
502 | lvdx_u v26,0,$inp | |
503 | addi $inp,$inp,8 | |
504 | vperm v26,v26,v27,v30 | |
505 | vxor v3, v3, v26 | |
506 | bdz .Lprocess_block | |
507 | lvdx_u v26,0,$inp | |
508 | addi $inp,$inp,8 | |
509 | vperm v26,v26,v27,v30 | |
510 | vxor v4, v4, v26 | |
511 | bdz .Lprocess_block | |
512 | lvdx_u v26,0,$inp | |
513 | addi $inp,$inp,8 | |
514 | vperm v26,v26,v27,v31 | |
515 | vxor v0, v0, v26 | |
516 | bdz .Lprocess_block | |
517 | lvdx_u v26,0,$inp | |
518 | addi $inp,$inp,8 | |
519 | vperm v26,v26,v27,v31 | |
520 | vxor v1, v1, v26 | |
521 | bdz .Lprocess_block | |
522 | lvdx_u v26,0,$inp | |
523 | addi $inp,$inp,8 | |
524 | vperm v26,v26,v27,v31 | |
525 | vxor v2, v2, v26 | |
526 | bdz .Lprocess_block | |
527 | lvdx_u v26,0,$inp | |
528 | addi $inp,$inp,8 | |
529 | vperm v26,v26,v27,v31 | |
530 | vxor v3, v3, v26 | |
531 | bdz .Lprocess_block | |
532 | lvdx_u v26,0,$inp | |
533 | addi $inp,$inp,8 | |
534 | vperm v26,v26,v27,v31 | |
535 | vxor v4, v4, v26 | |
536 | bdz .Lprocess_block | |
537 | lvdx_u v26,0,$inp | |
538 | addi $inp,$inp,8 | |
539 | vperm v26,v26,v27,v30 | |
540 | vxor v5, v5, v26 | |
541 | bdz .Lprocess_block | |
542 | lvdx_u v26,0,$inp | |
543 | addi $inp,$inp,8 | |
544 | vperm v26,v26,v27,v30 | |
545 | vxor v6, v6, v26 | |
546 | bdz .Lprocess_block | |
547 | lvdx_u v26,0,$inp | |
548 | addi $inp,$inp,8 | |
549 | vperm v26,v26,v27,v30 | |
550 | vxor v7, v7, v26 | |
551 | bdz .Lprocess_block | |
552 | lvdx_u v26,0,$inp | |
553 | addi $inp,$inp,8 | |
554 | vperm v26,v26,v27,v30 | |
555 | vxor v8, v8, v26 | |
556 | bdz .Lprocess_block | |
557 | lvdx_u v26,0,$inp | |
558 | addi $inp,$inp,8 | |
559 | vperm v26,v26,v27,v30 | |
560 | vxor v9, v9, v26 | |
561 | bdz .Lprocess_block | |
562 | lvdx_u v26,0,$inp | |
563 | addi $inp,$inp,8 | |
564 | vperm v26,v26,v27,v31 | |
565 | vxor v5, v5, v26 | |
566 | bdz .Lprocess_block | |
567 | lvdx_u v26,0,$inp | |
568 | addi $inp,$inp,8 | |
569 | vperm v26,v26,v27,v31 | |
570 | vxor v6, v6, v26 | |
571 | bdz .Lprocess_block | |
572 | lvdx_u v26,0,$inp | |
573 | addi $inp,$inp,8 | |
574 | vperm v26,v26,v27,v31 | |
575 | vxor v7, v7, v26 | |
576 | bdz .Lprocess_block | |
577 | lvdx_u v26,0,$inp | |
578 | addi $inp,$inp,8 | |
579 | vperm v26,v26,v27,v31 | |
580 | vxor v8, v8, v26 | |
581 | bdz .Lprocess_block | |
582 | lvdx_u v26,0,$inp | |
583 | addi $inp,$inp,8 | |
584 | vperm v26,v26,v27,v31 | |
585 | vxor v9, v9, v26 | |
586 | bdz .Lprocess_block | |
587 | lvdx_u v26,0,$inp | |
588 | addi $inp,$inp,8 | |
589 | vperm v26,v26,v27,v30 | |
590 | vxor v10, v10, v26 | |
591 | bdz .Lprocess_block | |
592 | lvdx_u v26,0,$inp | |
593 | addi $inp,$inp,8 | |
594 | vperm v26,v26,v27,v31 | |
595 | vxor v10, v10, v26 | |
596 | bdz .Lprocess_block | |
597 | lvdx_u v26,0,$inp | |
598 | addi $inp,$inp,8 | |
599 | vperm v26,v26,v27,v30 | |
600 | vxor v11, v11, v26 | |
601 | bdz .Lprocess_block | |
602 | lvdx_u v26,0,$inp | |
603 | addi $inp,$inp,8 | |
604 | vperm v26,v26,v27,v31 | |
605 | vxor v11, v11, v26 | |
606 | bdz .Lprocess_block | |
607 | lvdx_u v26,0,$inp | |
608 | addi $inp,$inp,8 | |
609 | vperm v26,v26,v27,v31 | |
610 | vxor v12, v12, v26 | |
611 | ||
612 | .Lprocess_block: | |
613 | bl KeccakF1600_int | |
614 | ||
615 | b .Loop_absorb | |
616 | ||
617 | .align 4 | |
618 | .Labsorbed: | |
619 | li r11,16 | |
620 | stvx_4w v0,0,$A_jagged ; return A[5][5] | |
621 | li r10,32 | |
622 | stvx_4w v1,r11,$A_jagged | |
623 | addi r11,r11,32 | |
624 | stvx_4w v2,r10,$A_jagged | |
625 | addi r10,r10,32 | |
626 | stvx_4w v3,r11,$A_jagged | |
627 | addi r11,r11,32 | |
628 | stvx_4w v4,r10,$A_jagged | |
629 | addi r10,r10,32 | |
630 | stvx_4w v5,r11,$A_jagged | |
631 | addi r11,r11,32 | |
632 | stvx_4w v6,r10,$A_jagged | |
633 | addi r10,r10,32 | |
634 | stvx_4w v7,r11,$A_jagged | |
635 | addi r11,r11,32 | |
636 | stvx_4w v8,r10,$A_jagged | |
637 | addi r10,r10,32 | |
638 | stvx_4w v9,r11,$A_jagged | |
639 | addi r11,r11,32 | |
640 | stvx_4w v10,r10,$A_jagged | |
641 | addi r10,r10,32 | |
642 | stvx_4w v11,r11,$A_jagged | |
643 | stvdx_u v12,r10,$A_jagged | |
644 | ||
645 | mr r3,$len ; return value | |
646 | li r10,`15+6*$SIZE_T` | |
647 | li r11,`31+6*$SIZE_T` | |
648 | mtlr r8 | |
649 | mtspr 256, r7 ; restore vrsave | |
650 | lvx v20,r10,$sp | |
651 | addi r10,r10,32 | |
652 | lvx v21,r11,$sp | |
653 | addi r11,r11,32 | |
654 | lvx v22,r10,$sp | |
655 | addi r10,r10,32 | |
656 | lvx v23,r11,$sp | |
657 | addi r11,r11,32 | |
658 | lvx v24,r10,$sp | |
659 | addi r10,r10,32 | |
660 | lvx v25,r11,$sp | |
661 | addi r11,r11,32 | |
662 | lvx v26,r10,$sp | |
663 | addi r10,r10,32 | |
664 | lvx v27,r11,$sp | |
665 | addi r11,r11,32 | |
666 | lvx v28,r10,$sp | |
667 | addi r10,r10,32 | |
668 | lvx v29,r11,$sp | |
669 | addi r11,r11,32 | |
670 | lvx v30,r10,$sp | |
671 | lvx v31,r11,$sp | |
672 | addi $sp,$sp,$FRAME | |
673 | blr | |
674 | .long 0 | |
675 | .byte 0,12,0x04,1,0x80,0,4,0 | |
676 | .long 0 | |
677 | .size SHA3_absorb,.-SHA3_absorb | |
678 | ___ | |
679 | } | |
680 | { | |
681 | my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6)); | |
682 | ||
683 | $code.=<<___; | |
684 | .globl SHA3_squeeze | |
685 | .type SHA3_squeeze,\@function | |
686 | .align 5 | |
687 | SHA3_squeeze: | |
688 | mflr r9 ; r9 is not touched by KeccakF1600 | |
689 | subi $out,$out,1 ; prepare for stbu | |
690 | addi r8,$A_jagged,4 ; prepare volatiles | |
691 | mr r10,$bsz | |
692 | li r11,0 | |
693 | b .Loop_squeeze | |
694 | .align 4 | |
695 | .Loop_squeeze: | |
696 | lwzx r7,r11,r8 ; lo | |
697 | lwzx r0,r11,$A_jagged ; hi | |
698 | ${UCMP}i $len,8 | |
699 | blt .Lsqueeze_tail | |
700 | ||
701 | stbu r7,1($out) ; write lo | |
702 | srwi r7,r7,8 | |
703 | stbu r7,1($out) | |
704 | srwi r7,r7,8 | |
705 | stbu r7,1($out) | |
706 | srwi r7,r7,8 | |
707 | stbu r7,1($out) | |
708 | stbu r0,1($out) ; write hi | |
709 | srwi r0,r0,8 | |
710 | stbu r0,1($out) | |
711 | srwi r0,r0,8 | |
712 | stbu r0,1($out) | |
713 | srwi r0,r0,8 | |
714 | stbu r0,1($out) | |
715 | ||
716 | subic. $len,$len,8 | |
717 | beqlr ; return if done | |
718 | ||
719 | subic. r10,r10,8 | |
720 | ble .Loutput_expand | |
721 | ||
722 | addi r11,r11,16 ; calculate jagged index | |
723 | cmplwi r11,`16*5` | |
724 | blt .Loop_squeeze | |
725 | subi r11,r11,72 | |
726 | beq .Loop_squeeze | |
727 | addi r11,r11,72 | |
728 | cmplwi r11,`16*5+8` | |
729 | subi r11,r11,8 | |
730 | beq .Loop_squeeze | |
731 | addi r11,r11,8 | |
732 | cmplwi r11,`16*10` | |
733 | subi r11,r11,72 | |
734 | beq .Loop_squeeze | |
735 | addi r11,r11,72 | |
736 | blt .Loop_squeeze | |
737 | subi r11,r11,8 | |
738 | b .Loop_squeeze | |
739 | ||
740 | .align 4 | |
741 | .Loutput_expand: | |
742 | bl KeccakF1600 | |
743 | mtlr r9 | |
744 | ||
745 | addi r8,$A_jagged,4 ; restore volatiles | |
746 | mr r10,$bsz | |
747 | li r11,0 | |
748 | b .Loop_squeeze | |
749 | ||
750 | .align 4 | |
751 | .Lsqueeze_tail: | |
752 | mtctr $len | |
753 | subic. $len,$len,4 | |
754 | ble .Loop_tail_lo | |
755 | li r8,4 | |
756 | mtctr r8 | |
757 | .Loop_tail_lo: | |
758 | stbu r7,1($out) | |
759 | srdi r7,r7,8 | |
760 | bdnz .Loop_tail_lo | |
761 | ble .Lsqueeze_done | |
762 | mtctr $len | |
763 | .Loop_tail_hi: | |
764 | stbu r0,1($out) | |
765 | srdi r0,r0,8 | |
766 | bdnz .Loop_tail_hi | |
767 | ||
768 | .Lsqueeze_done: | |
769 | blr | |
770 | .long 0 | |
771 | .byte 0,12,0x14,0,0,0,4,0 | |
772 | .long 0 | |
773 | .size SHA3_squeeze,.-SHA3_squeeze | |
774 | ___ | |
775 | } | |
776 | $code.=<<___; | |
777 | .align 6 | |
778 | PICmeup: | |
779 | mflr r0 | |
780 | bcl 20,31,\$+4 | |
781 | mflr r12 ; vvvvvv "distance" between . and 1st data entry | |
782 | addi r12,r12,`64-8` | |
783 | mtlr r0 | |
784 | blr | |
785 | .long 0 | |
786 | .byte 0,12,0x14,0,0,0,0,0 | |
787 | .space `64-9*4` | |
788 | .type rhotates,\@object | |
789 | .align 6 | |
790 | rhotates: | |
791 | .quad 0, 36 | |
792 | .quad 1, 44 | |
793 | .quad 62, 6 | |
794 | .quad 28, 55 | |
795 | .quad 27, 20 | |
796 | .quad 3, 41 | |
797 | .quad 10, 45 | |
798 | .quad 43, 15 | |
799 | .quad 25, 21 | |
800 | .quad 39, 8 | |
801 | .quad 18, 2 | |
802 | .quad 61, 56 | |
803 | .quad 14, 14 | |
804 | .size rhotates,.-rhotates | |
805 | .quad 0,0 | |
806 | .quad 0x0001020304050607,0x1011121314151617 | |
807 | .quad 0x1011121314151617,0x0001020304050607 | |
808 | .type iotas,\@object | |
809 | iotas: | |
810 | .quad 0x0000000000000001,0 | |
811 | .quad 0x0000000000008082,0 | |
812 | .quad 0x800000000000808a,0 | |
813 | .quad 0x8000000080008000,0 | |
814 | .quad 0x000000000000808b,0 | |
815 | .quad 0x0000000080000001,0 | |
816 | .quad 0x8000000080008081,0 | |
817 | .quad 0x8000000000008009,0 | |
818 | .quad 0x000000000000008a,0 | |
819 | .quad 0x0000000000000088,0 | |
820 | .quad 0x0000000080008009,0 | |
821 | .quad 0x000000008000000a,0 | |
822 | .quad 0x000000008000808b,0 | |
823 | .quad 0x800000000000008b,0 | |
824 | .quad 0x8000000000008089,0 | |
825 | .quad 0x8000000000008003,0 | |
826 | .quad 0x8000000000008002,0 | |
827 | .quad 0x8000000000000080,0 | |
828 | .quad 0x000000000000800a,0 | |
829 | .quad 0x800000008000000a,0 | |
830 | .quad 0x8000000080008081,0 | |
831 | .quad 0x8000000000008080,0 | |
832 | .quad 0x0000000080000001,0 | |
833 | .quad 0x8000000080008008,0 | |
834 | .size iotas,.-iotas | |
835 | .asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" | |
836 | ___ | |
837 | ||
838 | foreach (split("\n",$code)) { | |
839 | s/\`([^\`]*)\`/eval $1/ge; # evaluate `...` expressions in place | |
840 | # "?insn" is kept on big-endian and turned into a ";" comment on little-endian. | |
841 | if ($flavour =~ /le$/) { # little-endian | |
842 | s/\?([a-z]+)/;$1/; | |
843 | } else { # big-endian | |
844 | s/\?([a-z]+)/$1/; | |
845 | } | |
846 | ||
847 | print $_,"\n"; | |
848 | } | |
849 | # STDOUT is a pipe to the translator: a failed close() means it (or a write) failed. | |
850 | close STDOUT or die "error closing STDOUT: $!"; |