]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/modes/asm/ghashp8-ppc.pl
Update copyright year
[thirdparty/openssl.git] / crypto / modes / asm / ghashp8-ppc.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # GHASH for for PowerISA v2.07.
18 #
19 # July 2014
20 #
21 # Accurate performance measurements are problematic, because it's
22 # always virtualized setup with possibly throttled processor.
23 # Relative comparison is therefore more informative. This initial
24 # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
25 # faster than "4-bit" integer-only compiler-generated 64-bit code.
26 # "Initial version" means that there is room for further improvement.
27
28 # May 2016
29 #
30 # 2x aggregated reduction improves performance by 50% (resulting
31 # performance on POWER8 is 1 cycle per processed byte), and 4x
32 # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
33 # POWER9 delivers 0.51 cpb.
34
35 # $output is the last argument if it looks like a file (it has an extension)
36 # $flavour is the first argument if it doesn't look like a file
37 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
38 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
39
40 if ($flavour =~ /64/) {
41 $SIZE_T=8;
42 $LRSAVE=2*$SIZE_T;
43 $STU="stdu";
44 $POP="ld";
45 $PUSH="std";
46 $UCMP="cmpld";
47 $SHRI="srdi";
48 } elsif ($flavour =~ /32/) {
49 $SIZE_T=4;
50 $LRSAVE=$SIZE_T;
51 $STU="stwu";
52 $POP="lwz";
53 $PUSH="stw";
54 $UCMP="cmplw";
55 $SHRI="srwi";
56 } else { die "nonsense $flavour"; }
57
58 $sp="r1";
59 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
60
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
64 die "can't locate ppc-xlate.pl";
65
66 open STDOUT,"| $^X $xlate $flavour \"$output\""
67 or die "can't call $xlate: $!";
68
69 my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
70
71 my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
72 my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
73 my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
74 my $vrsave="r12";
75
76 $code=<<___;
77 .machine "any"
78
79 .text
80
81 .globl .gcm_init_p8
82 .align 5
83 .gcm_init_p8:
84 li r0,-4096
85 li r8,0x10
86 mfspr $vrsave,256
87 li r9,0x20
88 mtspr 256,r0
89 li r10,0x30
90 lvx_u $H,0,r4 # load H
91
92 vspltisb $xC2,-16 # 0xf0
93 vspltisb $t0,1 # one
94 vaddubm $xC2,$xC2,$xC2 # 0xe0
95 vxor $zero,$zero,$zero
96 vor $xC2,$xC2,$t0 # 0xe1
97 vsldoi $xC2,$xC2,$zero,15 # 0xe1...
98 vsldoi $t1,$zero,$t0,1 # ...1
99 vaddubm $xC2,$xC2,$xC2 # 0xc2...
100 vspltisb $t2,7
101 vor $xC2,$xC2,$t1 # 0xc2....01
102 vspltb $t1,$H,0 # most significant byte
103 vsl $H,$H,$t0 # H<<=1
104 vsrab $t1,$t1,$t2 # broadcast carry bit
105 vand $t1,$t1,$xC2
106 vxor $IN,$H,$t1 # twisted H
107
108 vsldoi $H,$IN,$IN,8 # twist even more ...
109 vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
110 vsldoi $Hl,$zero,$H,8 # ... and split
111 vsldoi $Hh,$H,$zero,8
112
113 stvx_u $xC2,0,r3 # save pre-computed table
114 stvx_u $Hl,r8,r3
115 li r8,0x40
116 stvx_u $H, r9,r3
117 li r9,0x50
118 stvx_u $Hh,r10,r3
119 li r10,0x60
120
121 vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
122 vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
123 vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
124
125 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
126
127 vsldoi $t0,$Xm,$zero,8
128 vsldoi $t1,$zero,$Xm,8
129 vxor $Xl,$Xl,$t0
130 vxor $Xh,$Xh,$t1
131
132 vsldoi $Xl,$Xl,$Xl,8
133 vxor $Xl,$Xl,$t2
134
135 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
136 vpmsumd $Xl,$Xl,$xC2
137 vxor $t1,$t1,$Xh
138 vxor $IN1,$Xl,$t1
139
140 vsldoi $H2,$IN1,$IN1,8
141 vsldoi $H2l,$zero,$H2,8
142 vsldoi $H2h,$H2,$zero,8
143
144 stvx_u $H2l,r8,r3 # save H^2
145 li r8,0x70
146 stvx_u $H2,r9,r3
147 li r9,0x80
148 stvx_u $H2h,r10,r3
149 li r10,0x90
150 ___
151 {
152 my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
153 $code.=<<___;
154 vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
155 vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
156 vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
157 vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
158 vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
159 vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
160
161 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
162 vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
163
164 vsldoi $t0,$Xm,$zero,8
165 vsldoi $t1,$zero,$Xm,8
166 vsldoi $t4,$Xm1,$zero,8
167 vsldoi $t5,$zero,$Xm1,8
168 vxor $Xl,$Xl,$t0
169 vxor $Xh,$Xh,$t1
170 vxor $Xl1,$Xl1,$t4
171 vxor $Xh1,$Xh1,$t5
172
173 vsldoi $Xl,$Xl,$Xl,8
174 vsldoi $Xl1,$Xl1,$Xl1,8
175 vxor $Xl,$Xl,$t2
176 vxor $Xl1,$Xl1,$t6
177
178 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
179 vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
180 vpmsumd $Xl,$Xl,$xC2
181 vpmsumd $Xl1,$Xl1,$xC2
182 vxor $t1,$t1,$Xh
183 vxor $t5,$t5,$Xh1
184 vxor $Xl,$Xl,$t1
185 vxor $Xl1,$Xl1,$t5
186
187 vsldoi $H,$Xl,$Xl,8
188 vsldoi $H2,$Xl1,$Xl1,8
189 vsldoi $Hl,$zero,$H,8
190 vsldoi $Hh,$H,$zero,8
191 vsldoi $H2l,$zero,$H2,8
192 vsldoi $H2h,$H2,$zero,8
193
194 stvx_u $Hl,r8,r3 # save H^3
195 li r8,0xa0
196 stvx_u $H,r9,r3
197 li r9,0xb0
198 stvx_u $Hh,r10,r3
199 li r10,0xc0
200 stvx_u $H2l,r8,r3 # save H^4
201 stvx_u $H2,r9,r3
202 stvx_u $H2h,r10,r3
203
204 mtspr 256,$vrsave
205 blr
206 .long 0
207 .byte 0,12,0x14,0,0,0,2,0
208 .long 0
209 .size .gcm_init_p8,.-.gcm_init_p8
210 ___
211 }
212 $code.=<<___;
213 .globl .gcm_gmult_p8
214 .align 5
215 .gcm_gmult_p8:
216 lis r0,0xfff8
217 li r8,0x10
218 mfspr $vrsave,256
219 li r9,0x20
220 mtspr 256,r0
221 li r10,0x30
222 lvx_u $IN,0,$Xip # load Xi
223
224 lvx_u $Hl,r8,$Htbl # load pre-computed table
225 le?lvsl $lemask,r0,r0
226 lvx_u $H, r9,$Htbl
227 le?vspltisb $t0,0x07
228 lvx_u $Hh,r10,$Htbl
229 le?vxor $lemask,$lemask,$t0
230 lvx_u $xC2,0,$Htbl
231 le?vperm $IN,$IN,$IN,$lemask
232 vxor $zero,$zero,$zero
233
234 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
235 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
236 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
237
238 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
239
240 vsldoi $t0,$Xm,$zero,8
241 vsldoi $t1,$zero,$Xm,8
242 vxor $Xl,$Xl,$t0
243 vxor $Xh,$Xh,$t1
244
245 vsldoi $Xl,$Xl,$Xl,8
246 vxor $Xl,$Xl,$t2
247
248 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
249 vpmsumd $Xl,$Xl,$xC2
250 vxor $t1,$t1,$Xh
251 vxor $Xl,$Xl,$t1
252
253 le?vperm $Xl,$Xl,$Xl,$lemask
254 stvx_u $Xl,0,$Xip # write out Xi
255
256 mtspr 256,$vrsave
257 blr
258 .long 0
259 .byte 0,12,0x14,0,0,0,2,0
260 .long 0
261 .size .gcm_gmult_p8,.-.gcm_gmult_p8
262
263 .globl .gcm_ghash_p8
264 .align 5
265 .gcm_ghash_p8:
266 li r0,-4096
267 li r8,0x10
268 mfspr $vrsave,256
269 li r9,0x20
270 mtspr 256,r0
271 li r10,0x30
272 lvx_u $Xl,0,$Xip # load Xi
273
274 lvx_u $Hl,r8,$Htbl # load pre-computed table
275 li r8,0x40
276 le?lvsl $lemask,r0,r0
277 lvx_u $H, r9,$Htbl
278 li r9,0x50
279 le?vspltisb $t0,0x07
280 lvx_u $Hh,r10,$Htbl
281 li r10,0x60
282 le?vxor $lemask,$lemask,$t0
283 lvx_u $xC2,0,$Htbl
284 le?vperm $Xl,$Xl,$Xl,$lemask
285 vxor $zero,$zero,$zero
286
287 ${UCMP}i $len,64
288 bge Lgcm_ghash_p8_4x
289
290 lvx_u $IN,0,$inp
291 addi $inp,$inp,16
292 subic. $len,$len,16
293 le?vperm $IN,$IN,$IN,$lemask
294 vxor $IN,$IN,$Xl
295 beq Lshort
296
297 lvx_u $H2l,r8,$Htbl # load H^2
298 li r8,16
299 lvx_u $H2, r9,$Htbl
300 add r9,$inp,$len # end of input
301 lvx_u $H2h,r10,$Htbl
302 be?b Loop_2x
303
304 .align 5
305 Loop_2x:
306 lvx_u $IN1,0,$inp
307 le?vperm $IN1,$IN1,$IN1,$lemask
308
309 subic $len,$len,32
310 vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
311 vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
312 subfe r0,r0,r0 # borrow?-1:0
313 vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
314 vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
315 and r0,r0,$len
316 vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
317 vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
318 add $inp,$inp,r0
319
320 vxor $Xl,$Xl,$Xl1
321 vxor $Xm,$Xm,$Xm1
322
323 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
324
325 vsldoi $t0,$Xm,$zero,8
326 vsldoi $t1,$zero,$Xm,8
327 vxor $Xh,$Xh,$Xh1
328 vxor $Xl,$Xl,$t0
329 vxor $Xh,$Xh,$t1
330
331 vsldoi $Xl,$Xl,$Xl,8
332 vxor $Xl,$Xl,$t2
333 lvx_u $IN,r8,$inp
334 addi $inp,$inp,32
335
336 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
337 vpmsumd $Xl,$Xl,$xC2
338 le?vperm $IN,$IN,$IN,$lemask
339 vxor $t1,$t1,$Xh
340 vxor $IN,$IN,$t1
341 vxor $IN,$IN,$Xl
342 $UCMP r9,$inp
343 bgt Loop_2x # done yet?
344
345 cmplwi $len,0
346 bne Leven
347
348 Lshort:
349 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
350 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
351 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
352
353 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
354
355 vsldoi $t0,$Xm,$zero,8
356 vsldoi $t1,$zero,$Xm,8
357 vxor $Xl,$Xl,$t0
358 vxor $Xh,$Xh,$t1
359
360 vsldoi $Xl,$Xl,$Xl,8
361 vxor $Xl,$Xl,$t2
362
363 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
364 vpmsumd $Xl,$Xl,$xC2
365 vxor $t1,$t1,$Xh
366
367 Leven:
368 vxor $Xl,$Xl,$t1
369 le?vperm $Xl,$Xl,$Xl,$lemask
370 stvx_u $Xl,0,$Xip # write out Xi
371
372 mtspr 256,$vrsave
373 blr
374 .long 0
375 .byte 0,12,0x14,0,0,0,4,0
376 .long 0
377 ___
378 {
379 my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
380 $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
381 my $IN0=$IN;
382 my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
383
384 $code.=<<___;
385 .align 5
386 .gcm_ghash_p8_4x:
387 Lgcm_ghash_p8_4x:
388 $STU $sp,-$FRAME($sp)
389 li r10,`15+6*$SIZE_T`
390 li r11,`31+6*$SIZE_T`
391 stvx v20,r10,$sp
392 addi r10,r10,32
393 stvx v21,r11,$sp
394 addi r11,r11,32
395 stvx v22,r10,$sp
396 addi r10,r10,32
397 stvx v23,r11,$sp
398 addi r11,r11,32
399 stvx v24,r10,$sp
400 addi r10,r10,32
401 stvx v25,r11,$sp
402 addi r11,r11,32
403 stvx v26,r10,$sp
404 addi r10,r10,32
405 stvx v27,r11,$sp
406 addi r11,r11,32
407 stvx v28,r10,$sp
408 addi r10,r10,32
409 stvx v29,r11,$sp
410 addi r11,r11,32
411 stvx v30,r10,$sp
412 li r10,0x60
413 stvx v31,r11,$sp
414 li r0,-1
415 stw $vrsave,`$FRAME-4`($sp) # save vrsave
416 mtspr 256,r0 # preserve all AltiVec registers
417
418 lvsl $t0,0,r8 # 0x0001..0e0f
419 #lvx_u $H2l,r8,$Htbl # load H^2
420 li r8,0x70
421 lvx_u $H2, r9,$Htbl
422 li r9,0x80
423 vspltisb $t1,8 # 0x0808..0808
424 #lvx_u $H2h,r10,$Htbl
425 li r10,0x90
426 lvx_u $H3l,r8,$Htbl # load H^3
427 li r8,0xa0
428 lvx_u $H3, r9,$Htbl
429 li r9,0xb0
430 lvx_u $H3h,r10,$Htbl
431 li r10,0xc0
432 lvx_u $H4l,r8,$Htbl # load H^4
433 li r8,0x10
434 lvx_u $H4, r9,$Htbl
435 li r9,0x20
436 lvx_u $H4h,r10,$Htbl
437 li r10,0x30
438
439 vsldoi $t2,$zero,$t1,8 # 0x0000..0808
440 vaddubm $hiperm,$t0,$t2 # 0x0001..1617
441 vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
442
443 $SHRI $len,$len,4 # this allows to use sign bit
444 # as carry
445 lvx_u $IN0,0,$inp # load input
446 lvx_u $IN1,r8,$inp
447 subic. $len,$len,8
448 lvx_u $IN2,r9,$inp
449 lvx_u $IN3,r10,$inp
450 addi $inp,$inp,0x40
451 le?vperm $IN0,$IN0,$IN0,$lemask
452 le?vperm $IN1,$IN1,$IN1,$lemask
453 le?vperm $IN2,$IN2,$IN2,$lemask
454 le?vperm $IN3,$IN3,$IN3,$lemask
455
456 vxor $Xh,$IN0,$Xl
457
458 vpmsumd $Xl1,$IN1,$H3l
459 vpmsumd $Xm1,$IN1,$H3
460 vpmsumd $Xh1,$IN1,$H3h
461
462 vperm $H21l,$H2,$H,$hiperm
463 vperm $t0,$IN2,$IN3,$loperm
464 vperm $H21h,$H2,$H,$loperm
465 vperm $t1,$IN2,$IN3,$hiperm
466 vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
467 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
468 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
469 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
470
471 vxor $Xm2,$Xm2,$Xm1
472 vxor $Xl3,$Xl3,$Xl1
473 vxor $Xm3,$Xm3,$Xm2
474 vxor $Xh3,$Xh3,$Xh1
475
476 blt Ltail_4x
477
478 Loop_4x:
479 lvx_u $IN0,0,$inp
480 lvx_u $IN1,r8,$inp
481 subic. $len,$len,4
482 lvx_u $IN2,r9,$inp
483 lvx_u $IN3,r10,$inp
484 addi $inp,$inp,0x40
485 le?vperm $IN1,$IN1,$IN1,$lemask
486 le?vperm $IN2,$IN2,$IN2,$lemask
487 le?vperm $IN3,$IN3,$IN3,$lemask
488 le?vperm $IN0,$IN0,$IN0,$lemask
489
490 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
491 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
492 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
493 vpmsumd $Xl1,$IN1,$H3l
494 vpmsumd $Xm1,$IN1,$H3
495 vpmsumd $Xh1,$IN1,$H3h
496
497 vxor $Xl,$Xl,$Xl3
498 vxor $Xm,$Xm,$Xm3
499 vxor $Xh,$Xh,$Xh3
500 vperm $t0,$IN2,$IN3,$loperm
501 vperm $t1,$IN2,$IN3,$hiperm
502
503 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
504 vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
505 vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
506
507 vsldoi $t0,$Xm,$zero,8
508 vsldoi $t1,$zero,$Xm,8
509 vxor $Xl,$Xl,$t0
510 vxor $Xh,$Xh,$t1
511
512 vsldoi $Xl,$Xl,$Xl,8
513 vxor $Xl,$Xl,$t2
514
515 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
516 vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
517 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
518 vpmsumd $Xl,$Xl,$xC2
519
520 vxor $Xl3,$Xl3,$Xl1
521 vxor $Xh3,$Xh3,$Xh1
522 vxor $Xh,$Xh,$IN0
523 vxor $Xm2,$Xm2,$Xm1
524 vxor $Xh,$Xh,$t1
525 vxor $Xm3,$Xm3,$Xm2
526 vxor $Xh,$Xh,$Xl
527 bge Loop_4x
528
529 Ltail_4x:
530 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
531 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
532 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
533
534 vxor $Xl,$Xl,$Xl3
535 vxor $Xm,$Xm,$Xm3
536
537 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
538
539 vsldoi $t0,$Xm,$zero,8
540 vsldoi $t1,$zero,$Xm,8
541 vxor $Xh,$Xh,$Xh3
542 vxor $Xl,$Xl,$t0
543 vxor $Xh,$Xh,$t1
544
545 vsldoi $Xl,$Xl,$Xl,8
546 vxor $Xl,$Xl,$t2
547
548 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
549 vpmsumd $Xl,$Xl,$xC2
550 vxor $t1,$t1,$Xh
551 vxor $Xl,$Xl,$t1
552
553 addic. $len,$len,4
554 beq Ldone_4x
555
556 lvx_u $IN0,0,$inp
557 ${UCMP}i $len,2
558 li $len,-4
559 blt Lone
560 lvx_u $IN1,r8,$inp
561 beq Ltwo
562
563 Lthree:
564 lvx_u $IN2,r9,$inp
565 le?vperm $IN0,$IN0,$IN0,$lemask
566 le?vperm $IN1,$IN1,$IN1,$lemask
567 le?vperm $IN2,$IN2,$IN2,$lemask
568
569 vxor $Xh,$IN0,$Xl
570 vmr $H4l,$H3l
571 vmr $H4, $H3
572 vmr $H4h,$H3h
573
574 vperm $t0,$IN1,$IN2,$loperm
575 vperm $t1,$IN1,$IN2,$hiperm
576 vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
577 vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
578 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
579 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
580
581 vxor $Xm3,$Xm3,$Xm2
582 b Ltail_4x
583
584 .align 4
585 Ltwo:
586 le?vperm $IN0,$IN0,$IN0,$lemask
587 le?vperm $IN1,$IN1,$IN1,$lemask
588
589 vxor $Xh,$IN0,$Xl
590 vperm $t0,$zero,$IN1,$loperm
591 vperm $t1,$zero,$IN1,$hiperm
592
593 vsldoi $H4l,$zero,$H2,8
594 vmr $H4, $H2
595 vsldoi $H4h,$H2,$zero,8
596
597 vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
598 vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
599 vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
600
601 b Ltail_4x
602
603 .align 4
604 Lone:
605 le?vperm $IN0,$IN0,$IN0,$lemask
606
607 vsldoi $H4l,$zero,$H,8
608 vmr $H4, $H
609 vsldoi $H4h,$H,$zero,8
610
611 vxor $Xh,$IN0,$Xl
612 vxor $Xl3,$Xl3,$Xl3
613 vxor $Xm3,$Xm3,$Xm3
614 vxor $Xh3,$Xh3,$Xh3
615
616 b Ltail_4x
617
618 Ldone_4x:
619 le?vperm $Xl,$Xl,$Xl,$lemask
620 stvx_u $Xl,0,$Xip # write out Xi
621
622 li r10,`15+6*$SIZE_T`
623 li r11,`31+6*$SIZE_T`
624 mtspr 256,$vrsave
625 lvx v20,r10,$sp
626 addi r10,r10,32
627 lvx v21,r11,$sp
628 addi r11,r11,32
629 lvx v22,r10,$sp
630 addi r10,r10,32
631 lvx v23,r11,$sp
632 addi r11,r11,32
633 lvx v24,r10,$sp
634 addi r10,r10,32
635 lvx v25,r11,$sp
636 addi r11,r11,32
637 lvx v26,r10,$sp
638 addi r10,r10,32
639 lvx v27,r11,$sp
640 addi r11,r11,32
641 lvx v28,r10,$sp
642 addi r10,r10,32
643 lvx v29,r11,$sp
644 addi r11,r11,32
645 lvx v30,r10,$sp
646 lvx v31,r11,$sp
647 addi $sp,$sp,$FRAME
648 blr
649 .long 0
650 .byte 0,12,0x04,0,0x80,0,4,0
651 .long 0
652 ___
653 }
654 $code.=<<___;
655 .size .gcm_ghash_p8,.-.gcm_ghash_p8
656
657 .asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
658 .align 2
659 ___
660
661 foreach (split("\n",$code)) {
662 s/\`([^\`]*)\`/eval $1/geo;
663
664 if ($flavour =~ /le$/o) { # little-endian
665 s/le\?//o or
666 s/be\?/#be#/o;
667 } else {
668 s/le\?/#le#/o or
669 s/be\?//o;
670 }
671 print $_,"\n";
672 }
673
674 close STDOUT or die "error closing STDOUT: $!"; # enforce flush