crypto/modes/asm/ghashp8-ppc.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # GHASH for PowerISA v2.07.
  18 #
  19 # July 2014
  20 #
  21 # Accurate performance measurements are problematic, because the setup
  22 # is always virtualized, possibly with a throttled processor.
  23 # Relative comparison is therefore more informative. This initial
  24 # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
  25 # faster than "4-bit" integer-only compiler-generated 64-bit code.
  26 # "Initial version" means that there is room for further improvement.
  27
  28 # May 2016
  29 #
  30 # 2x aggregated reduction improves performance by 50% (resulting
  31 # performance on POWER8 is 1 cycle per processed byte), and 4x
  32 # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
  33 # POWER9 delivers 0.51 cpb.
  34
  35 # Trailing argument is consumed as $output when it ends in ".<word chars>";
  36 # leading argument is then consumed as $flavour when it contains no dot.
  37 $output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
  38 $flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;
  39
  40 if ($flavour =~ /64/) {
  41         # 64-bit flavour: 8-byte size_t and the doubleword forms
  42         # of the store/load/compare/shift mnemonics.
  43         $SIZE_T = 8;
  44         $LRSAVE = 2*$SIZE_T;
  45         ($STU, $PUSH, $POP) = ("stdu", "std", "ld");
  46         $UCMP = "cmpld";
  47         $SHRI = "srdi";
  48 } elsif ($flavour =~ /32/) {
  49         # 32-bit flavour: 4-byte size_t and the word forms of the
  50         # same mnemonics.
  51         $SIZE_T = 4;
  52         $LRSAVE = $SIZE_T;
  53         ($STU, $PUSH, $POP) = ("stwu", "stw", "lwz");
  54         $UCMP = "cmplw";
  55         $SHRI = "srwi";
  56 } else { die "nonsense $flavour"; }
  57
  58 $sp="r1";
     # Frame size for the 4x path of gcm_ghash_p8: v20-v31 (12 vectors) are
     # stored from offset 15+6*$SIZE_T upwards and vrsave at $FRAME-4.
     # NOTE(review): the 13th 16-byte slot presumably covers alignment slack
     # plus the vrsave word - confirm against the ABI.
  59 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
  60
     # Locate the perlasm translator: first in the directory of this script,
     # then under ../../perlasm/ relative to it; give up if neither exists.
  61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  62 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  63 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  64 die "can't locate ppc-xlate.pl";
  65
     # From here on everything printed to STDOUT is piped through the
     # translator, which receives the flavour and the (possibly empty)
     # output file name as its arguments.
  66 open STDOUT,"| $^X $xlate $flavour \"$output\""
  67     or die "can't call $xlate: $!";
  68
  69 my ($Xip,$Htbl,$inp,$len) = qw(r3 r4 r5 r6);       # argument block
  70 # symbolic names for vector registers v0..v19
  71 my ($Xl,$Xm,$Xh,$IN) = map { "v$_" } 0..3;
  72 my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask) = map { "v$_" } 4..12;
  73 my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l) = map { "v$_" } 13..19;
  74 my $vrsave = "r12";
  75
     # gcm_init_p8, first half.  H is loaded from r4 and the table is written
     # to r3.  The code builds the 0xc2....01 constant, derives the "twisted"
     # H, then stores (byte offsets from r3):
     #   0x00           constant ($xC2)
     #   0x10/0x20/0x30 $Hl / $H / $Hh
     #   0x40/0x50/0x60 $H2l / $H2 / $H2h   (H^2)
     # The routine continues in the next heredoc, which adds H^3 and H^4.
  76 $code=<<___;
  77 .machine        "any"
  78
  79 .text
  80
  81 .globl  .gcm_init_p8
  82 .align  5
  83 .gcm_init_p8:
  84         li              r0,-4096
  85         li              r8,0x10
  86         mfspr           $vrsave,256
  87         li              r9,0x20
  88         mtspr           256,r0
  89         li              r10,0x30
  90         lvx_u           $H,0,r4                 # load H
  91
  92         vspltisb        $xC2,-16                # 0xf0
  93         vspltisb        $t0,1                   # one
  94         vaddubm         $xC2,$xC2,$xC2          # 0xe0
  95         vxor            $zero,$zero,$zero
  96         vor             $xC2,$xC2,$t0           # 0xe1
  97         vsldoi          $xC2,$xC2,$zero,15      # 0xe1...
  98         vsldoi          $t1,$zero,$t0,1         # ...1
  99         vaddubm         $xC2,$xC2,$xC2          # 0xc2...
 100         vspltisb        $t2,7
 101         vor             $xC2,$xC2,$t1           # 0xc2....01
 102         vspltb          $t1,$H,0                # most significant byte
 103         vsl             $H,$H,$t0               # H<<=1
 104         vsrab           $t1,$t1,$t2             # broadcast carry bit
 105         vand            $t1,$t1,$xC2
 106         vxor            $IN,$H,$t1              # twisted H
 107
 108         vsldoi          $H,$IN,$IN,8            # twist even more ...
 109         vsldoi          $xC2,$zero,$xC2,8       # 0xc2.0
 110         vsldoi          $Hl,$zero,$H,8          # ... and split
 111         vsldoi          $Hh,$H,$zero,8
 112
 113         stvx_u          $xC2,0,r3               # save pre-computed table
 114         stvx_u          $Hl,r8,r3
 115         li              r8,0x40
 116         stvx_u          $H, r9,r3
 117         li              r9,0x50
 118         stvx_u          $Hh,r10,r3
 119         li              r10,0x60
 120
 121         vpmsumd         $Xl,$IN,$Hl             # H.lo·H.lo
 122         vpmsumd         $Xm,$IN,$H              # H.hi·H.lo+H.lo·H.hi
 123         vpmsumd         $Xh,$IN,$Hh             # H.hi·H.hi
 124
 125         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 126
 127         vsldoi          $t0,$Xm,$zero,8
 128         vsldoi          $t1,$zero,$Xm,8
 129         vxor            $Xl,$Xl,$t0
 130         vxor            $Xh,$Xh,$t1
 131
 132         vsldoi          $Xl,$Xl,$Xl,8
 133         vxor            $Xl,$Xl,$t2
 134
 135         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 136         vpmsumd         $Xl,$Xl,$xC2
 137         vxor            $t1,$t1,$Xh
 138         vxor            $IN1,$Xl,$t1
 139
 140         vsldoi          $H2,$IN1,$IN1,8
 141         vsldoi          $H2l,$zero,$H2,8
 142         vsldoi          $H2h,$H2,$zero,8
 143
 144         stvx_u          $H2l,r8,r3              # save H^2
 145         li              r8,0x70
 146         stvx_u          $H2,r9,r3
 147         li              r9,0x80
 148         stvx_u          $H2h,r10,r3
 149         li              r10,0x90
 150 ___
 151 {
     # Second half of gcm_init_p8: H^3 and H^4 are computed side by side and
     # stored at table offsets 0x70-0x90 and 0xa0-0xc0 (relative to r3).
     # $t4-$t6 borrow the registers of $Hl/$H/$Hh as scratch; those three
     # registers are later overwritten with the split H^3 before it is stored.
 152 my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
 153 $code.=<<___;
 154         vpmsumd         $Xl,$IN,$H2l            # H.lo·H^2.lo
 155          vpmsumd        $Xl1,$IN1,$H2l          # H^2.lo·H^2.lo
 156         vpmsumd         $Xm,$IN,$H2             # H.hi·H^2.lo+H.lo·H^2.hi
 157          vpmsumd        $Xm1,$IN1,$H2           # H^2.hi·H^2.lo+H^2.lo·H^2.hi
 158         vpmsumd         $Xh,$IN,$H2h            # H.hi·H^2.hi
 159          vpmsumd        $Xh1,$IN1,$H2h          # H^2.hi·H^2.hi
 160
 161         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 162          vpmsumd        $t6,$Xl1,$xC2           # 1st reduction phase
 163
 164         vsldoi          $t0,$Xm,$zero,8
 165         vsldoi          $t1,$zero,$Xm,8
 166          vsldoi         $t4,$Xm1,$zero,8
 167          vsldoi         $t5,$zero,$Xm1,8
 168         vxor            $Xl,$Xl,$t0
 169         vxor            $Xh,$Xh,$t1
 170          vxor           $Xl1,$Xl1,$t4
 171          vxor           $Xh1,$Xh1,$t5
 172
 173         vsldoi          $Xl,$Xl,$Xl,8
 174          vsldoi         $Xl1,$Xl1,$Xl1,8
 175         vxor            $Xl,$Xl,$t2
 176          vxor           $Xl1,$Xl1,$t6
 177
 178         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 179          vsldoi         $t5,$Xl1,$Xl1,8         # 2nd reduction phase
 180         vpmsumd         $Xl,$Xl,$xC2
 181          vpmsumd        $Xl1,$Xl1,$xC2
 182         vxor            $t1,$t1,$Xh
 183          vxor           $t5,$t5,$Xh1
 184         vxor            $Xl,$Xl,$t1
 185          vxor           $Xl1,$Xl1,$t5
 186
 187         vsldoi          $H,$Xl,$Xl,8
 188          vsldoi         $H2,$Xl1,$Xl1,8
 189         vsldoi          $Hl,$zero,$H,8
 190         vsldoi          $Hh,$H,$zero,8
 191          vsldoi         $H2l,$zero,$H2,8
 192          vsldoi         $H2h,$H2,$zero,8
 193
 194         stvx_u          $Hl,r8,r3               # save H^3
 195         li              r8,0xa0
 196         stvx_u          $H,r9,r3
 197         li              r9,0xb0
 198         stvx_u          $Hh,r10,r3
 199         li              r10,0xc0
 200          stvx_u         $H2l,r8,r3              # save H^4
 201          stvx_u         $H2,r9,r3
 202          stvx_u         $H2h,r10,r3
 203
 204         mtspr           256,$vrsave
 205         blr
 206         .long           0
 207         .byte           0,12,0x14,0,0,0,2,0
 208         .long           0
 209 .size   .gcm_init_p8,.-.gcm_init_p8
 210 ___
 211 }
 212 $code.=<<___;   # gcm_gmult_p8, then gcm_ghash_p8 entry with its 2x loop and short tail
 213 .globl  .gcm_gmult_p8
 214 .align  5
 215 .gcm_gmult_p8:
 216         lis             r0,0xfff8
 217         li              r8,0x10
 218         mfspr           $vrsave,256
 219         li              r9,0x20
 220         mtspr           256,r0
 221         li              r10,0x30
 222         lvx_u           $IN,0,$Xip              # load Xi
 223
 224         lvx_u           $Hl,r8,$Htbl            # load pre-computed table
 225          le?lvsl        $lemask,r0,r0
 226         lvx_u           $H, r9,$Htbl
 227          le?vspltisb    $t0,0x07
 228         lvx_u           $Hh,r10,$Htbl
 229          le?vxor        $lemask,$lemask,$t0
 230         lvx_u           $xC2,0,$Htbl
 231          le?vperm       $IN,$IN,$IN,$lemask
 232         vxor            $zero,$zero,$zero
 233
 234         vpmsumd         $Xl,$IN,$Hl             # H.lo·Xi.lo
 235         vpmsumd         $Xm,$IN,$H              # H.hi·Xi.lo+H.lo·Xi.hi
 236         vpmsumd         $Xh,$IN,$Hh             # H.hi·Xi.hi
 237
 238         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 239
 240         vsldoi          $t0,$Xm,$zero,8
 241         vsldoi          $t1,$zero,$Xm,8
 242         vxor            $Xl,$Xl,$t0
 243         vxor            $Xh,$Xh,$t1
 244
 245         vsldoi          $Xl,$Xl,$Xl,8
 246         vxor            $Xl,$Xl,$t2
 247
 248         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 249         vpmsumd         $Xl,$Xl,$xC2
 250         vxor            $t1,$t1,$Xh
 251         vxor            $Xl,$Xl,$t1
 252
 253         le?vperm        $Xl,$Xl,$Xl,$lemask
 254         stvx_u          $Xl,0,$Xip              # write out Xi
 255
 256         mtspr           256,$vrsave
 257         blr
 258         .long           0
 259         .byte           0,12,0x14,0,0,0,2,0
 260         .long           0
 261 .size   .gcm_gmult_p8,.-.gcm_gmult_p8
 262
 263 .globl  .gcm_ghash_p8
 264 .align  5
 265 .gcm_ghash_p8:
 266         li              r0,-4096
 267         li              r8,0x10
 268         mfspr           $vrsave,256
 269         li              r9,0x20
 270         mtspr           256,r0
 271         li              r10,0x30
 272         lvx_u           $Xl,0,$Xip              # load Xi
 273
 274         lvx_u           $Hl,r8,$Htbl            # load pre-computed table
 275         li              r8,0x40
 276          le?lvsl        $lemask,r0,r0
 277         lvx_u           $H, r9,$Htbl
 278         li              r9,0x50
 279          le?vspltisb    $t0,0x07
 280         lvx_u           $Hh,r10,$Htbl
 281         li              r10,0x60
 282          le?vxor        $lemask,$lemask,$t0
 283         lvx_u           $xC2,0,$Htbl
 284          le?vperm       $Xl,$Xl,$Xl,$lemask
 285         vxor            $zero,$zero,$zero
 286
 287         ${UCMP}i        $len,64
 288         bge             Lgcm_ghash_p8_4x
 289
 290         lvx_u           $IN,0,$inp
 291         addi            $inp,$inp,16
 292         subic.          $len,$len,16
 293          le?vperm       $IN,$IN,$IN,$lemask
 294         vxor            $IN,$IN,$Xl
 295         beq             Lshort
 296
 297         lvx_u           $H2l,r8,$Htbl           # load H^2
 298         li              r8,16
 299         lvx_u           $H2, r9,$Htbl
 300         add             r9,$inp,$len            # end of input
 301         lvx_u           $H2h,r10,$Htbl
 302         be?b            Loop_2x
 303
 304 .align  5
 305 Loop_2x:
 306         lvx_u           $IN1,0,$inp
 307         le?vperm        $IN1,$IN1,$IN1,$lemask
 308
 309          subic          $len,$len,32
 310         vpmsumd         $Xl,$IN,$H2l            # H^2.lo·Xi.lo
 311          vpmsumd        $Xl1,$IN1,$Hl           # H.lo·Xi+1.lo
 312          subfe          r0,r0,r0                # borrow?-1:0
 313         vpmsumd         $Xm,$IN,$H2             # H^2.hi·Xi.lo+H^2.lo·Xi.hi
 314          vpmsumd        $Xm1,$IN1,$H            # H.hi·Xi+1.lo+H.lo·Xi+1.hi
 315          and            r0,r0,$len
 316         vpmsumd         $Xh,$IN,$H2h            # H^2.hi·Xi.hi
 317          vpmsumd        $Xh1,$IN1,$Hh           # H.hi·Xi+1.hi
 318          add            $inp,$inp,r0
 319
 320         vxor            $Xl,$Xl,$Xl1
 321         vxor            $Xm,$Xm,$Xm1
 322
 323         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 324
 325         vsldoi          $t0,$Xm,$zero,8
 326         vsldoi          $t1,$zero,$Xm,8
 327          vxor           $Xh,$Xh,$Xh1
 328         vxor            $Xl,$Xl,$t0
 329         vxor            $Xh,$Xh,$t1
 330
 331         vsldoi          $Xl,$Xl,$Xl,8
 332         vxor            $Xl,$Xl,$t2
 333          lvx_u          $IN,r8,$inp
 334          addi           $inp,$inp,32
 335
 336         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 337         vpmsumd         $Xl,$Xl,$xC2
 338          le?vperm       $IN,$IN,$IN,$lemask
 339         vxor            $t1,$t1,$Xh
 340         vxor            $IN,$IN,$t1
 341         vxor            $IN,$IN,$Xl
 342         $UCMP           r9,$inp
 343         bgt             Loop_2x                 # done yet?
 344
 345         cmplwi          $len,0
 346         bne             Leven
 347
 348 Lshort:
 349         vpmsumd         $Xl,$IN,$Hl             # H.lo·Xi.lo
 350         vpmsumd         $Xm,$IN,$H              # H.hi·Xi.lo+H.lo·Xi.hi
 351         vpmsumd         $Xh,$IN,$Hh             # H.hi·Xi.hi
 352
 353         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 354
 355         vsldoi          $t0,$Xm,$zero,8
 356         vsldoi          $t1,$zero,$Xm,8
 357         vxor            $Xl,$Xl,$t0
 358         vxor            $Xh,$Xh,$t1
 359
 360         vsldoi          $Xl,$Xl,$Xl,8
 361         vxor            $Xl,$Xl,$t2
 362
 363         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 364         vpmsumd         $Xl,$Xl,$xC2
 365         vxor            $t1,$t1,$Xh
 366
 367 Leven:
 368         vxor            $Xl,$Xl,$t1
 369         le?vperm        $Xl,$Xl,$Xl,$lemask
 370         stvx_u          $Xl,0,$Xip              # write out Xi
 371
 372         mtspr           256,$vrsave
 373         blr
 374         .long           0
 375         .byte           0,12,0x14,0,0,0,4,0
 376         .long           0
 377 ___
     # Notes on the heredoc above (register roles follow the "argument block"
     # declaration: r3=$Xip, r4=$Htbl, r5=$inp, r6=$len):
     #  - gcm_gmult_p8 loads Xi and the $Hl/$H/$Hh table entries, performs one
     #    multiplication with the two-phase reduction and writes Xi back.
     #  - gcm_ghash_p8 branches to Lgcm_ghash_p8_4x (emitted by the braced
     #    section that follows) when $len >= 64; otherwise it consumes two
     #    16-byte blocks per Loop_2x pass, with Lshort handling a lone block.
     #    Its .size directive is emitted by the last heredoc of the file.
     #  - Lines prefixed le?/be? are endian-specific; the output loop at the
     #    end of the file either enables them or comments them out.
 378 {
     # 4x-aggregated half of gcm_ghash_p8, entered at Lgcm_ghash_p8_4x.
     # v20-v31 get dedicated names here; the generated code spills them to a
     # $FRAME-byte stack frame on entry and reloads them at Ldone_4x.
 379 my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
 380     $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
     # Register reuse: $IN0 is $IN; $H21l/$H21h take over $Hl/$Hh, and
     # $loperm/$hiperm take over $H2l/$H2h (presumably why the loads of the
     # H^2 lo/hi halves are commented out in the heredoc below).
 381 my $IN0=$IN;
 382 my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
 383
     # Flow: Loop_4x consumes 64 bytes per pass; Ltail_4x finishes the pending
     # multiplication/reduction; 1-3 leftover blocks are set up at
     # Lone/Ltwo/Lthree, which branch back to Ltail_4x with $len preset to -4
     # so that the second pass leaves through Ldone_4x.
 384 $code.=<<___;
 385 .align  5
 386 .gcm_ghash_p8_4x:
 387 Lgcm_ghash_p8_4x:
 388         $STU            $sp,-$FRAME($sp)
 389         li              r10,`15+6*$SIZE_T`
 390         li              r11,`31+6*$SIZE_T`
 391         stvx            v20,r10,$sp
 392         addi            r10,r10,32
 393         stvx            v21,r11,$sp
 394         addi            r11,r11,32
 395         stvx            v22,r10,$sp
 396         addi            r10,r10,32
 397         stvx            v23,r11,$sp
 398         addi            r11,r11,32
 399         stvx            v24,r10,$sp
 400         addi            r10,r10,32
 401         stvx            v25,r11,$sp
 402         addi            r11,r11,32
 403         stvx            v26,r10,$sp
 404         addi            r10,r10,32
 405         stvx            v27,r11,$sp
 406         addi            r11,r11,32
 407         stvx            v28,r10,$sp
 408         addi            r10,r10,32
 409         stvx            v29,r11,$sp
 410         addi            r11,r11,32
 411         stvx            v30,r10,$sp
 412         li              r10,0x60
 413         stvx            v31,r11,$sp
 414         li              r0,-1
 415         stw             $vrsave,`$FRAME-4`($sp) # save vrsave
 416         mtspr           256,r0                  # preserve all AltiVec registers
 417
 418         lvsl            $t0,0,r8                # 0x0001..0e0f
 419         #lvx_u          $H2l,r8,$Htbl           # load H^2
 420         li              r8,0x70
 421         lvx_u           $H2, r9,$Htbl
 422         li              r9,0x80
 423         vspltisb        $t1,8                   # 0x0808..0808
 424         #lvx_u          $H2h,r10,$Htbl
 425         li              r10,0x90
 426         lvx_u           $H3l,r8,$Htbl           # load H^3
 427         li              r8,0xa0
 428         lvx_u           $H3, r9,$Htbl
 429         li              r9,0xb0
 430         lvx_u           $H3h,r10,$Htbl
 431         li              r10,0xc0
 432         lvx_u           $H4l,r8,$Htbl           # load H^4
 433         li              r8,0x10
 434         lvx_u           $H4, r9,$Htbl
 435         li              r9,0x20
 436         lvx_u           $H4h,r10,$Htbl
 437         li              r10,0x30
 438
 439         vsldoi          $t2,$zero,$t1,8         # 0x0000..0808
 440         vaddubm         $hiperm,$t0,$t2         # 0x0001..1617
 441         vaddubm         $loperm,$t1,$hiperm     # 0x0809..1e1f
 442
 443         $SHRI           $len,$len,4             # this allows to use sign bit
 444                                                 # as carry
 445         lvx_u           $IN0,0,$inp             # load input
 446         lvx_u           $IN1,r8,$inp
 447         subic.          $len,$len,8
 448         lvx_u           $IN2,r9,$inp
 449         lvx_u           $IN3,r10,$inp
 450         addi            $inp,$inp,0x40
 451         le?vperm        $IN0,$IN0,$IN0,$lemask
 452         le?vperm        $IN1,$IN1,$IN1,$lemask
 453         le?vperm        $IN2,$IN2,$IN2,$lemask
 454         le?vperm        $IN3,$IN3,$IN3,$lemask
 455
 456         vxor            $Xh,$IN0,$Xl
 457
 458          vpmsumd        $Xl1,$IN1,$H3l
 459          vpmsumd        $Xm1,$IN1,$H3
 460          vpmsumd        $Xh1,$IN1,$H3h
 461
 462          vperm          $H21l,$H2,$H,$hiperm
 463          vperm          $t0,$IN2,$IN3,$loperm
 464          vperm          $H21h,$H2,$H,$loperm
 465          vperm          $t1,$IN2,$IN3,$hiperm
 466          vpmsumd        $Xm2,$IN2,$H2           # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
 467          vpmsumd        $Xl3,$t0,$H21l          # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
 468          vpmsumd        $Xm3,$IN3,$H            # H.hi·Xi+3.lo  +H.lo·Xi+3.hi
 469          vpmsumd        $Xh3,$t1,$H21h          # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
 470
 471          vxor           $Xm2,$Xm2,$Xm1
 472          vxor           $Xl3,$Xl3,$Xl1
 473          vxor           $Xm3,$Xm3,$Xm2
 474          vxor           $Xh3,$Xh3,$Xh1
 475
 476         blt             Ltail_4x
 477
 478 Loop_4x:
 479         lvx_u           $IN0,0,$inp
 480         lvx_u           $IN1,r8,$inp
 481         subic.          $len,$len,4
 482         lvx_u           $IN2,r9,$inp
 483         lvx_u           $IN3,r10,$inp
 484         addi            $inp,$inp,0x40
 485         le?vperm        $IN1,$IN1,$IN1,$lemask
 486         le?vperm        $IN2,$IN2,$IN2,$lemask
 487         le?vperm        $IN3,$IN3,$IN3,$lemask
 488         le?vperm        $IN0,$IN0,$IN0,$lemask
 489
 490         vpmsumd         $Xl,$Xh,$H4l            # H^4.lo·Xi.lo
 491         vpmsumd         $Xm,$Xh,$H4             # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 492         vpmsumd         $Xh,$Xh,$H4h            # H^4.hi·Xi.hi
 493          vpmsumd        $Xl1,$IN1,$H3l
 494          vpmsumd        $Xm1,$IN1,$H3
 495          vpmsumd        $Xh1,$IN1,$H3h
 496
 497         vxor            $Xl,$Xl,$Xl3
 498         vxor            $Xm,$Xm,$Xm3
 499         vxor            $Xh,$Xh,$Xh3
 500          vperm          $t0,$IN2,$IN3,$loperm
 501          vperm          $t1,$IN2,$IN3,$hiperm
 502
 503         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 504          vpmsumd        $Xl3,$t0,$H21l          # H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
 505          vpmsumd        $Xh3,$t1,$H21h          # H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
 506
 507         vsldoi          $t0,$Xm,$zero,8
 508         vsldoi          $t1,$zero,$Xm,8
 509         vxor            $Xl,$Xl,$t0
 510         vxor            $Xh,$Xh,$t1
 511
 512         vsldoi          $Xl,$Xl,$Xl,8
 513         vxor            $Xl,$Xl,$t2
 514
 515         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 516          vpmsumd        $Xm2,$IN2,$H2           # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
 517          vpmsumd        $Xm3,$IN3,$H            # H.hi·Xi+3.lo  +H.lo·Xi+3.hi
 518         vpmsumd         $Xl,$Xl,$xC2
 519
 520          vxor           $Xl3,$Xl3,$Xl1
 521          vxor           $Xh3,$Xh3,$Xh1
 522         vxor            $Xh,$Xh,$IN0
 523          vxor           $Xm2,$Xm2,$Xm1
 524         vxor            $Xh,$Xh,$t1
 525          vxor           $Xm3,$Xm3,$Xm2
 526         vxor            $Xh,$Xh,$Xl
 527         bge             Loop_4x
 528
 529 Ltail_4x:
 530         vpmsumd         $Xl,$Xh,$H4l            # H^4.lo·Xi.lo
 531         vpmsumd         $Xm,$Xh,$H4             # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 532         vpmsumd         $Xh,$Xh,$H4h            # H^4.hi·Xi.hi
 533
 534         vxor            $Xl,$Xl,$Xl3
 535         vxor            $Xm,$Xm,$Xm3
 536
 537         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
 538
 539         vsldoi          $t0,$Xm,$zero,8
 540         vsldoi          $t1,$zero,$Xm,8
 541          vxor           $Xh,$Xh,$Xh3
 542         vxor            $Xl,$Xl,$t0
 543         vxor            $Xh,$Xh,$t1
 544
 545         vsldoi          $Xl,$Xl,$Xl,8
 546         vxor            $Xl,$Xl,$t2
 547
 548         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
 549         vpmsumd         $Xl,$Xl,$xC2
 550         vxor            $t1,$t1,$Xh
 551         vxor            $Xl,$Xl,$t1
 552
 553         addic.          $len,$len,4
 554         beq             Ldone_4x
 555
 556         lvx_u           $IN0,0,$inp
 557         ${UCMP}i        $len,2
 558         li              $len,-4
 559         blt             Lone
 560         lvx_u           $IN1,r8,$inp
 561         beq             Ltwo
 562
 563 Lthree:
 564         lvx_u           $IN2,r9,$inp
 565         le?vperm        $IN0,$IN0,$IN0,$lemask
 566         le?vperm        $IN1,$IN1,$IN1,$lemask
 567         le?vperm        $IN2,$IN2,$IN2,$lemask
 568
 569         vxor            $Xh,$IN0,$Xl
 570         vmr             $H4l,$H3l
 571         vmr             $H4, $H3
 572         vmr             $H4h,$H3h
 573
 574         vperm           $t0,$IN1,$IN2,$loperm
 575         vperm           $t1,$IN1,$IN2,$hiperm
 576         vpmsumd         $Xm2,$IN1,$H2           # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
 577         vpmsumd         $Xm3,$IN2,$H            # H.hi·Xi+2.lo  +H.lo·Xi+2.hi
 578         vpmsumd         $Xl3,$t0,$H21l          # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
 579         vpmsumd         $Xh3,$t1,$H21h          # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
 580
 581         vxor            $Xm3,$Xm3,$Xm2
 582         b               Ltail_4x
 583
 584 .align  4
 585 Ltwo:
 586         le?vperm        $IN0,$IN0,$IN0,$lemask
 587         le?vperm        $IN1,$IN1,$IN1,$lemask
 588
 589         vxor            $Xh,$IN0,$Xl
 590         vperm           $t0,$zero,$IN1,$loperm
 591         vperm           $t1,$zero,$IN1,$hiperm
 592
 593         vsldoi          $H4l,$zero,$H2,8
 594         vmr             $H4, $H2
 595         vsldoi          $H4h,$H2,$zero,8
 596
 597         vpmsumd         $Xl3,$t0, $H21l         # H.lo·Xi+1.lo
 598         vpmsumd         $Xm3,$IN1,$H            # H.hi·Xi+1.lo+H.lo·Xi+2.hi
 599         vpmsumd         $Xh3,$t1, $H21h         # H.hi·Xi+1.hi
 600
 601         b               Ltail_4x
 602
 603 .align  4
 604 Lone:
 605         le?vperm        $IN0,$IN0,$IN0,$lemask
 606
 607         vsldoi          $H4l,$zero,$H,8
 608         vmr             $H4, $H
 609         vsldoi          $H4h,$H,$zero,8
 610
 611         vxor            $Xh,$IN0,$Xl
 612         vxor            $Xl3,$Xl3,$Xl3
 613         vxor            $Xm3,$Xm3,$Xm3
 614         vxor            $Xh3,$Xh3,$Xh3
 615
 616         b               Ltail_4x
 617
 618 Ldone_4x:
 619         le?vperm        $Xl,$Xl,$Xl,$lemask
 620         stvx_u          $Xl,0,$Xip              # write out Xi
 621
 622         li              r10,`15+6*$SIZE_T`
 623         li              r11,`31+6*$SIZE_T`
 624         mtspr           256,$vrsave
 625         lvx             v20,r10,$sp
 626         addi            r10,r10,32
 627         lvx             v21,r11,$sp
 628         addi            r11,r11,32
 629         lvx             v22,r10,$sp
 630         addi            r10,r10,32
 631         lvx             v23,r11,$sp
 632         addi            r11,r11,32
 633         lvx             v24,r10,$sp
 634         addi            r10,r10,32
 635         lvx             v25,r11,$sp
 636         addi            r11,r11,32
 637         lvx             v26,r10,$sp
 638         addi            r10,r10,32
 639         lvx             v27,r11,$sp
 640         addi            r11,r11,32
 641         lvx             v28,r10,$sp
 642         addi            r10,r10,32
 643         lvx             v29,r11,$sp
 644         addi            r11,r11,32
 645         lvx             v30,r10,$sp
 646         lvx             v31,r11,$sp
 647         addi            $sp,$sp,$FRAME
 648         blr
 649         .long           0
 650         .byte           0,12,0x04,0,0x80,0,4,0
 651         .long           0
 652 ___
 653 }
 654 $code.=<<___;   # close the gcm_ghash_p8 symbol and append the identification string
 655 .size   .gcm_ghash_p8,.-.gcm_ghash_p8
 656
 657 .asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
 658 .align  2
 659 ___
 660
 661 for my $line (split("\n",$code)) {
 662         # evaluate `...` arithmetic (e.g. frame offsets) at generation time
 663         $line =~ s/\`([^\`]*)\`/eval $1/ge;
 664         if ($flavour !~ /le$/) {        # big-endian: mute le?, enable be?
 665             $line =~ s/le\?/#le#/ or
 666             $line =~ s/be\?//;
 667         } else {                        # little-endian: enable le?, mute be?
 668             $line =~ s/le\?//     or
 669             $line =~ s/be\?/#be#/;
 670         }
 671         print $line,"\n";
 672 }
 673
 674 close STDOUT or die "error closing STDOUT: $!"; # enforce flush