]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/modes/asm/ghash-sparcv9.pl
0365e0f1ff429e8e5250e86a627616fb58cbbf3f
[thirdparty/openssl.git] / crypto / modes / asm / ghash-sparcv9.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # March 2010
11 #
12 # The module implements "4-bit" GCM GHASH function and underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses 256 bytes per-key table [+128 bytes shared table]. Performance
15 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16 # and are expressed in cycles per processed byte, less is better:
17 #
18 # gcc 3.3.x cc 5.2 this assembler
19 #
20 # 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21 # 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22 #
23 # Here is data collected on UltraSPARC T1 system running Linux:
24 #
25 # gcc 4.4.1 this assembler
26 #
27 # 32-bit build 566 50 (+1000%)
28 # 64-bit build 56 50 (+12%)
29 #
30 # I don't quite understand why difference between 32-bit and 64-bit
31 # compiler-generated code is so big. Compilers *were* instructed to
32 # generate code for UltraSPARC and should have used 64-bit registers
33 # for Z vector (see C code) even in 32-bit build... Oh well, it only
34 # means more impressive improvement coefficients for this assembler
35 # module;-) Loops are aggressively modulo-scheduled in respect to
36 # references to input data and Z.hi updates to achieve 12 cycles
37 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39 #
40 # October 2012
41 #
42 # Add VIS3 lookup-table-free implementation using polynomial
43 # multiplication xmulx[hi] and extended addition addxc[cc]
44 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
45 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
46 # saturates at ~15.5x single-process result on 8-core processor,
47 # or ~20.5GBps per 2.85GHz socket.
48
# Detect the target word size from the compiler flags forwarded on the
# command line: -m64 (gcc) or -xarch=v9 (Sun cc) select the 64-bit ABI.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# Stack frame size and stack bias differ between ABIs; 2047 is the
# architectural SPARC V9 stack bias.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

# All generated assembly is emitted via STDOUT, so redirect it to the
# requested output file.  Fail loudly instead of silently producing an
# empty build artifact when the file cannot be created.
$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";
56
# Register allocation for the 4-bit table-driven code paths, expressed
# as list assignments in the same style as the VIS3 section below.

# 64-bit working values live in the %o registers.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# Small values and pointers occupy the %l registers.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# Input argument block as delivered in the %i registers.
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));
77
# 64-bit builds must declare %g2/%g3 as scratch so the assembler accepts
# their use under the V9 ABI.
$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___

# rem_4bit is the standard 4-bit GHASH reduction table (16 entries of
# 0x1C20*i shifted into the top bits, stored as 32-bit word pairs).
# gcm_ghash_4bit is the streamed hash: Xi ^= inp[0..15], Xi = Xi*H,
# repeated until $inp reaches $len.  The inner loop is modulo-scheduled
# with respect to input loads and Z.hi updates (see the 12-cycle note
# in the header); statement order inside the heredoc is deliberate.
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
230
# $inp/$len are only used by the streamed routine above; drop the
# aliases so the single-block routine cannot reference them by mistake.
undef $inp;
undef $len;

# gcm_gmult_4bit computes a single multiplication Xi = Xi*H.  It mirrors
# one iteration of gcm_ghash_4bit's outer loop without the input XOR;
# the inner loop is modulo-scheduled, so statement order is deliberate.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
338 \f
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

# Input argument block in the %i registers.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

# Working set: twisted H, Xi, reduction constants and Karatsuba partial
# products, spread over %o and %g registers.
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

# Byte-alignment shift counts used by gcm_ghash_vis3's unaligned path.
($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
# Embedded identification string shared by all code paths above.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
529
530 \f
531 # Purpose of these subroutines is to explicitly encode VIS instructions,
532 # so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
534 # Idea is to reserve for option to produce "universal" binary and let
535 # programmer detect if current CPU is VIS capable at run-time.
# Encode a VIS3 instruction (addxc/addxccc/xmulx/xmulxhi) on integer
# registers as a raw ".word" directive, so the module assembles even
# when the assembler lacks VIS3 support.  Returns the instruction
# unchanged when the mnemonic is unknown or an operand is not an
# integer register of the form %[goli]N.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Base encoding of each register window group.
my %reg_base  = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# IMPL-dep opcode field (opf) for the supported VIS3 instructions.
my %opf_for   = ( "addxc"   => 0x011,
		  "addxccc" => 0x013,
		  "xmulx"   => 0x115,
		  "xmulxhi" => 0x116 );

my $asm = "$mnemonic\t$rs1,$rs2,$rd";	# textual fallback form

my $opf = $opf_for{$mnemonic};
return $asm unless $opf;

# Translate each operand to its 5-bit register number; bail out to the
# textual form as soon as one operand is not an integer register.
my @num;
for my $operand ($rs1,$rs2,$rd) {
    return $asm unless $operand =~ /%([goli])([0-9])/;
    push @num, $reg_base{$1}+$2;
}
($rs1,$rs2,$rd) = @num;

return sprintf ".word\t0x%08x !%s",
	       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
	       $asm;
}
560
# Post-process the accumulated code: evaluate `...` constant
# expressions, then rewrite VIS3 mnemonics as explicit .word encodings
# via unvis3() so no VIS3-capable assembler is required.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	# xmulx/xmulxhi/addxc/addxccc on integer registers only.
	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# STDOUT is redirected to the output file; buffered write errors (e.g.
# disk full) only surface at close, so the result must be checked or a
# truncated assembly file would go unnoticed by the build.
close STDOUT or die "error closing STDOUT: $!";