[thirdparty/openssl.git] / crypto / modes / asm / ghash-sparcv9.pl

#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1			this assembler
#
# 32-bit build	566				50	(+1000%)
# 64-bit build	56				50	(+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.

# The last command-line argument names the file that receives the
# generated assembly.  When it is present STDOUT is redirected to it, so
# the "print" calls at the end of the script write there; when it is
# absent the inherited STDOUT is left alone.  The three-argument form of
# open keeps the file name from being parsed for mode characters, and a
# failed open is fatal instead of being silently ignored.
$output=pop;
if (defined($output)) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Symbolic names that are interpolated verbatim into the assembly
# templates (presumably resolved by the preprocessor through the
# "sparc_arch.h" include that the generated code carries -- confirm).
($frame,$bias)=("STACK_FRAME","STACK_BIAS");

# 64-bit working values are kept in the %o registers.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# Small values and pointers are kept in the %l registers.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# The input argument block arrives in the %i registers.
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));

# Append the file prologue, the shared rem_4bit table and the
# gcm_ghash_4bit routine to $code.
#
# rem_4bit holds sixteen 8-byte entries.  The backquoted expressions in
# the table are Perl expressions that the output loop at the end of this
# script evaluates before printing.
#
# gcm_ghash_4bit receives Xi in %i0, Htbl in %i1, inp in %i2 and len in
# %i3.  len is first converted to an end pointer (inp+len) and inp is
# advanced by 16 per outer iteration until the two compare equal, so len
# is expected to be a non-zero multiple of 16 -- NOTE(review): confirm
# against the callers.
#
# The address of rem_4bit is obtained PC-relatively: "call .+8" leaves
# its own address in %o7 and the delay-slot "add" applies the link-time
# distance rem_4bit-1b.
#
# Each outer iteration XORs the current Xi bytes with the input bytes and
# walks them from offset 15 down to 0 one nibble at a time.  A nibble,
# scaled to a multiple of 16, indexes Htbl (upper 64 bits at Htbl, lower
# 64 bits at Htbl+8); the four bits shifted out of Z.lo, scaled by 8,
# index rem_4bit.  Z is stored back to Xi at the end of every iteration.
# The instruction order is deliberate (see the scheduling remark in the
# file header) and must not be rearranged.
$code.=<<___;
#include "sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

# The single-block routine below takes no input pointer or length, and
# its template references neither variable.  Clearing them presumably
# guards against accidental use -- NOTE(review): confirm intent.
undef $inp;
undef $len;

# Append gcm_gmult_4bit to $code.
#
# The routine receives Xi in %i0 and Htbl in %i1.  It performs the same
# nibble-by-nibble table walk as gcm_ghash_4bit, but the bytes come from
# Xi alone (offsets 15 down to 0, nothing is XORed in) and there is no
# outer loop: the result Z is stored back to Xi once and the routine
# returns.  rem_4bit is located PC-relatively with the same
# "call .+8" / "add %o7,..." pair.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
# (form feed: page break between sections)
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

# Arguments of the VIS3 routines: %i0..%i3 in this order.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

# Working registers: $Hhl..$xE1 are %o0..%o5, $sqr is %o7 (%o6 is
# skipped), and $C0..$C3,$V are %g1..%g5.
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

# Only the first two list elements are consumed: $shl is %l0 and $shr is
# %l1; the remaining six are discarded by the list assignment.
($shl,$shr)=map("%l$_",(0..7));

# The template below appends three routines to $code:
#
# gcm_init_vis3   loads the 16-byte value at [%i1], shifts it left by one
#                 bit, XORs in 0xE1<<57 (upper half) and 1 (lower half)
#                 when the bit shifted out was set, and stores the result
#                 at [%i0+0..15]; the constant 0xA040608020C0E000 is
#                 stored at [%i0+16].
# gcm_gmult_vis3  loads Xi from [%i0] and the values written by
#                 gcm_init_vis3 from [%i1], multiplies them with
#                 xmulx/xmulxhi and stores the reduced product back to Xi.
# gcm_ghash_vis3  does the same per 16 bytes of input, XORing each input
#                 block into Xi first and looping until $len, decremented
#                 by 16 per block, reaches zero.  $inp is rounded down to
#                 an 8-byte boundary and misaligned data is reassembled
#                 with the $shl/$shr shifts.
#
# The xmulx/xmulxhi/addxc lines are turned into .word directives by the
# output loop at the end of the script.
#
# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
# Trailer: embed an identification string in the generated object and
# restore 4-byte alignment after it.
$code.=".asciz\t\"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>\"\n".
       ".align\t4\n";

# (form feed: page break between sections)
# The purpose of this subroutine is to encode VIS3 instructions
# explicitly, so that the module can be compiled without having to enable
# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
# -xarch=v9a. The idea is to keep open the option of producing a
# "universal" binary and to let the programmer detect at run-time whether
# the current CPU is VIS capable.
#
# unvis3(mnemonic, rs1, rs2, rd)
#
# Returns the assembler text for one three-operand instruction.  When
# the mnemonic is one of addxc, addxccc, xmulx or xmulxhi and all three
# operands are integer registers, the result is a ".word" directive
# holding the instruction encoding, followed by the original instruction
# as a "!" comment.  In every other case the instruction is returned
# unchanged as "mnemonic\trs1,rs2,rd" and left for the assembler.
#
# Register operands must be exactly %g, %o, %l or %i followed by a single
# digit 0-7.  Anything else (e.g. %g9 or %g12) is passed through
# unchanged rather than being encoded as some other register.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# First register number of each register window group.
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# opf field values of the supported instructions.
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);
my $ref = "$mnemonic\t$rs1,$rs2,$rd";
my $opf = $visopf{$mnemonic};

    # Not an instruction handled here: leave it to the assembler.
    return $ref if (!defined($opf));

    # Translate each operand to its 5-bit register number; $_ aliases the
    # lexical copies, so the caller's arguments are not modified.
    foreach ($rs1,$rs2,$rd) {
	return $ref if (!/^%([goli])([0-7])$/);
	$_=$bias{$1}+$2;
    }

    # rd goes to bits 29:25, rs1 to 18:14, opf to 13:5 and rs2 to 4:0.
    return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
}

# Post-process the accumulated assembly text line by line and print it
# to STDOUT.
foreach (split("\n",$code)) {
	# Evaluate the backquoted Perl expressions embedded in the
	# templates (e.g. the shifted constants of the rem_4bit table) and
	# substitute their values.
	s/\`([^\`]*)\`/eval $1/ge;

	# Replace VIS3 instructions that operate on three integer
	# registers with whatever unvis3 returns for them.
	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# Buffered write errors only surface when the handle is closed; treat
# them as fatal so that a truncated output file is not mistaken for
# success.
close STDOUT or die "error closing STDOUT: $!";
Commit	Line	Data
6aa36e8e RS	1	#! /usr/bin/env perl
	2	# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
	3	#
81cae8ce	4	# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e RS	5	# this file except in compliance with the License. You can obtain a copy
	6	# in the file LICENSE in the source distribution or at
	7	# https://www.openssl.org/source/license.html
	8
c3473126 AP	9
	10	# ====================================================================
	11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	12	# project. The module is, however, dual licensed under OpenSSL and
	13	# CRYPTOGAMS licenses depending on where you obtain it. For further
	14	# details see http://www.openssl.org/~appro/cryptogams/.
	15	# ====================================================================
	16
	17	# March 2010
	18	#
	19	# The module implements "4-bit" GCM GHASH function and underlying
	20	# single multiplication operation in GF(2^128). "4-bit" means that it
	21	# uses 256 bytes per-key table [+128 bytes shared table]. Performance
	22	# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
	23	# and are expressed in cycles per processed byte, less is better:
	24	#
	25	# gcc 3.3.x cc 5.2 this assembler
	26	#
d52d5ad1 AP	27	# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
d52d5ad1 AP	28	# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
c3473126	29	#
b2875087 AP	30	# Here is data collected on UltraSPARC T1 system running Linux:
	31	#
	32	# gcc 4.4.1 this assembler
	33	#
	34	# 32-bit build 566 50 (+1000%)
	35	# 64-bit build 56 50 (+12%)
	36	#
c3473126 AP	37	# I don't quite understand why difference between 32-bit and 64-bit
	38	# compiler-generated code is so big. Compilers were instructed to
	39	# generate code for UltraSPARC and should have used 64-bit registers
	40	# for Z vector (see C code) even in 32-bit build... Oh well, it only
	41	# means more impressive improvement coefficients for this assembler
	42	# module;-) Loops are aggressively modulo-scheduled in respect to
	43	# references to input data and Z.hi updates to achieve 12 cycles
	44	# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
b2875087	45	# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
23328d4b AP	46	#
	47	# October 2012
	48	#
	49	# Add VIS3 lookup-table-free implementation using polynomial
	50	# multiplication xmulx[hi] and extended addition addxc[cc]
3766e7cc AP	51	# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
	52	# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
	53	# saturates at ~15.5x single-process result on 8-core processor,
	54	# or ~20.5GBps per 2.85GHz socket.
c3473126	55
eb77e888	56	$output=pop;
c3473126 AP	57	open STDOUT,">$output";
c3473126 AP	58
eb77e888 AP	59	$frame="STACK_FRAME";
	60	$bias="STACK_BIAS";
	61
c3473126 AP	62	$Zhi="%o0"; # 64-bit values
	63	$Zlo="%o1";
	64	$Thi="%o2";
	65	$Tlo="%o3";
	66	$rem="%o4";
	67	$tmp="%o5";
	68
	69	$nhi="%l0"; # small values and pointers
	70	$nlo="%l1";
	71	$xi0="%l2";
	72	$xi1="%l3";
	73	$rem_4bit="%l4";
	74	$remi="%l5";
	75	$Htblo="%l6";
	76	$cnt="%l7";
	77
4f39edbf AP	78	$Xi="%i0"; # input argument block
	79	$Htbl="%i1";
	80	$inp="%i2";
	81	$len="%i3";
c3473126	82
eb77e888 AP	83	$code.=<<___;
	84	#include "sparc_arch.h"
	85
	86	#ifdef __arch64__
23328d4b AP	87	.register %g2,#scratch
23328d4b AP	88	.register %g3,#scratch
eb77e888 AP	89	#endif
eb77e888 AP	90
c3473126 AP	91	.section ".text",#alloc,#execinstr
	92
	93	.align 64
	94	rem_4bit:
	95	.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	96	.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	97	.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	98	.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
	99	.type rem_4bit,#object
	100	.size rem_4bit,(.-rem_4bit)
	101
	102	.globl gcm_ghash_4bit
	103	.align 32
	104	gcm_ghash_4bit:
	105	save %sp,-$frame,%sp
	106	ldub [$inp+15],$nlo
	107	ldub [$Xi+15],$xi0
	108	ldub [$Xi+14],$xi1
	109	add $len,$inp,$len
	110	add $Htbl,8,$Htblo
	111
	112	1: call .+8
	113	add %o7,rem_4bit-1b,$rem_4bit
	114
	115	.Louter:
	116	xor $xi0,$nlo,$nlo
	117	and $nlo,0xf0,$nhi
	118	and $nlo,0x0f,$nlo
	119	sll $nlo,4,$nlo
	120	ldx [$Htblo+$nlo],$Zlo
	121	ldx [$Htbl+$nlo],$Zhi
	122
	123	ldub [$inp+14],$nlo
	124
	125	ldx [$Htblo+$nhi],$Tlo
	126	and $Zlo,0xf,$remi
	127	ldx [$Htbl+$nhi],$Thi
	128	sll $remi,3,$remi
	129	ldx [$rem_4bit+$remi],$rem
	130	srlx $Zlo,4,$Zlo
	131	mov 13,$cnt
	132	sllx $Zhi,60,$tmp
	133	xor $Tlo,$Zlo,$Zlo
	134	srlx $Zhi,4,$Zhi
	135	xor $Zlo,$tmp,$Zlo
	136
	137	xor $xi1,$nlo,$nlo
	138	and $Zlo,0xf,$remi
	139	and $nlo,0xf0,$nhi
	140	and $nlo,0x0f,$nlo
	141	ba .Lghash_inner
	142	sll $nlo,4,$nlo
	143	.align 32
	144	.Lghash_inner:
	145	ldx [$Htblo+$nlo],$Tlo
	146	sll $remi,3,$remi
	147	xor $Thi,$Zhi,$Zhi
	148	ldx [$Htbl+$nlo],$Thi
	149	srlx $Zlo,4,$Zlo
	150	xor $rem,$Zhi,$Zhi
	151	ldx [$rem_4bit+$remi],$rem
	152	sllx $Zhi,60,$tmp
	153	xor $Tlo,$Zlo,$Zlo
	154	ldub [$inp+$cnt],$nlo
155	srlx $Zhi,4,$Zhi
156	xor $Zlo,$tmp,$Zlo
157	ldub [$Xi+$cnt],$xi1
158	xor $Thi,$Zhi,$Zhi
159	and $Zlo,0xf,$remi
160
161	ldx [$Htblo+$nhi],$Tlo
162	sll $remi,3,$remi
163	xor $rem,$Zhi,$Zhi
164	ldx [$Htbl+$nhi],$Thi
165	srlx $Zlo,4,$Zlo
166	ldx [$rem_4bit+$remi],$rem
167	sllx $Zhi,60,$tmp
168	xor $xi1,$nlo,$nlo
169	srlx $Zhi,4,$Zhi
170	and $nlo,0xf0,$nhi
171	addcc $cnt,-1,$cnt
172	xor $Zlo,$tmp,$Zlo
173	and $nlo,0x0f,$nlo
174	xor $Tlo,$Zlo,$Zlo
175	sll $nlo,4,$nlo
176	blu .Lghash_inner
177	and $Zlo,0xf,$remi
178
179	ldx [$Htblo+$nlo],$Tlo
180	sll $remi,3,$remi
181	xor $Thi,$Zhi,$Zhi
182	ldx [$Htbl+$nlo],$Thi
183	srlx $Zlo,4,$Zlo
184	xor $rem,$Zhi,$Zhi
185	ldx [$rem_4bit+$remi],$rem
186	sllx $Zhi,60,$tmp
187	xor $Tlo,$Zlo,$Zlo
188	srlx $Zhi,4,$Zhi
189	xor $Zlo,$tmp,$Zlo
190	xor $Thi,$Zhi,$Zhi
191
192	add $inp,16,$inp
193	cmp $inp,$len
eb77e888	194	be,pn SIZE_T_CC,.Ldone
c3473126 AP	195	and $Zlo,0xf,$remi
	196
	197	ldx [$Htblo+$nhi],$Tlo
	198	sll $remi,3,$remi
	199	xor $rem,$Zhi,$Zhi
	200	ldx [$Htbl+$nhi],$Thi
	201	srlx $Zlo,4,$Zlo
	202	ldx [$rem_4bit+$remi],$rem
	203	sllx $Zhi,60,$tmp
	204	xor $Tlo,$Zlo,$Zlo
	205	ldub [$inp+15],$nlo
	206	srlx $Zhi,4,$Zhi
	207	xor $Zlo,$tmp,$Zlo
	208	xor $Thi,$Zhi,$Zhi
	209	stx $Zlo,[$Xi+8]
	210	xor $rem,$Zhi,$Zhi
	211	stx $Zhi,[$Xi]
	212	srl $Zlo,8,$xi1
	213	and $Zlo,0xff,$xi0
	214	ba .Louter
	215	and $xi1,0xff,$xi1
	216	.align 32
	217	.Ldone:
	218	ldx [$Htblo+$nhi],$Tlo
	219	sll $remi,3,$remi
	220	xor $rem,$Zhi,$Zhi
	221	ldx [$Htbl+$nhi],$Thi
	222	srlx $Zlo,4,$Zlo
	223	ldx [$rem_4bit+$remi],$rem
	224	sllx $Zhi,60,$tmp
	225	xor $Tlo,$Zlo,$Zlo
	226	srlx $Zhi,4,$Zhi
	227	xor $Zlo,$tmp,$Zlo
	228	xor $Thi,$Zhi,$Zhi
	229	stx $Zlo,[$Xi+8]
	230	xor $rem,$Zhi,$Zhi
	231	stx $Zhi,[$Xi]
	232
	233	ret
	234	restore
	235	.type gcm_ghash_4bit,#function
	236	.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
	237	___
	238
c3473126 AP	239	undef $inp;
	240	undef $len;
	241
	242	$code.=<<___;
	243	.globl gcm_gmult_4bit
	244	.align 32
	245	gcm_gmult_4bit:
	246	save %sp,-$frame,%sp
	247	ldub [$Xi+15],$nlo
	248	add $Htbl,8,$Htblo
	249
	250	1: call .+8
	251	add %o7,rem_4bit-1b,$rem_4bit
	252
	253	and $nlo,0xf0,$nhi
	254	and $nlo,0x0f,$nlo
	255	sll $nlo,4,$nlo
	256	ldx [$Htblo+$nlo],$Zlo
	257	ldx [$Htbl+$nlo],$Zhi
	258
	259	ldub [$Xi+14],$nlo
	260
	261	ldx [$Htblo+$nhi],$Tlo
	262	and $Zlo,0xf,$remi
	263	ldx [$Htbl+$nhi],$Thi
	264	sll $remi,3,$remi
	265	ldx [$rem_4bit+$remi],$rem
	266	srlx $Zlo,4,$Zlo
	267	mov 13,$cnt
	268	sllx $Zhi,60,$tmp
	269	xor $Tlo,$Zlo,$Zlo
	270	srlx $Zhi,4,$Zhi
	271	xor $Zlo,$tmp,$Zlo
	272
	273	and $Zlo,0xf,$remi
	274	and $nlo,0xf0,$nhi
	275	and $nlo,0x0f,$nlo
	276	ba .Lgmult_inner
	277	sll $nlo,4,$nlo
	278	.align 32
	279	.Lgmult_inner:
	280	ldx [$Htblo+$nlo],$Tlo
	281	sll $remi,3,$remi
	282	xor $Thi,$Zhi,$Zhi
	283	ldx [$Htbl+$nlo],$Thi
	284	srlx $Zlo,4,$Zlo
	285	xor $rem,$Zhi,$Zhi
	286	ldx [$rem_4bit+$remi],$rem
	287	sllx $Zhi,60,$tmp
	288	xor $Tlo,$Zlo,$Zlo
	289	ldub [$Xi+$cnt],$nlo
	290	srlx $Zhi,4,$Zhi
	291	xor $Zlo,$tmp,$Zlo
	292	xor $Thi,$Zhi,$Zhi
	293	and $Zlo,0xf,$remi
	294
	295	ldx [$Htblo+$nhi],$Tlo
	296	sll $remi,3,$remi
	297	xor $rem,$Zhi,$Zhi
	298	ldx [$Htbl+$nhi],$Thi
	299	srlx $Zlo,4,$Zlo
	300	ldx [$rem_4bit+$remi],$rem
	301	sllx $Zhi,60,$tmp
	302	srlx $Zhi,4,$Zhi
303	and $nlo,0xf0,$nhi
304	addcc $cnt,-1,$cnt
305	xor $Zlo,$tmp,$Zlo
306	and $nlo,0x0f,$nlo
307	xor $Tlo,$Zlo,$Zlo
308	sll $nlo,4,$nlo
309	blu .Lgmult_inner
310	and $Zlo,0xf,$remi
311
312	ldx [$Htblo+$nlo],$Tlo
313	sll $remi,3,$remi
314	xor $Thi,$Zhi,$Zhi
315	ldx [$Htbl+$nlo],$Thi
316	srlx $Zlo,4,$Zlo
317	xor $rem,$Zhi,$Zhi
318	ldx [$rem_4bit+$remi],$rem
319	sllx $Zhi,60,$tmp
320	xor $Tlo,$Zlo,$Zlo
321	srlx $Zhi,4,$Zhi
322	xor $Zlo,$tmp,$Zlo
323	xor $Thi,$Zhi,$Zhi
324	and $Zlo,0xf,$remi
325
326	ldx [$Htblo+$nhi],$Tlo
327	sll $remi,3,$remi
328	xor $rem,$Zhi,$Zhi
329	ldx [$Htbl+$nhi],$Thi
330	srlx $Zlo,4,$Zlo
331	ldx [$rem_4bit+$remi],$rem
332	sllx $Zhi,60,$tmp
333	xor $Tlo,$Zlo,$Zlo
334	srlx $Zhi,4,$Zhi
335	xor $Zlo,$tmp,$Zlo
336	xor $Thi,$Zhi,$Zhi
337	stx $Zlo,[$Xi+8]
338	xor $rem,$Zhi,$Zhi
339	stx $Zhi,[$Xi]
340
341	ret
342	restore
343	.type gcm_gmult_4bit,#function
344	.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
23328d4b AP	345	___
	346	\f
	347	{{{
24798c5e AP	348	# Straightforward 128x128-bit multiplication using Karatsuba algorithm
	349	# followed by pair of 64-bit reductions [with a shortcut in first one,
	350	# which allowed to break dependency between reductions and remove one
3766e7cc	351	# multiplication from critical path]. While it might be suboptimal
24798c5e AP	352	# with regard to sheer number of multiplications, other methods [such
	353	# as aggregate reduction] would require more 64-bit registers, which
	354	# we don't have in 32-bit application context.
23328d4b AP	355
	356	($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
	357
3766e7cc	358	($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
24798c5e	359	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
23328d4b	360
3766e7cc	361	($shl,$shr)=map("%l$_",(0..7));
24798c5e AP	362
24798c5e AP	363	# For details regarding "twisted H" see ghash-x86.pl.
23328d4b	364	$code.=<<___;
24798c5e	365	.globl gcm_init_vis3
23328d4b	366	.align 32
24798c5e	367	gcm_init_vis3:
23328d4b AP	368	save %sp,-$frame,%sp
23328d4b AP	369
24798c5e AP	370	ldx [%i1+0],$Hhi
	371	ldx [%i1+8],$Hlo
	372	mov 0xE1,$Xhi
	373	mov 1,$Xlo
	374	sllx $Xhi,57,$Xhi
3766e7cc	375	srax $Hhi,63,$C0 ! broadcast carry
24798c5e AP	376	addcc $Hlo,$Hlo,$Hlo ! H<<=1
24798c5e AP	377	addxc $Hhi,$Hhi,$Hhi
3766e7cc AP	378	and $C0,$Xlo,$Xlo
3766e7cc AP	379	and $C0,$Xhi,$Xhi
24798c5e AP	380	xor $Xlo,$Hlo,$Hlo
	381	xor $Xhi,$Hhi,$Hhi
	382	stx $Hlo,[%i0+8] ! save twisted H
	383	stx $Hhi,[%i0+0]
23328d4b	384
3766e7cc AP	385	sethi %hi(0xA0406080),$V
	386	sethi %hi(0x20C0E000),%l0
	387	or $V,%lo(0xA0406080),$V
	388	or %l0,%lo(0x20C0E000),%l0
	389	sllx $V,32,$V
053fa39a	390	or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
3766e7cc AP	391	stx $V,[%i0+16]
3766e7cc AP	392
24798c5e AP	393	ret
	394	restore
	395	.type gcm_init_vis3,#function
	396	.size gcm_init_vis3,.-gcm_init_vis3
23328d4b	397
24798c5e AP	398	.globl gcm_gmult_vis3
	399	.align 32
	400	gcm_gmult_vis3:
	401	save %sp,-$frame,%sp
23328d4b	402
24798c5e AP	403	ldx [$Xip+8],$Xlo ! load Xi
	404	ldx [$Xip+0],$Xhi
	405	ldx [$Htable+8],$Hlo ! load twisted H
	406	ldx [$Htable+0],$Hhi
	407
3766e7cc AP	408	mov 0xE1,%l7
3766e7cc AP	409	sllx %l7,57,$xE1 ! 57 is not a typo
053fa39a	410	ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
24798c5e	411
3766e7cc	412	xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
24798c5e AP	413	xmulx $Xlo,$Hlo,$C0
	414	xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
	415	xmulx $C2,$Hhl,$C1
	416	xmulxhi $Xlo,$Hlo,$Xlo
	417	xmulxhi $C2,$Hhl,$C2
	418	xmulxhi $Xhi,$Hhi,$C3
	419	xmulx $Xhi,$Hhi,$Xhi
	420
	421	sll $C0,3,$sqr
053fa39a	422	srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
24798c5e	423	xor $C0,$sqr,$sqr
053fa39a	424	sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
24798c5e AP	425
	426	xor $C0,$C1,$C1 ! Karatsuba post-processing
	427	xor $Xlo,$C2,$C2
3766e7cc	428	xor $sqr,$Xlo,$Xlo ! real destination is $C1
24798c5e AP	429	xor $C3,$C2,$C2
24798c5e AP	430	xor $Xlo,$C1,$C1
3766e7cc AP	431	xor $Xhi,$C2,$C2
3766e7cc AP	432	xor $Xhi,$C1,$C1
24798c5e	433
053fa39a	434	xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
24798c5e AP	435	xor $C0,$C2,$C2
	436	xmulx $C1,$xE1,$C0
	437	xor $C1,$C3,$C3
	438	xmulxhi $C1,$xE1,$C1
	439
	440	xor $Xlo,$C2,$C2
24798c5e AP	441	xor $C0,$C2,$C2
	442	xor $C1,$C3,$C3
	443
	444	stx $C2,[$Xip+8] ! save Xi
	445	stx $C3,[$Xip+0]
23328d4b AP	446
	447	ret
	448	restore
	449	.type gcm_gmult_vis3,#function
	450	.size gcm_gmult_vis3,.-gcm_gmult_vis3
	451
	452	.globl gcm_ghash_vis3
	453	.align 32
	454	gcm_ghash_vis3:
	455	save %sp,-$frame,%sp
f198cc43 AP	456	nop
f198cc43 AP	457	srln $len,0,$len ! needed on v8+, "nop" on v9
23328d4b	458
24798c5e AP	459	ldx [$Xip+8],$C2 ! load Xi
	460	ldx [$Xip+0],$C3
	461	ldx [$Htable+8],$Hlo ! load twisted H
	462	ldx [$Htable+0],$Hhi
	463
24798c5e	464	mov 0xE1,%l7
24798c5e	465	sllx %l7,57,$xE1 ! 57 is not a typo
053fa39a	466	ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
24798c5e	467
23328d4b AP	468	and $inp,7,$shl
23328d4b AP	469	andn $inp,7,$inp
23328d4b AP	470	sll $shl,3,$shl
23328d4b AP	471	prefetch [$inp+63], 20
23328d4b	472	sub %g0,$shl,$shr
24798c5e AP	473
24798c5e AP	474	xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
23328d4b	475	.Loop:
24798c5e	476	ldx [$inp+8],$Xlo
23328d4b	477	brz,pt $shl,1f
24798c5e AP	478	ldx [$inp+0],$Xhi
	479
	480	ldx [$inp+16],$C1 ! align data
	481	srlx $Xlo,$shr,$C0
	482	sllx $Xlo,$shl,$Xlo
	483	sllx $Xhi,$shl,$Xhi
	484	srlx $C1,$shr,$C1
	485	or $C0,$Xhi,$Xhi
	486	or $C1,$Xlo,$Xlo
23328d4b AP	487	1:
	488	add $inp,16,$inp
	489	sub $len,16,$len
24798c5e AP	490	xor $C2,$Xlo,$Xlo
24798c5e AP	491	xor $C3,$Xhi,$Xhi
23328d4b AP	492	prefetch [$inp+63], 20
23328d4b AP	493
24798c5e AP	494	xmulx $Xlo,$Hlo,$C0
	495	xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
	496	xmulx $C2,$Hhl,$C1
	497	xmulxhi $Xlo,$Hlo,$Xlo
	498	xmulxhi $C2,$Hhl,$C2
	499	xmulxhi $Xhi,$Hhi,$C3
	500	xmulx $Xhi,$Hhi,$Xhi
	501
	502	sll $C0,3,$sqr
053fa39a	503	srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
24798c5e	504	xor $C0,$sqr,$sqr
053fa39a	505	sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
24798c5e AP	506
	507	xor $C0,$C1,$C1 ! Karatsuba post-processing
	508	xor $Xlo,$C2,$C2
3766e7cc	509	xor $sqr,$Xlo,$Xlo ! real destination is $C1
24798c5e AP	510	xor $C3,$C2,$C2
24798c5e AP	511	xor $Xlo,$C1,$C1
3766e7cc AP	512	xor $Xhi,$C2,$C2
3766e7cc AP	513	xor $Xhi,$C1,$C1
24798c5e	514
053fa39a	515	xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
24798c5e AP	516	xor $C0,$C2,$C2
	517	xmulx $C1,$xE1,$C0
	518	xor $C1,$C3,$C3
	519	xmulxhi $C1,$xE1,$C1
	520
	521	xor $Xlo,$C2,$C2
24798c5e	522	xor $C0,$C2,$C2
23328d4b	523	brnz,pt $len,.Loop
24798c5e	524	xor $C1,$C3,$C3
23328d4b	525
24798c5e AP	526	stx $C2,[$Xip+8] ! save Xi
24798c5e AP	527	stx $C3,[$Xip+0]
23328d4b AP	528
	529	ret
	530	restore
	531	.type gcm_ghash_vis3,#function
	532	.size gcm_ghash_vis3,.-gcm_ghash_vis3
	533	___
	534	}}}
	535	$code.=<<___;
	536	.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
c32fcca6	537	.align 4
c3473126 AP	538	___
c3473126 AP	539
23328d4b AP	540	\f
	541	# Purpose of these subroutines is to explicitly encode VIS instructions,
	542	# so that one can compile the module without having to specify VIS
478b50cf	543	# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
23328d4b AP	544	# Idea is to reserve for option to produce "universal" binary and let
	545	# programmer detect if current CPU is VIS capable at run-time.
	546	sub unvis3 {
	547	my ($mnemonic,$rs1,$rs2,$rd)=@_;
	548	my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
	549	my ($ref,$opf);
	550	my %visopf = ( "addxc" => 0x011,
	551	"addxccc" => 0x013,
	552	"xmulx" => 0x115,
	553	"xmulxhi" => 0x116 );
	554
	555	$ref = "$mnemonic\t$rs1,$rs2,$rd";
	556
	557	if ($opf=$visopf{$mnemonic}) {
	558	foreach ($rs1,$rs2,$rd) {
	559	return $ref if (!/%([goli])([0-9])/);
	560	$_=$bias{$1}+$2;
	561	}
	562
	563	return sprintf ".word\t0x%08x !%s",
	564	0x81b00000\|$rd<<25\|$rs1<<14\|$opf<<5\|$rs2,
	565	$ref;
	566	} else {
	567	return $ref;
	568	}
	569	}
	570
	571	foreach (split("\n",$code)) {
	572	s/\`([^\`]*)\`/eval $1/ge;
	573
	574	s/\b(xmulx[hi]\|addxc[c]{0,2})\s+(%[goli][0-7]),\s(%[goli][0-7]),\s*(%[goli][0-7])/
	575	&unvis3($1,$2,$3,$4)
	576	/ge;
	577
	578	print $_,"\n";
	579	}
	580
c3473126	581	close STDOUT;