[thirdparty/openssl.git] / crypto / md5 / asm / md5-sparcv9.pl

#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller.
# ====================================================================

# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
# code generated by Sun C 5.2.

# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
# faster than software. Multi-process benchmark saturates at 12x
# single-process result on 8-core processor, or ~11GBps per 2.85GHz
# socket.

# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Redirect the generated assembly into $output when one was given.  The
# three-argument form keeps characters in the file name from being read as
# an open mode, and a failed open is fatal instead of silently leaving the
# output on the inherited STDOUT.
if ($output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}

use integer;

# Incoming arguments: context pointer, input pointer, number of 64-byte blocks.
$ctx="%i0";
$inp="%i1";
$len="%i2";

# Registers used for 64-bit values: nine input double-words, one scratch
# register and the packed copies of A,B and C,D.
@X=(map("%o$_",(0..5)),"%o7","%g1","%g2");
$tx="%g3";
$AB="%g4";
$CD="%g5";

# Registers used for 32-bit values: the four working variables (also kept
# in @V, which is rotated between steps), three temporaries, the saved
# %asi, and the three shift counts used to align the input.
($A,$B,$C,$D,$t1,$t2,$t3,$saved_asi)=map("%l$_",(0..7));
@V=($A,$B,$C,$D);
$shr="%i3";
$shl1="%i4";
$shl2="%i5";

# The 64 per-step additive constants of MD5, K[0]..K[63], followed by a
# zero sentinel.  Every generated step pre-computes X[..]+K[$i+1] for the
# step that follows it, so the last step ($i==63) reads K[64]; the extra
# 0 keeps that reference defined.
my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
	0x6b901122,0xfd987193,0xa679438e,0x49b40821,

	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,

	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,

	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0	);

# Append the assembly for step $i (0..14) of the first round to $code.
#
#   $i           step number
#   $a,$b,$c,$d  registers holding the working variables for this step
#
# On entry $t1 holds c^d and $t2 holds X[$i]+K[$i].  Besides the step
# itself, the emitted code prepares X[$i+1]+K[$i+1] in $t2 and b^c in $t1
# for the following step.  For even $i the next word is the upper half of
# the 64-bit register already aligned; for odd $i it starts the next
# 64-bit register, which is first aligned with $shr/$shl1/$shl2.
sub R0 {
  my ($i,$a,$b,$c,$d) = @_;
  my $rot = (7,12,17,22)[$i%4];		# left-rotate amount for this step
  my $j   = ($i+1)/2;			# 64-bit register holding X[$i+1]
  my $k   = $K[$i+1];			# constant for the following step
  my $w   = $X[$j];

  if (($i&1)==0) {
    $code.=<<___;
	 srlx	$w,32,$tx		! extract X[`2*$j+1`]
	and	$b,$t1,$t1		! round $i
	add	$t2,$a,$a
	xor	$d,$t1,$t1
	 sethi	%hi($k),$t2
	add	$t1,$a,$a
	 or	$t2,%lo($k),$t2
	sll	$a,$rot,$t3
	 add	$tx,$t2,$t2		! X[`2*$j+1`]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 xor	 $b,$c,$t1
	add	$t3,$a,$a
___
  } else {
    my $wnext = $X[$j+1];		# supplies the bits shifted in while aligning
    $code.=<<___;
	 srlx	$w,$shr,$w	! align X[`$i+1`]
	and	$b,$t1,$t1		! round $i
	 sllx	$wnext,$shl1,$tx
	add	$t2,$a,$a
	 sllx	$tx,$shl2,$tx
	xor	$d,$t1,$t1
	 or	$tx,$w,$w
	 sethi	%hi($k),$t2
	add	$t1,$a,$a
	 or	$t2,%lo($k),$t2
	sll	$a,$rot,$t3
	 add	$w,$t2,$t2		! X[`$i+1`]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 xor	 $b,$c,$t1
	add	$t3,$a,$a
___
  }
}

# Append the assembly for the last step of the first round to $code (the
# caller uses it for $i==15).  It differs from R0 in what it prepares for
# the following step: X[1], taken from the upper half of @X[0], is added
# to K[$i+1], and $t1 is left holding b & ~c (andn) instead of b^c.
#
#   $i           step number
#   $a,$b,$c,$d  registers holding the working variables for this step
sub R0_1 {
  my ($i,$a,$b,$c,$d) = @_;
  my @rotate = (7,12,17,22);
  my $rot    = $rotate[$i%4];		# left-rotate amount for this step
  my $k      = $K[$i+1];		# constant for the following step
  my $first  = $X[0];			# 64-bit register holding X[0] and X[1]

  $code.=<<___;
	 srlx	$first,32,$tx		! extract X[1]
	and	$b,$t1,$t1		! round $i
	add	$t2,$a,$a
	xor	$d,$t1,$t1
	 sethi	%hi($k),$t2
	add	$t1,$a,$a
	 or	$t2,%lo($k),$t2
	sll	$a,$rot,$t3
	 add	$tx,$t2,$t2		! X[1]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 andn	 $b,$c,$t1
	add	$t3,$a,$a
___
}

# Append the assembly for step $i of the second round to $code (the caller
# uses it for $i in 16..31).
#
#   $i           step number
#   $a,$b,$c,$d  registers holding the working variables for this step
#
# $j is the index of the message word consumed by the FOLLOWING step; for
# $i==31 it is computed with the third-round index formula.  An
# even-numbered word is read straight from its 64-bit register; an
# odd-numbered one is first shifted down into $tx.
sub R1 {
  my ($i,$a,$b,$c,$d) = @_;
  my $rot  = (5,9,14,20)[$i%4];		# left-rotate amount for this step
  my $j    = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
  my $pair = $X[$j/2];			# 64-bit register holding X[$j]
  my $xi   = $pair;			# operand added to K[$i+1]
  my $k    = $K[$i+1];

  if ($j&1) {
    $xi = $tx;
    $code.=<<___ if ($xi);
	 srlx	$pair,32,$xi		! extract X[$j]
___
  }

  $code.=<<___;
	and	$b,$d,$t3		! round $i
	add	$t2,$a,$a
	or	$t3,$t1,$t1
	 sethi	%hi($k),$t2
	add	$t1,$a,$a
	 or	$t2,%lo($k),$t2
	sll	$a,$rot,$t3
	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 `$i<31?"andn":"xor"`	 $b,$c,$t1
	add	$t3,$a,$a
___
}

# Append the assembly for step $i of the third round to $code (the caller
# uses it for $i in 32..47).
#
#   $i           step number
#   $a,$b,$c,$d  registers holding the working variables for this step
#
# $j is the index of the message word consumed by the FOLLOWING step; for
# $i==47 it is computed with the fourth-round index formula.  An
# even-numbered word is read straight from its 64-bit register; an
# odd-numbered one is first shifted down into $tx.
sub R2 {
  my ($i,$a,$b,$c,$d) = @_;
  my $rot  = (4,11,16,23)[$i%4];	# left-rotate amount for this step
  my $j    = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
  my $pair = $X[$j/2];			# 64-bit register holding X[$j]
  my $xi   = $pair;			# operand added to K[$i+1]
  my $k    = $K[$i+1];

  if ($j&1) {
    $xi = $tx;
    $code.=<<___ if ($xi);
	 srlx	$pair,32,$xi		! extract X[$j]
___
  }

  $code.=<<___;
	add	$t2,$a,$a		! round $i
	xor	$b,$t1,$t1
	 sethi	%hi($k),$t2
	add	$t1,$a,$a
	 or	$t2,%lo($k),$t2
	sll	$a,$rot,$t3
	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 xor	 $b,$c,$t1
	add	$t3,$a,$a
___
}

# Append the assembly for step $i of the fourth round to $code (the caller
# uses it for $i in 48..63).
#
#   $i           step number
#   $a,$b,$c,$d  registers holding the working variables for this step
#
# $j is the index of the message word consumed by the FOLLOWING step.  An
# even-numbered word is read straight from its 64-bit register; an
# odd-numbered one is first shifted down into $tx, and that shift is
# emitted after the first instruction of the step.
sub R3 {
  my ($i,$a,$b,$c,$d) = @_;
  my $rot  = (6,10,15,21)[$i%4];	# left-rotate amount for this step
  my $j    = (0+7*($i+1))%16;
  my $pair = $X[$j/2];			# 64-bit register holding X[$j]
  my $xi   = $pair;			# operand added to K[$i+1]
  my $k    = $K[$i+1];

  $code.=<<___;
	add	$t2,$a,$a		! round $i
___

  if ($j&1) {
    $xi = $tx;
    $code.=<<___ if ($xi);
	 srlx	$pair,32,$xi		! extract X[$j]
___
  }

  $code.=<<___;
	orn	$b,$d,$t1
	 sethi	%hi($k),$t2
	xor	$c,$t1,$t1
	 or	$t2,%lo($k),$t2
	add	$t1,$a,$a
	sll	$a,$rot,$t3
	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	add	$t3,$a,$a
___
}

# Emit the module prologue and the md5_block_asm_data_order entry point.
# The entry code tests CFR_MD5 in OPENSSL_sparcv9cap_P[1]: when the bit is
# clear it branches to .Lsoftware, otherwise it runs the hardware path,
# which keeps the context in %f0-%f3, loads each 64-byte block into
# %f8-%f22 and issues the MD5 opcode as a raw .word (with separate loops
# for 8-byte aligned and unaligned input).  The template ends inside
# .Loop of the software path, after the block has been loaded and
# X[0]+K[0] has been prepared for step 0; the 64 steps themselves are
# appended by the R0..R3 generators.
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	md5_block_asm_data_order
.align	32
md5_block_asm_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_MD5, %g0
	be	.Lsoftware
	nop

	mov	4, %g1
	andcc	%o1, 0x7, %g0
	lda	[%o0 + %g0]0x88, %f0		! load context
	lda	[%o0 + %g1]0x88, %f1
	add	%o0, 8, %o0
	lda	[%o0 + %g0]0x88, %f2
	lda	[%o0 + %g1]0x88, %f3
	bne,pn	%icc, .Lhwunaligned
	sub	%o0, 8, %o0

.Lhw_loop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02800		! MD5

	bne,pt	SIZE_T_CC, .Lhw_loop
	nop

.Lhwfinish:
	sta	%f0, [%o0 + %g0]0x88	! store context
	sta	%f1, [%o0 + %g1]0x88
	add	%o0, 8, %o0
	sta	%f2, [%o0 + %g0]0x88
	sta	%f3, [%o0 + %g1]0x88
	retl
	nop

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02800		! MD5

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop

.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME,%sp

	rd	%asi,$saved_asi
	wr	%g0,0x88,%asi		! ASI_PRIMARY_LITTLE
	and	$inp,7,$shr
	andn	$inp,7,$inp

	sll	$shr,3,$shr		! *=8
	mov	56,$shl2
	ld	[$ctx+0],$A
	sub	$shl2,$shr,$shl2
	ld	[$ctx+4],$B
	and	$shl2,32,$shl1
	add	$shl2,8,$shl2
	ld	[$ctx+8],$C
	sub	$shl2,$shl1,$shl2	! shr+shl1+shl2==64
	ld	[$ctx+12],$D
	nop

.Loop:
	 cmp	$shr,0			! was inp aligned?
	ldxa	[$inp+0]%asi,@X[0]	! load little-endian input
	ldxa	[$inp+8]%asi,@X[1]
	ldxa	[$inp+16]%asi,@X[2]
	ldxa	[$inp+24]%asi,@X[3]
	ldxa	[$inp+32]%asi,@X[4]
	 sllx	$A,32,$AB		! pack A,B
	ldxa	[$inp+40]%asi,@X[5]
	 sllx	$C,32,$CD		! pack C,D
	ldxa	[$inp+48]%asi,@X[6]
	 or	$B,$AB,$AB
	ldxa	[$inp+56]%asi,@X[7]
	 or	$D,$CD,$CD
	bnz,a,pn	%icc,.+8
	ldxa	[$inp+64]%asi,@X[8]

	srlx	@X[0],$shr,@X[0]	! align X[0]
	sllx	@X[1],$shl1,$tx
	 sethi	%hi(@K[0]),$t2
	sllx	$tx,$shl2,$tx
	 or	$t2,%lo(@K[0]),$t2
	or	$tx,@X[0],@X[0]
	 xor	$C,$D,$t1
	 add	@X[0],$t2,$t2		! X[0]+K[0]
___
	# Generate the 64 steps: R0 for 0..14, R0_1 for 15, R1 for 16..31,
	# R2 for 32..47 and R3 for 48..63.  After every step the register
	# list is rotated right by one, so (A,B,C,D) becomes (D,A,B,C).
	for ($i=0;$i<64;$i++) {
		if    ($i<15)	{ &R0($i,@V);   }
		elsif ($i<16)	{ &R0_1($i,@V); }
		elsif ($i<32)	{ &R1($i,@V);   }
		elsif ($i<48)	{ &R2($i,@V);   }
		else		{ &R3($i,@V);   }
		unshift(@V,pop(@V));
	}
# Emit the tail of the software loop: add the working variables to the
# values packed into $AB/$CD at the top of the iteration, advance $inp by
# 64 bytes, loop while $len is non-zero, then store the context, restore
# %asi and return.  The symbol type/size directives and the
# identification string close the module.
$code.=<<___;
	srlx	$AB,32,$t1		! unpack A,B,C,D and accumulate
	add	$inp,64,$inp		! advance inp
	srlx	$CD,32,$t2
	add	$t1,$A,$A
	subcc	$len,1,$len		! done yet?
	add	$AB,$B,$B
	add	$t2,$C,$C
	add	$CD,$D,$D
	srl	$B,0,$B			! clruw	$B
	bne	SIZE_T_CC,.Loop
	srl	$D,0,$D			! clruw	$D

	st	$A,[$ctx+0]		! write out ctx
	st	$B,[$ctx+4]
	st	$C,[$ctx+8]
	st	$D,[$ctx+12]

	wr	%g0,$saved_asi,%asi
	ret
	restore
.type	md5_block_asm_data_order,#function
.size	md5_block_asm_data_order,(.-md5_block_asm_data_order)

.asciz	"MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary and
# letting the programmer detect at run-time whether the CPU is VIS capable.
# Encode a three-operand VIS floating-point instruction as a raw .word.
#
#   $mnemonic       instruction name; only "faligndata" and "for" are encoded
#   $rs1,$rs2,$rd   operands written as %fN
#
# Returns ".word\t0x........ !<original text>" when the instruction can be
# encoded.  Otherwise the instruction text "$mnemonic\t$rs1,$rs2,$rd" is
# returned unchanged: for an unknown mnemonic, for an operand that is not
# an %f register, or for an odd-numbered register at or above %f32.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);		# both lexical; "my $ref,$opf" left $opf a package global
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# $_ aliases each operand in turn, so assigning to it replaces the
	# register name with its 5-bit field value.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
# Encode an alignaddr instruction as a raw .word.
#
#   $mnemonic       instruction name, used only in the trailing comment
#   $rs1,$rs2,$rd   integer registers written as %gN, %oN, %lN or %iN (N in 0..7)
#
# Returns ".word\t0x........ !<original text>", or the instruction text
# "$mnemonic\t$rs1,$rs2,$rd" unchanged when an operand is not such a
# register.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $ref="$mnemonic\t$rs1,$rs2,$rd";
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my @regno;

    # translate every operand into its register number, bias + index
    for my $operand ($rs1,$rs2,$rd) {
	return $ref unless ($operand =~ /%([goli])([0-7])/);
	push @regno, $bias{$1}+$2;
    }

    my ($n1,$n2,$nd) = @regno;
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$nd<<25|$n1<<14|$n2,
		    $ref;
}

# Post-process the accumulated template one line at a time and print it.
for my $line (split("\n",$code)) {
	# replace every `...` with the result of evaluating it as Perl
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	# hand the f* instructions with three %f operands to unvis() ...
	$line =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	# ... and alignaddr to unalignaddr()
	$line =~ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $line,"\n";
}

# buffered write errors only surface here, so a failed close is fatal
close STDOUT or die "error closing STDOUT: $!";
Commit	Line	Data
6aa36e8e	1	#! /usr/bin/env perl
33388b44	2	# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e	3	#
4911f553	4	# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e RS	5	# this file except in compliance with the License. You can obtain a copy
	6	# in the file LICENSE in the source distribution or at
	7	# https://www.openssl.org/source/license.html
	8
e98c526b AP	9
e98c526b AP	10	# ====================================================================
aea4126e	11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
e98c526b AP	12	# project. The module is, however, dual licensed under OpenSSL and
	13	# CRYPTOGAMS licenses depending on where you obtain it. For further
	14	# details see http://www.openssl.org/~appro/cryptogams/.
e66055b8	15	#
e3713c36	16	# Hardware SPARC T4 support by David S. Miller.
e98c526b AP	17	# ====================================================================
e98c526b AP	18
27e0c863	19	# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
e98c526b AP	20	# code generated by Sun C 5.2.
e98c526b AP	21
d17b59e4	22	# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
e66055b8 AP	23	# faster than software. Multi-process benchmark saturates at 12x
	24	# single-process result on 8-core processor, or ~11GBps per 2.85GHz
	25	# socket.
	26
1aa89a7a RL	27	# $output is the last argument if it looks like a file (it has an extension)
	28	$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m\|\.\w+$\| ? pop : undef;
	29
	30	$output and open STDOUT,">$output";
e98c526b AP	31
	32	use integer;
	33
	34	($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
	35
	36	# 64-bit values
	37	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
	38	$tx="%g3";
	39	($AB,$CD)=("%g4","%g5");
	40
	41	# 32-bit values
	42	@V=($A,$B,$C,$D)=map("%l$_",(0..3));
	43	($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
	44	($shr,$shl1,$shl2)=("%i3","%i4","%i5");
	45
	46	my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
	47	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
	48	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
	49	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
	50
	51	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
	52	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
	53	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
	54	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
	55
	56	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
	57	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
	58	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
	59	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
	60
	61	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
	62	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
	63	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
	64	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 );
	65
	66	sub R0 {
	67	my ($i,$a,$b,$c,$d) = @_;
	68	my $rot = (7,12,17,22)[$i%4];
	69	my $j = ($i+1)/2;
	70
	71	if ($i&1) {
	72	$code.=<<___;
	73	srlx @X[$j],$shr,@X[$j] ! align X[`$i+1`]
	74	and $b,$t1,$t1 ! round $i
	75	sllx @X[$j+1],$shl1,$tx
	76	add $t2,$a,$a
	77	sllx $tx,$shl2,$tx
	78	xor $d,$t1,$t1
	79	or $tx,@X[$j],@X[$j]
	80	sethi %hi(@K[$i+1]),$t2
	81	add $t1,$a,$a
	82	or $t2,%lo(@K[$i+1]),$t2
	83	sll $a,$rot,$t3
	84	add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
	85	srl $a,32-$rot,$a
	86	add $b,$t3,$t3
	87	xor $b,$c,$t1
	88	add $t3,$a,$a
	89	___
	90	} else {
	91	$code.=<<___;
	92	srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
	93	and $b,$t1,$t1 ! round $i
	94	add $t2,$a,$a
95	xor $d,$t1,$t1
96	sethi %hi(@K[$i+1]),$t2
97	add $t1,$a,$a
98	or $t2,%lo(@K[$i+1]),$t2
99	sll $a,$rot,$t3
100	add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
101	srl $a,32-$rot,$a
102	add $b,$t3,$t3
103	xor $b,$c,$t1
104	add $t3,$a,$a
105	___
106	}
107	}
108
109	sub R0_1 {
110	my ($i,$a,$b,$c,$d) = @_;
111	my $rot = (7,12,17,22)[$i%4];
112
113	$code.=<<___;
114	srlx @X[0],32,$tx ! extract X[1]
115	and $b,$t1,$t1 ! round $i
116	add $t2,$a,$a
117	xor $d,$t1,$t1
118	sethi %hi(@K[$i+1]),$t2
119	add $t1,$a,$a
120	or $t2,%lo(@K[$i+1]),$t2
121	sll $a,$rot,$t3
122	add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
123	srl $a,32-$rot,$a
124	add $b,$t3,$t3
125	andn $b,$c,$t1
126	add $t3,$a,$a
127	___
128	}
129
130	sub R1 {
131	my ($i,$a,$b,$c,$d) = @_;
132	my $rot = (5,9,14,20)[$i%4];
133	my $j = $i<31 ? (1+5($i+1))%16 : (5+3($i+1))%16;
134	my $xi = @X[$j/2];
135
136	$code.=<<___ if ($j&1 && ($xi=$tx));
137	srlx @X[$j/2],32,$xi ! extract X[$j]
138	___
139	$code.=<<___;
140	and $b,$d,$t3 ! round $i
141	add $t2,$a,$a
142	or $t3,$t1,$t1
143	sethi %hi(@K[$i+1]),$t2
144	add $t1,$a,$a
145	or $t2,%lo(@K[$i+1]),$t2
146	sll $a,$rot,$t3
147	add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
148	srl $a,32-$rot,$a
149	add $b,$t3,$t3
150	`$i<31?"andn":"xor"` $b,$c,$t1
151	add $t3,$a,$a
152	___
153	}
154
155	sub R2 {
156	my ($i,$a,$b,$c,$d) = @_;
157	my $rot = (4,11,16,23)[$i%4];
158	my $j = $i<47 ? (5+3($i+1))%16 : (0+7($i+1))%16;
159	my $xi = @X[$j/2];
160
161	$code.=<<___ if ($j&1 && ($xi=$tx));
162	srlx @X[$j/2],32,$xi ! extract X[$j]
163	___
164	$code.=<<___;
165	add $t2,$a,$a ! round $i
166	xor $b,$t1,$t1
167	sethi %hi(@K[$i+1]),$t2
168	add $t1,$a,$a
169	or $t2,%lo(@K[$i+1]),$t2
170	sll $a,$rot,$t3
171	add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
172	srl $a,32-$rot,$a
173	add $b,$t3,$t3
174	xor $b,$c,$t1
175	add $t3,$a,$a
176	___
177	}
178
179	sub R3 {
180	my ($i,$a,$b,$c,$d) = @_;
181	my $rot = (6,10,15,21)[$i%4];
182	my $j = (0+7*($i+1))%16;
183	my $xi = @X[$j/2];
184
185	$code.=<<___;
186	add $t2,$a,$a ! round $i
187	___
188	$code.=<<___ if ($j&1 && ($xi=$tx));
189	srlx @X[$j/2],32,$xi ! extract X[$j]
190	___
191	$code.=<<___;
192	orn $b,$d,$t1
193	sethi %hi(@K[$i+1]),$t2
194	xor $c,$t1,$t1
195	or $t2,%lo(@K[$i+1]),$t2
196	add $t1,$a,$a
197	sll $a,$rot,$t3
198	add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
199	srl $a,32-$rot,$a
200	add $b,$t3,$t3
201	add $t3,$a,$a
202	___
203	}
204
e98c526b	205	$code.=<<___;
e66055b8 AP	206	#include "sparc_arch.h"
e66055b8 AP	207
1efd5830 AP	208	#ifdef __arch64__
	209	.register %g2,#scratch
	210	.register %g3,#scratch
	211	#endif
	212
e98c526b AP	213	.section ".text",#alloc,#execinstr
e98c526b AP	214
e66055b8 AP	215	#ifdef __PIC__
	216	SPARC_PIC_THUNK(%g1)
	217	#endif
	218
e98c526b AP	219	.globl md5_block_asm_data_order
	220	.align 32
	221	md5_block_asm_data_order:
e66055b8 AP	222	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	223	ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
	224
	225	andcc %g1, CFR_MD5, %g0
	226	be .Lsoftware
	227	nop
	228
d17b59e4	229	mov 4, %g1
e66055b8	230	andcc %o1, 0x7, %g0
d17b59e4 AP	231	lda [%o0 + %g0]0x88, %f0 ! load context
	232	lda [%o0 + %g1]0x88, %f1
	233	add %o0, 8, %o0
	234	lda [%o0 + %g0]0x88, %f2
	235	lda [%o0 + %g1]0x88, %f3
e66055b8	236	bne,pn %icc, .Lhwunaligned
d17b59e4	237	sub %o0, 8, %o0
e66055b8 AP	238
	239	.Lhw_loop:
	240	ldd [%o1 + 0x00], %f8
	241	ldd [%o1 + 0x08], %f10
	242	ldd [%o1 + 0x10], %f12
	243	ldd [%o1 + 0x18], %f14
	244	ldd [%o1 + 0x20], %f16
	245	ldd [%o1 + 0x28], %f18
	246	ldd [%o1 + 0x30], %f20
609b0852	247	subcc %o2, 1, %o2 ! done yet?
e66055b8 AP	248	ldd [%o1 + 0x38], %f22
e66055b8 AP	249	add %o1, 0x40, %o1
aea4126e	250	prefetch [%o1 + 63], 20
e66055b8 AP	251
	252	.word 0x81b02800 ! MD5
	253
1efd5830	254	bne,pt SIZE_T_CC, .Lhw_loop
e66055b8 AP	255	nop
	256
	257	.Lhwfinish:
d17b59e4 AP	258	sta %f0, [%o0 + %g0]0x88 ! store context
	259	sta %f1, [%o0 + %g1]0x88
	260	add %o0, 8, %o0
	261	sta %f2, [%o0 + %g0]0x88
	262	sta %f3, [%o0 + %g1]0x88
e66055b8	263	retl
d17b59e4	264	nop
e66055b8 AP	265
	266	.align 8
	267	.Lhwunaligned:
	268	alignaddr %o1, %g0, %o1
	269
	270	ldd [%o1 + 0x00], %f10
	271	.Lhwunaligned_loop:
	272	ldd [%o1 + 0x08], %f12
	273	ldd [%o1 + 0x10], %f14
	274	ldd [%o1 + 0x18], %f16
	275	ldd [%o1 + 0x20], %f18
	276	ldd [%o1 + 0x28], %f20
	277	ldd [%o1 + 0x30], %f22
	278	ldd [%o1 + 0x38], %f24
	279	subcc %o2, 1, %o2 ! done yet?
	280	ldd [%o1 + 0x40], %f26
	281	add %o1, 0x40, %o1
aea4126e	282	prefetch [%o1 + 63], 20
e66055b8 AP	283
	284	faligndata %f10, %f12, %f8
	285	faligndata %f12, %f14, %f10
	286	faligndata %f14, %f16, %f12
	287	faligndata %f16, %f18, %f14
	288	faligndata %f18, %f20, %f16
	289	faligndata %f20, %f22, %f18
	290	faligndata %f22, %f24, %f20
	291	faligndata %f24, %f26, %f22
	292
	293	.word 0x81b02800 ! MD5
	294
1efd5830	295	bne,pt SIZE_T_CC, .Lhwunaligned_loop
e66055b8 AP	296	for %f26, %f26, %f10 ! %f10=%f26
	297
	298	ba .Lhwfinish
	299	nop
	300
	301	.align 16
	302	.Lsoftware:
1efd5830	303	save %sp,-STACK_FRAME,%sp
e98c526b AP	304
	305	rd %asi,$saved_asi
	306	wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
	307	and $inp,7,$shr
	308	andn $inp,7,$inp
	309
	310	sll $shr,3,$shr ! *=8
	311	mov 56,$shl2
	312	ld [$ctx+0],$A
	313	sub $shl2,$shr,$shl2
	314	ld [$ctx+4],$B
	315	and $shl2,32,$shl1
	316	add $shl2,8,$shl2
	317	ld [$ctx+8],$C
	318	sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64
	319	ld [$ctx+12],$D
	320	nop
	321
	322	.Loop:
	323	cmp $shr,0 ! was inp aligned?
	324	ldxa [$inp+0]%asi,@X[0] ! load little-endian input
	325	ldxa [$inp+8]%asi,@X[1]
	326	ldxa [$inp+16]%asi,@X[2]
	327	ldxa [$inp+24]%asi,@X[3]
	328	ldxa [$inp+32]%asi,@X[4]
	329	sllx $A,32,$AB ! pack A,B
	330	ldxa [$inp+40]%asi,@X[5]
	331	sllx $C,32,$CD ! pack C,D
	332	ldxa [$inp+48]%asi,@X[6]
	333	or $B,$AB,$AB
	334	ldxa [$inp+56]%asi,@X[7]
	335	or $D,$CD,$CD
	336	bnz,a,pn %icc,.+8
	337	ldxa [$inp+64]%asi,@X[8]
	338
	339	srlx @X[0],$shr,@X[0] ! align X[0]
	340	sllx @X[1],$shl1,$tx
	341	sethi %hi(@K[0]),$t2
	342	sllx $tx,$shl2,$tx
	343	or $t2,%lo(@K[0]),$t2
	344	or $tx,@X[0],@X[0]
	345	xor $C,$D,$t1
	346	add @X[0],$t2,$t2 ! X[0]+K[0]
	347	___
	348	for ($i=0;$i<15;$i++) { &R0($i,@V); unshift(@V,pop(@V)); }
	349	for (;$i<16;$i++) { &R0_1($i,@V); unshift(@V,pop(@V)); }
	350	for (;$i<32;$i++) { &R1($i,@V); unshift(@V,pop(@V)); }
	351	for (;$i<48;$i++) { &R2($i,@V); unshift(@V,pop(@V)); }
	352	for (;$i<64;$i++) { &R3($i,@V); unshift(@V,pop(@V)); }
	353	$code.=<<___;
	354	srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate
	355	add $inp,64,$inp ! advance inp
	356	srlx $CD,32,$t2
	357	add $t1,$A,$A
	358	subcc $len,1,$len ! done yet?
	359	add $AB,$B,$B
	360	add $t2,$C,$C
	361	add $CD,$D,$D
	362	srl $B,0,$B ! clruw $B
1efd5830	363	bne SIZE_T_CC,.Loop
e98c526b AP	364	srl $D,0,$D ! clruw $D
	365
	366	st $A,[$ctx+0] ! write out ctx
	367	st $B,[$ctx+4]
	368	st $C,[$ctx+8]
	369	st $D,[$ctx+12]
	370
	371	wr %g0,$saved_asi,%asi
	372	ret
	373	restore
	374	.type md5_block_asm_data_order,#function
	375	.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
	376
	377	.asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
	378	.align 4
	379	___
	380
e66055b8 AP	381	# Purpose of these subroutines is to explicitly encode VIS instructions,
e66055b8 AP	382	# so that one can compile the module without having to specify VIS
478b50cf	383	# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
e66055b8 AP	384	# Idea is to reserve for option to produce "universal" binary and let
	385	# programmer detect if current CPU is VIS capable at run-time.
	386	sub unvis {
	387	my ($mnemonic,$rs1,$rs2,$rd)=@_;
	388	my $ref,$opf;
	389	my %visopf = ( "faligndata" => 0x048,
	390	"for" => 0x07c );
	391
	392	$ref = "$mnemonic\t$rs1,$rs2,$rd";
	393
	394	if ($opf=$visopf{$mnemonic}) {
	395	foreach ($rs1,$rs2,$rd) {
	396	return $ref if (!/%f([0-9]{1,2})/);
	397	$_=$1;
	398	if ($1>=32) {
	399	return $ref if ($1&1);
	400	# re-encode for upper double register addressing
	401	$_=($1\|$1>>5)&31;
	402	}
	403	}
	404
	405	return sprintf ".word\t0x%08x !%s",
	406	0x81b00000\|$rd<<25\|$rs1<<14\|$opf<<5\|$rs2,
	407	$ref;
	408	} else {
	409	return $ref;
	410	}
	411	}
	412	sub unalignaddr {
	413	my ($mnemonic,$rs1,$rs2,$rd)=@_;
	414	my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
	415	my $ref="$mnemonic\t$rs1,$rs2,$rd";
	416
	417	foreach ($rs1,$rs2,$rd) {
	418	if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
	419	else { return $ref; }
	420	}
	421	return sprintf ".word\t0x%08x !%s",
	422	0x81b00300\|$rd<<25\|$rs1<<14\|$rs2,
	423	$ref;
	424	}
	425
	426	foreach (split("\n",$code)) {
	427	s/\`([^\`]*)\`/eval $1/ge;
	428
	429	s/\b(f[^\s])\s+(%f[0-9]{1,2}),\s(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
	430	&unvis($1,$2,$3,$4)
	431	/ge;
	432	s/\b(alignaddr)\s+(%[goli][0-7]),\s(%[goli][0-7]),\s(%[goli][0-7])/
	433	&unalignaddr($1,$2,$3,$4)
	434	/ge;
	435
	436	print $_,"\n";
	437	}
	438
a21314db	439	close STDOUT or die "error closing STDOUT: $!";