#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, May, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+64/128 bytes of fixed
# table]. It has two code paths: vanilla x86 and vanilla SSE. The
# former is executed on 486 and Pentium, the latter on all others.
# SSE GHASH features the so-called "528B" variant of the "4-bit"
# method, utilizing an additional 256+16 bytes of per-key storage
# [+512 bytes of shared table]. Performance results are for the
# streamed GHASH subroutine and are expressed in cycles per processed
# byte, less is better:
#
#		gcc 2.95.3(*)	SSE assembler	x86 assembler
#
# Pentium	105/111(**)	-		50
# PIII		68 /75		12.2		24
# P4		125/125		17.8		84(***)
# Opteron	66 /70		10.1		30
# Core2		54 /67		8.4		18
#
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
#	which is one of the reasons why 2.95.3 results were chosen;
#	another reason is the lack of 3.4.x results for older CPUs.
#	Comparison with SSE results is not completely fair, because C
#	results are for the vanilla "256B" implementation, while
#	assembler results are for "528B";-)
# (**)	second number is for code compiled with the -fPIC flag,
#	which is actually more relevant, because the assembler code is
#	position-independent;
# (***)	see the comment in the non-MMX routine for further details;
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else, SHA1 assembler processes one byte in
# ~7 cycles on contemporary x86 cores. As for the choice of MMX/SSE
# in particular, see the comment at the end of the file...

# May 2010
#
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
# The question is how close this is to the theoretical limit. The
# pclmulqdq instruction latency appears to be 14 cycles and there
# can't be more than 2 of them executing at any given time. This means
# that a single Karatsuba multiplication takes 28 cycles *plus* a few
# cycles for pre- and post-processing. The multiplication then has to
# be followed by modulo-reduction. Given that the aggregated reduction
# method [see "Carry-less Multiplication and Its Usage for Computing
# the GCM Mode" white paper by Intel] allows the reduction to be
# performed only once in a while, the asymptotic performance can be
# estimated as (28+Tmod/Naggr)/16, where Tmod is the time to perform
# the reduction and Naggr is the aggregation factor.
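# (Karatsuba here means computing the 128x128-bit product of A=Ah:Al
# and B=Bh:Bl from three pclmulqdq results, Ah*Bh, Al*Bl and
# (Ah^Al)*(Bh^Bl), instead of four; with at most two multiplications
# in flight, the three of them occupy two full 14-cycle latencies,
# hence the 28 above.)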
#
# Before we proceed to this implementation, let's have a closer look
# at the best-performing code suggested by Intel in their white paper.
# By tracing inter-register dependencies, Tmod is estimated as ~19
# cycles and the Naggr chosen by Intel is 4, resulting in 2.05 cycles
# per processed byte. As implied, this is a quite optimistic estimate,
# because it does not account for Karatsuba pre- and post-processing,
# which for a single multiplication is ~5 cycles. Unfortunately Intel
# does not provide performance data for GHASH alone. But benchmarking
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note
# that this result even accounts for pre-computing the degrees of the
# hash key H, but their portion is negligible at 16KB buffer size.
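# (Those parameters plugged into the formula above indeed give
# (28+19/4)/16 = 32.75/16 ~= 2.05 cycles per processed byte.)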
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving an asymptotic performance of ...
# 2.16. How is it possible that the measured performance is better
# than the optimistic theoretical estimate? There is one thing Intel
# failed to recognize. By serializing GHASH with CTR in the same
# subroutine, the former's performance is really limited by the above
# (Tmul + Tmod/Naggr) equation. But if the GHASH procedure is
# detached, the modulo-reduction can be interleaved with Naggr-1
# multiplications at instruction level and under ideal conditions even
# disappear from the equation. So the optimistic theoretical estimate
# for this implementation is ... 28/16=1.75, and not 2.16. Well, that
# is probably way too optimistic, at least for such a small Naggr. I'd
# argue that (28+Tproc/Naggr)/16, where Tproc is the time required for
# Karatsuba pre- and post-processing, is a more realistic estimate. In
# this case it gives ... 1.91 cycles. In other words, depending on how
# well we can interleave the reduction with one of the two
# multiplications, the performance should be between 1.91 and 2.16. As
# already mentioned, this implementation processes one byte out of an
# 8KB buffer in 2.10 cycles, while the x86_64 counterpart does it in
# 2.02. The x86_64 performance is better because the larger register
# bank allows better interleaving of reduction and multiplication.
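# (Spelled out: 28/16 = 1.75 is the no-reduction lower bound;
# (28+5/2)/16 = 30.5/16 ~= 1.91 with Tproc~=5 and Naggr=2; and
# (28+13/2)/16 = 34.5/16 ~= 2.16 with Tmod~=13.)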
#
# Does it make sense to increase Naggr? To start with, it's virtually
# impossible in 32-bit mode, because of the limited register bank
# capacity. Otherwise the improvement has to be weighed against slower
# setup, as well as the increase in code size and complexity. As even
# the optimistic estimate doesn't promise a 30% performance
# improvement, there are currently no plans to increase Naggr.
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of the Intel
# Open Source Technology Centre.

# April 2011
#
# Tweaked to optimize transitions between integer and FP operations
# on the same XMM register, the PCLMULQDQ subroutine was measured to
# process one byte in 2.07 cycles on Sandy Bridge, and in 2.12 on
# Westmere. The minor regression on Westmere is outweighed by the ~15%
# improvement on Sandy Bridge. Strangely enough, an attempt to modify
# the 64-bit code in a similar manner resulted in almost 20%
# degradation on Sandy Bridge, where the original 64-bit code
# processes one byte in 1.95 cycles.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

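# Typical invocation (an assumed example; the exact arguments come
# from the build system):
#
#	perl ghash-x86.pl elf -DOPENSSL_IA32_SSE2 > ghash-x86.s
#
# The first argument selects the perlasm output flavour, the remaining
# arguments are scanned for -DOPENSSL_IA32_SSE2, and a trailing "386"
# forces the x86-only code path.
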
($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
$inp  = "edi";
$Htbl = "esi";

$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
		# than unrolled, which has to be weighed against
		# 2.5x x86-specific code size reduction.

sub x86_loop {
    my $off = shift;
    my $rem = "eax";

	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);	# avoid partial register stalls on PIII

	# shrd practically kills P4, 2.5x deterioration, but P4 has
	# MMX code-path to execute. shrd runs a tad faster [than twice
	# the shifts, moves and ors] on pre-MMX Pentium (as well as
	# PIII and Core2), *but* minimizes code size, spares a register
	# and thus allows folding the loop...
	if (!$unroll) {
	my $cnt = $inp;
	&mov	($cnt,15);
	&jmp	(&label("x86_loop"));
	&set_label("x86_loop",16);
	    for($i=1;$i<=2;$i++) {
		&mov	(&LB($rem),&LB($Zll));
		&shrd	($Zll,$Zlh,4);
		&and	(&LB($rem),0xf);
		&shrd	($Zlh,$Zhl,4);
		&shrd	($Zhl,$Zhh,4);
		&shr	($Zhh,4);
		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));

		&mov	(&LB($rem),&BP($off,"esp",$cnt));
		if ($i&1) {
			&and	(&LB($rem),0xf0);
		} else {
			&shl	(&LB($rem),4);
		}

		&xor	($Zll,&DWP(8,$Htbl,$rem));
		&xor	($Zlh,&DWP(12,$Htbl,$rem));
		&xor	($Zhl,&DWP(0,$Htbl,$rem));
		&xor	($Zhh,&DWP(4,$Htbl,$rem));

		if ($i&1) {
			&dec	($cnt);
			&js	(&label("x86_break"));
		} else {
			&jmp	(&label("x86_loop"));
		}
	    }
	&set_label("x86_break",16);
	} else {
	    for($i=1;$i<32;$i++) {
		&comment($i);
		&mov	(&LB($rem),&LB($Zll));
		&shrd	($Zll,$Zlh,4);
		&and	(&LB($rem),0xf);
		&shrd	($Zlh,$Zhl,4);
		&shrd	($Zhl,$Zhh,4);
		&shr	($Zhh,4);
		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));

		if ($i&1) {
			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
			&and	(&LB($rem),0xf0);
		} else {
			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
			&shl	(&LB($rem),4);
		}

		&xor	($Zll,&DWP(8,$Htbl,$rem));
		&xor	($Zlh,&DWP(12,$Htbl,$rem));
		&xor	($Zhl,&DWP(0,$Htbl,$rem));
		&xor	($Zhh,&DWP(4,$Htbl,$rem));
	    }
	}
	&bswap	($Zll);
	&bswap	($Zlh);
	&bswap	($Zhl);
	if (!$x86only) {
		&bswap	($Zhh);
	} else {
		&mov	("eax",$Zhh);
		&bswap	("eax");
		&mov	($Zhh,"eax");
	}
}

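# For reference, a minimal pure-Perl sketch (hypothetical helper, not
# called by this generator) of what the shrd/shr chain above computes:
# shifting the 128-bit accumulator Z, held in four 32-bit words, right
# by 4 bits, i.e. by one hash-table nibble.
sub _ref_shift_right4 {
	my ($zhh,$zhl,$zlh,$zll) = @_;
	$zll = (($zll>>4) | ($zlh<<28)) & 0xffffffff;	# shrd $Zll,$Zlh,4
	$zlh = (($zlh>>4) | ($zhl<<28)) & 0xffffffff;	# shrd $Zlh,$Zhl,4
	$zhl = (($zhl>>4) | ($zhh<<28)) & 0xffffffff;	# shrd $Zhl,$Zhh,4
	$zhh =   $zhh>>4;				# shr  $Zhh,4
	return ($zhh,$zhl,$zlh,$zll);
}
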
if ($unroll) {
    &function_begin_B("_x86_gmult_4bit_inner");
	&x86_loop(4);
	&ret	();
    &function_end_B("_x86_gmult_4bit_inner");
}

sub deposit_rem_4bit {
    my $bias = shift;

	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
}
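
# The sixteen constants deposited above are the carry-less multiples
# n*0x1C20 for n=0..15, i.e. the reduction contribution of the four
# bits shifted out of Z in each step. A minimal sketch (hypothetical
# helper, not called by this generator) reproducing the same values:
sub _ref_rem_4bit {
    my @tab;
    for my $n (0..15) {
	my $v = 0;
	for my $k (0..3) { $v ^= (0x1C20<<$k) if (($n>>$k)&1); }
	push @tab, $v<<16;	# matches the 0xNNNN<<16 stores above
    }
    return @tab;
}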

$suffix = $x86only ? "" : "_x86";

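# void gcm_gmult_4bit[_x86](u64 Xi[2], const u128 Htable[16]);
# (the C-level signature this routine implements: Xi is the 16-byte
# hash value, Htable the 256-byte per-key table)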
&function_begin("gcm_gmult_4bit".$suffix);
	&stack_push(16+4+1);		# +1 for stack alignment
	&mov	($inp,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable

	&mov	($Zhh,&DWP(0,$inp));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);

	&mov	(&DWP(0,"esp"),$Zhh);	# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);
	&shr	($Zll,20);
	&and	($Zll,0xf0);
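	# the two instructions above extract the low nibble of Xi[15],
	# pre-scaled by 16, i.e. the byte offset of the corresponding
	# 16-byte Htable entry used for the first lookup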

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(0));
	}

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_gmult_4bit".$suffix);

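# void gcm_ghash_4bit[_x86](u64 Xi[2], const u128 Htable[16],
#			    const u8 *inp, size_t len);
# (the C-level signature this routine implements: hashes len bytes of
# input in 16-byte blocks into Xi)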
&function_begin("gcm_ghash_4bit".$suffix);
	&stack_push(16+4+1);		# +1 for 64-bit alignment
	&mov	($Zll,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable
	&mov	($inp,&wparam(2));	# load in
	&mov	("ecx",&wparam(3));	# load len
	&add	("ecx",$inp);
	&mov	(&wparam(3),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);

	&set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));	# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);	# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&shr	($Zll,20);
	&and	($Zll,0xf0);

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(2));
	}
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)	if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(0));	# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_ghash_4bit".$suffix);

if (!$x86only) {{{

&static_label("rem_4bit");

if (!$sse2) {{	# pure-MMX "May" version...

$S=12;		# shift factor for rem_4bit

&function_begin_B("_mmx_gmult_4bit_inner");
# MMX version performs 3.5 times better on P4 (see comment in non-MMX
# routine for further details), 100% better on Opteron, ~70% better
# on Core2 and PIII... In other words, the effort is considered well
# spent... Since the initial release the loop was unrolled in order
# to "liberate" the register previously used as loop counter. Instead
# it's used to optimize the critical path in 'Z.hi ^=
# rem_4bit[Z.lo&0xf]'. The path involves a move of Z.lo from MMX to
# an integer register, effective address calculation and finally a
# merge of the value into Z.hi. The reference to rem_4bit is scheduled
# so late that I had to >>4 the rem_4bit elements. This resulted in a
# 20-45% improvement