[thirdparty/openssl.git] / crypto / rc4 / asm / rc4-x86_64.pl

#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# July 2004
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
# line results in ~40% improvement (yes, even for C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* config
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
# I simply 'inc %r8b'. Even though the optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...

# November 2004
#
# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# Latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.

# November 2004
#
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code that would perform even within a 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...

# April 2005
#
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...

# May 2005
#
# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
# paths, AMD64 and EM64T, reorder loads in essentially same manner
# as my IA-64 implementation. On Opteron this resulted in modest 5%
# improvement [I had to test it], while final Intel P4 performance
# achieves a respectable 432MBps on a 2.8GHz processor now. For reference:
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].

# March 2007
#
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
# cloop1 boosts its performance by 80%! This loop appears to be optimal
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.

# May 2010
#
# Intel Westmere was observed to perform suboptimally. Adding yet
# another movzb to cloop1 improved performance by almost 50%! Core2
# performance is improved too, but nominally...

# May 2011
#
# The only code path that was not modified is P4-specific one. Non-P4
# Intel code path optimization is heavily based on submission by Maxim
# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
# some of the ideas even in attempt to optimize the original RC4_INT
# code path... Current performance in cycles per processed byte (less
# is better) and improvement coefficients relative to previous
# version of this module are:
#
# Opteron	5.3/+0%(*)
# P4		6.5
# Core2		6.2/+15%(**)
# Westmere	4.2/+60%
# Sandy Bridge	4.2/+120%
# Atom		9.3/+80%
# VIA Nano	6.4/+4%
# Ivy Bridge	4.1/+30%
# Bulldozer	4.5/+30%(*)
#
# (*)	But corresponding loop has fewer instructions, which should have
#	positive effect on upcoming Bulldozer, which has one less ALU.
#	For reference, Intel code runs at 6.8 cpb rate on Opteron.
# (**)	Note that Core2 result is ~15% lower than corresponding result
#	for 32-bit code, meaning that it's possible to improve it,
#	but more than likely at the cost of the others (see rc4-586.pl
#	to get the idea)...

# Command line is "[flavour] [output]":
#  - the trailing argument becomes $output when it ends in an extension;
#  - the leading argument becomes $flavour when it contains no dot.
# Either one is left undefined when absent.
$output = undef;
$output = pop if (@ARGV && $ARGV[$#ARGV] =~ m|\.\w+$|);
$flavour = undef;
$flavour = shift if (@ARGV && $ARGV[0] !~ m|\.|);

# Win64 output is selected by an assembler flavour or by an .asm target.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../perlasm/x86_64-xlate.pl" unless (-f $xlate);
die "can't locate x86_64-xlate.pl" unless (-f $xlate);

# Everything printed from here on is piped through the translator.
open(OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"")
    or die "can't call $xlate: $!";
*STDOUT = *OUT;

# Incoming argument registers, interpolated into the assembly templates
# below. As used by the generated code: $dat is the base of the key/state
# block, $len the number of bytes to process, $inp the input pointer and
# $out the output pointer.
# NOTE(review): these are the Unix argument registers; the Win64 mapping is
# presumably handled by x86_64-xlate.pl via the "\@function,4" annotation
# - confirm against the translator.
$dat="%rdi";	    # arg1
$len="%rsi";	    # arg2
$inp="%rdx";	    # arg3
$out="%rcx";	    # arg4

# The bare block below generates RC4(). It exists so that the "my"
# register aliases declared inside it do not leak into the RC4_set_key
# template that follows the block.
# NOTE(review): .cfi_startproc is emitted at .Lentry rather than at the RC4
# label, so the zero-length early return is outside the CFI region -
# confirm whether that is intentional.
{
$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	RC4
.type	RC4,\@function,4
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	ret
.Lentry:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
.Lprologue:
	mov	$len,%r11
	mov	$inp,%r12
	mov	$out,%r13
___
# From here on the templates use the copies made in the prologue above,
# which frees %rsi, %rdx and %rcx for use as working registers.
my $len="%r11";		# reassign input arguments
my $inp="%r12";
my $out="%r13";

# Working registers for the generated loops:
#   @XX - index x (current, and the look-ahead index),
#   @TX - state words loaded at those indices,
#   $YY - index y,
#   $TY - state word loaded at index y.
my @XX=("%r10","%rsi");
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";

# Entry code: load x and y from the two words preceding the table, branch
# to .LRC4_CHAR when the word at 256($dat) is -1, otherwise pick .Lloop1
# (short input), .Lintel (bit 30 of OPENSSL_ia32cap_P set) or the 8x loop.
# The warm-up loop processes single bytes until x is a multiple of 8.
$code.=<<___;
	xor	$XX[0],$XX[0]
	xor	$YY,$YY

	lea	8($dat),$dat
	mov	-8($dat),$XX[0]#b
	mov	-4($dat),$YY#b
	cmpl	\$-1,256($dat)
	je	.LRC4_CHAR
	mov	OPENSSL_ia32cap_P(%rip),%r8d
	xor	$TX[1],$TX[1]
	inc	$XX[0]#b
	sub	$XX[0],$TX[1]
	sub	$inp,$out
	movl	($dat,$XX[0],4),$TX[0]#d
	test	\$-16,$len
	jz	.Lloop1
	bt	\$30,%r8d	# Intel CPU?
	jc	.Lintel
	and	\$7,$TX[1]
	lea	1($XX[0]),$XX[1]
	jz	.Loop8
	sub	$TX[1],$len
.Loop8_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out,$inp)
	lea	1($inp),$inp
	dec	$TX[1]
	jnz	.Loop8_warmup

	lea	1($XX[0]),$XX[1]
	jmp	.Loop8
.align	16
.Loop8:
___
# Emit the eight unrolled steps of .Loop8. Each step preloads the next
# state word into $TX[1] and shifts one keystream byte into %r8; @TX is
# rotated after every step so that the preloaded word becomes $TX[0] of the
# following step. On the last step the look-ahead index $XX[1] is advanced
# by 8 first, which is why that step uses displacement -4 instead of 4*$i.
for ($i=0;$i<8;$i++) {
$code.=<<___ if ($i==7);
	add	\$8,$XX[1]#b
___
$code.=<<___;
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	`4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
	ror	\$8,%r8				# ror is redundant when $i=0
	movl	$TY#d,4*$i($dat,$XX[0],4)
	add	$TX[0]#b,$TY#b
	movb	($dat,$TY,4),%r8b
___
push(@TX,shift(@TX)); #push(@XX,shift(@XX));	# "rotate" registers
}
# Tail of .Loop8 (xor 8 keystream bytes with the input and store them),
# followed by the Intel path entry: inputs shorter than 32 bytes go to
# .Lloop1, otherwise single bytes are processed until x is a multiple of
# 16 before entering the 16x loop.
$code.=<<___;
	add	\$8,$XX[0]#b
	ror	\$8,%r8
	sub	\$8,$len

	xor	($inp),%r8
	mov	%r8,($out,$inp)
	lea	8($inp),$inp

	test	\$-8,$len
	jnz	.Loop8
	cmp	\$0,$len
	jne	.Lloop1
	jmp	.Lexit

.align	16
.Lintel:
	test	\$-32,$len
	jz	.Lloop1
	and	\$15,$TX[1]
	jz	.Loop16_is_hot
	sub	$TX[1],$len
.Loop16_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out,$inp)
	lea	1($inp),$inp
	dec	$TX[1]
	jnz	.Loop16_warmup

	mov	$YY,$TX[1]
	xor	$YY,$YY
	mov	$TX[1]#b,$YY#b

.Loop16_is_hot:
	lea	($dat,$XX[0],4),$XX[1]
___
# Append one step of the 16x-unrolled Intel loop to $code.
#
# The argument is the step number 0..15, or -1 for the entry step that is
# emitted ahead of .Loop16. Step -1 uses slot 0 but leaves out the
# instructions that only step 0 emits (combining %xmm0/%xmm1 into %xmm2,
# storing it and advancing the input pointer). Step 15 additionally
# advances x by 16, loads the next 16 input bytes into %xmm2 and re-bases
# the look-ahead pointer.
#
# Reads @XX, @TX, $YY, $TY, $dat, $inp and $out from the enclosing scope;
# the caller rotates @TX between steps.
sub RC4_loop {
  my $step=shift;
  my $j=$step<0?0:$step;
  my $xmm="%xmm".($j&1);
  my $head=($step==0);
  my $tail=($step==15);

  # Ordered (condition, instruction) pairs: the order below is the order
  # of the emitted instructions and must not be changed.
  my @plan=(
    [ $tail,	"	add	\$16,$XX[0]#b\n"			],
    [ $tail,	"	movdqu	($inp),%xmm2\n"				],
    [ $step<=0,	"	add	$TX[0]#b,$YY#b\n"			],
    [ 1,	"	movl	($dat,$YY,4),$TY#d\n"			],
    [ $head,	"	pxor	%xmm0,%xmm2\n"				],
    [ $head,	"	psllq	\$8,%xmm1\n"				],
    [ $step<=1,	"	pxor	$xmm,$xmm\n"				],
    [ 1,	"	movl	$TX[0]#d,($dat,$YY,4)\n"		],
    [ 1,	"	add	$TY#b,$TX[0]#b\n"			],
    [ $step<15,	"	movl	`4*($j+1)`($XX[1]),$TX[1]#d\n"		],
    [ 1,	"	movz	$TX[0]#b,$TX[0]#d\n"			],
    [ 1,	"	movl	$TY#d,4*$j($XX[1])\n"			],
    [ $head,	"	pxor	%xmm1,%xmm2\n"				],
    [ $tail,	"	lea	($dat,$XX[0],4),$XX[1]\n"		],
    [ $step<15,	"	add	$TX[1]#b,$YY#b\n"			],
    [ 1,	"	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n"	],
    [ $head,	"	movdqu	%xmm2,($out,$inp)\n"			],
    [ $head,	"	lea	16($inp),$inp\n"			],
    [ $tail,	"	movl	($XX[1]),$TX[1]#d\n"			],
  );

  foreach my $entry (@plan) {
    my ($wanted,$insn)=@$entry;
    $code.=$insn if ($wanted);
  }
}
	# Entry step: the same as step 0 of the unrolled loop but without the
	# instructions that only step 0 emits. Control then jumps to
	# .Loop16_enter, which is placed in front of step 1.
	RC4_loop(-1);
$code.=<<___;
	jmp	.Loop16_enter
.align	16
.Loop16:
___

# Emit the sixteen unrolled steps, rotating @TX after each one so that the
# word preloaded by a step becomes the current word of the next step.
for ($i=0;$i<16;$i++) {
    $code.=".Loop16_enter:\n"		if ($i==1);
	RC4_loop($i);
	push(@TX,shift(@TX)); 		# "rotate" registers
}
# Bottom of .Loop16 plus the final flush of the keystream accumulated in
# %xmm0/%xmm1, then the byte-at-a-time .Lloop1 and the RC4_CHAR entry.
# .Lcloop1 is used directly when fewer than 8 bytes remain.
$code.=<<___;
	mov	$YY,$TX[1]
	xor	$YY,$YY			# keyword to partial register
	sub	\$16,$len
	mov	$TX[1]#b,$YY#b
	test	\$-16,$len
	jnz	.Loop16

	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm2
	pxor	%xmm1,%xmm2
	movdqu	%xmm2,($out,$inp)
	lea	16($inp),$inp

	cmp	\$0,$len
	jne	.Lloop1
	jmp	.Lexit

.align	16
.Lloop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out,$inp)
	lea	1($inp),$inp
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit

.align	16
.LRC4_CHAR:
	add	\$1,$XX[0]#b
	movzb	($dat,$XX[0]),$TX[0]#d
	test	\$-8,$len
	jz	.Lcloop1
	jmp	.Lcloop8
.align	16
.Lcloop8:
	mov	($inp),%r8d
	mov	4($inp),%r9d
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
# First four steps fold their keystream bytes into %r8d. Both @TX and @XX
# are rotated after every step. The compare-and-branch handles the case
# where the look-ahead index equals y, in which case the preloaded byte is
# stale and is replaced by the byte just stored at y.
for ($i=0;$i<4;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i			# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%r8b
	ror	\$8,%r8d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
# Remaining four steps: identical to the above except that the keystream
# bytes are folded into %r9d.
for ($i=4;$i<8;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i			# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%r9b
	ror	\$8,%r9d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
# Bottom of .Lcloop8: store the two 32-bit results and advance by 8 bytes.
$code.=<<___;
	lea	-8($len),$len
	mov	%r8d,($out)
	lea	8($inp),$inp
	mov	%r9d,4($out)
	lea	8($out),$out

	test	\$-8,$len
	jnz	.Lcloop8
	cmp	\$0,$len
	jne	.Lcloop1
	jmp	.Lexit
___
# Byte-at-a-time loop for the RC4_CHAR layout, and the common exit: x is
# decremented again before x and y are written back in front of the
# table, then the three saved registers are restored from the stack.
$code.=<<___;
.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b
	movzb	$YY#b,$YY#d
	movzb	($dat,$YY),$TY#d
	movb	$TX[0]#b,($dat,$YY)
	movb	$TY#b,($dat,$XX[0])
	add	$TX[0]#b,$TY#b
	add	\$1,$XX[0]#b
	movzb	$TY#b,$TY#d
	movzb	$XX[0]#b,$XX[0]#d
	movzb	($dat,$TY),$TY#d
	movzb	($dat,$XX[0]),$TX[0]#d
	xorb	($inp),$TY#b
	lea	1($inp),$inp
	movb	$TY#b,($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lcloop1
	jmp	.Lexit

.align	16
.Lexit:
	sub	\$1,$XX[0]#b
	movl	$XX[0]#d,-8($dat)
	movl	$YY#d,-4($dat)

	mov	(%rsp),%r13
.cfi_restore	%r13
	mov	8(%rsp),%r12
.cfi_restore	%r12
	mov	16(%rsp),%rbx
.cfi_restore	%rbx
	add	\$24,%rsp
.cfi_adjust_cfa_offset	-24
.Lepilogue:
	ret
.cfi_endproc
.size	RC4,.-RC4
___
}

# Index registers for the key-schedule loops below: $ido walks the table
# from 0 to 255, $idx is the running swap index.
$idx="%r8";
$ido="%r9";

# RC4_set_key and RC4_options. These templates use the global $dat, $len
# and $inp (arg1..arg3), since the lexical aliases of the RC4 block above
# are out of scope here.
#
# RC4_set_key: bit 20 of OPENSSL_ia32cap_P selects the byte-wide table
# layout (.Lc* loops), otherwise the 32-bit-wide layout (.Lw* loops) is
# used. The byte-wide path stores -1 at 256($dat); that is the marker the
# RC4 template tests to choose its RC4_CHAR path. Both paths finish by
# zeroing the x and y words in front of the table.
#
# RC4_options: returns a pointer into .Lopts. Offset 0 is "rc4(8x,int)",
# offset 12 is "rc4(8x,char)" (12 = length of the first string plus NUL)
# and offset 25 is "rc4(16x,int)" (25 = 12 + 13).
$code.=<<___;
.globl	RC4_set_key
.type	RC4_set_key,\@function,3
.align	16
RC4_set_key:
.cfi_startproc
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11

	mov	OPENSSL_ia32cap_P(%rip),$idx#d
	bt	\$20,$idx#d	# RC4_CHAR?
	jc	.Lc1stloop
	jmp	.Lw1stloop

.align	16
.Lw1stloop:
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop
	jmp	.Lexit_key

.align	16
.Lc1stloop:
	mov	%al,($dat,%rax)
	add	\$1,%al
	jnc	.Lc1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lc2ndloop:
	mov	($dat,$ido),%r10b
	add	($inp,$len),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx),%r11b
	jnz	.Lcnowrap
	mov	%rcx,$len
.Lcnowrap:
	mov	%r10b,($dat,$idx)
	mov	%r11b,($dat,$ido)
	add	\$1,$ido#b
	jnc	.Lc2ndloop
	movl	\$-1,256($dat)

.align	16
.Lexit_key:
	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)
	ret
.cfi_endproc
.size	RC4_set_key,.-RC4_set_key

.globl	RC4_options
.type	RC4_options,\@abi-omnipotent
.align	16
RC4_options:
	lea	.Lopts(%rip),%rax
	mov	OPENSSL_ia32cap_P(%rip),%edx
	bt	\$20,%edx
	jc	.L8xchar
	bt	\$30,%edx
	jnc	.Ldone
	add	\$25,%rax
	ret
.L8xchar:
	add	\$12,%rax
.Ldone:
	ret
.align	64
.Lopts:
.asciz	"rc4(8x,int)"
.asciz	"rc4(8x,char)"
.asciz	"rc4(16x,int)"
.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.size	RC4_options,.-RC4_options
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: emit two structured-exception handlers plus the .pdata and
# .xdata entries that attach them to RC4 and RC4_set_key.
#
#  - stream_se_handler (for RC4) compares context->Rip with the .Lprologue
#    and .Lepilogue labels. When the fault lies between them it reloads
#    %rbx, %r12 and %r13 from the 24 bytes pushed by the RC4 prologue and
#    stores them, together with the adjusted Rsp, into the CONTEXT.
#  - key_se_handler (for RC4_set_key) only restores Rsi and Rdi.
#
# Both share .Lcommon_seh_exit, which copies the CONTEXT into
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
#
# NOTE(review): the .LSEH_begin_* / .LSEH_end_* labels referenced from
# .pdata are not defined in this file; presumably x86_64-xlate.pl emits
# them for "\@function" symbols, which would also explain the Rsi/Rdi
# slots read at 8(%rax)/16(%rax) - confirm against the translator.
if ($win64) {
# Handler argument registers (Win64 calling convention, see the prototype
# in the comment above).
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	stream_se_handler,\@abi-omnipotent
.align	16
stream_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	24(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%r12
	mov	-24(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	stream_se_handler,.-stream_se_handler

.type	key_se_handler,\@abi-omnipotent
.align	16
key_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	key_se_handler,.-key_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_RC4
	.rva	.LSEH_end_RC4
	.rva	.LSEH_info_RC4

	.rva	.LSEH_begin_RC4_set_key
	.rva	.LSEH_end_RC4_set_key
	.rva	.LSEH_info_RC4_set_key

.section	.xdata
.align	8
.LSEH_info_RC4:
	.byte	9,0,0,0
	.rva	stream_se_handler
.LSEH_info_RC4_set_key:
	.byte	9,0,0,0
	.rva	key_se_handler
___
}

# Map a 64-bit (or 32-bit) register name to one of its sub-registers.
#
#   $name  - register as written in the templates, e.g. "%rax" or "%r10"
#   $width - "b" (byte), "w" (word) or "d" (dword)
#
# Numbered registers simply get the suffix appended ("%r10" -> "%r10b");
# the legacy registers are renamed ("%rax" -> "%al", "%ax", "%eax").
# Any other width leaves a legacy register name unchanged.
sub reg_part {
    my ($name,$width)=@_;

    # %r8..%r15 and friends: the suffix is the sub-register name.
    return $name.$width		if ($name =~ /%r[0-9]+/);

    if    ($width eq "b")	{ $name =~ s/%[er]([^x]+)x?/%$1l/;	}
    elsif ($width eq "w")	{ $name =~ s/%[er](.+)/%$1/;		}
    elsif ($width eq "d")	{ $name =~ s/%[er](.+)/%e$1/;		}

    return $name;
}

# Post-process the accumulated assembly text before emitting it:
#  - "%reg#b", "%reg#w" and "%reg#d" are replaced by the matching
#    sub-register name via reg_part();
#  - every `...` span is evaluated as a Perl expression and replaced by
#    its value (used for computed displacements and immediates).
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

# STDOUT is the pipe into the xlate script opened at the top of this file.
# Closing a piped handle waits for the child and reports both write errors
# and a non-zero exit status, so the result must be checked: otherwise a
# failed translation silently leaves a truncated or empty output file.
close STDOUT or die "error closing STDOUT: $! (child status $?)";
Commit	Line	Data
6aa36e8e RS	1	#! /usr/bin/env perl
	2	# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
	3	#
5e4435a7	4	# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e RS	5	# this file except in compliance with the License. You can obtain a copy
	6	# in the file LICENSE in the source distribution or at
	7	# https://www.openssl.org/source/license.html
	8
5f1841cd AP	9	#
5f1841cd AP	10	# ====================================================================
e3713c36	11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
20c04a13 AP	12	# project. The module is, however, dual licensed under OpenSSL and
	13	# CRYPTOGAMS licenses depending on where you obtain it. For further
	14	# details see http://www.openssl.org/~appro/cryptogams/.
5f1841cd AP	15	# ====================================================================
5f1841cd AP	16	#
0ca9a483 AP	17	# July 2004
0ca9a483 AP	18	#
5f1841cd AP	19	# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
	20	# "hand-coded assembler"] doesn't stand for the whole improvement
	21	# coefficient. It turned out that eliminating RC4_CHAR from config
	22	# line results in ~40% improvement (yes, even for C implementation).
	23	# Presumably it has everything to do with AMD cache architecture and
	24	# RAW or whatever penalties. Once again! The module requires config
	25	# line without RC4_CHAR! As for coding "secret," I bet on partial
	26	# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
	27	# I simply 'inc %r8b'. Even though optimization manual discourages
	28	# to operate on partial registers, it turned out to be the best bet.
	29	# At least for AMD... How IA32E would perform remains to be seen...
	30
0ca9a483 AP	31	# November 2004
0ca9a483 AP	32	#
5f1841cd AP	33	# As was shown by Marc Bevand reordering of couple of load operations
	34	# results in even higher performance gain of 3.3x:-) At least on
	35	# Opteron... For reference, 1x in this case is RC4_CHAR C-code
	36	# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
	37	# Latter means that if you want to estimate what to expect from
	38	# your Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
	39
0ca9a483 AP	40	# November 2004
0ca9a483 AP	41	#
5f1841cd AP	42	# Intel P4 EM64T core was found to run the AMD64 code really slow...
	43	# The only way to achieve comparable performance on P4 was to keep
	44	# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
	45	# compose blended code, which would perform even within 30% marginal
	46	# on either AMD and Intel platforms, I implement both cases. See
	47	# rc4_skey.c for further details...
	48
0ca9a483 AP	49	# April 2005
0ca9a483 AP	50	#
609b0852	51	# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
5f1841cd AP	52	# those with add/sub results in 50% performance improvement of folded
	53	# loop...
	54
0ca9a483 AP	55	# May 2005
0ca9a483 AP	56	#
5f1841cd AP	57	# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
	58	# performance by >30% [unlike P4 32-bit case that is]. But this is
	59	# provided that loads are reordered even more aggressively! Both code
60250017	60	# paths, AMD64 and EM64T, reorder loads in essentially same manner
5f1841cd AP	61	# as my IA-64 implementation. On Opteron this resulted in modest 5%
	62	# improvement [I had to test it], while final Intel P4 performance
	63	# achieves respectful 432MBps on 2.8GHz processor now. For reference.
	64	# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
64790791	65	# RC4_INT code-path. While if executed on Opteron, it's only 25%
053fa39a	66	# slower than the RC4_INT one [meaning that if CPU µ-arch detection
0ee88365 AP	67	# is not implemented, then this final RC4_CHAR code-path should be
0ee88365 AP	68	# preferred, as it provides better all-round performance].
5f1841cd	69
0ca9a483 AP	70	# March 2007
0ca9a483 AP	71	#
9babf392 AP	72	# Intel Core2 was observed to perform poorly on both code paths:-( It
	73	# apparently suffers from some kind of partial register stall, which
	74	# occurs in 64-bit mode only [as virtually identical 32-bit loop was
	75	# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
	76	# cloop1 boosts its performance by 80%! This loop appears to be optimal
	77	# fit for Core2 and therefore the code was modified to skip cloop8 on
	78	# this CPU.
	79
0ca9a483 AP	80	# May 2010
0ca9a483 AP	81	#
629fd3aa AP	82	# Intel Westmere was observed to perform suboptimally. Adding yet
	83	# another movzb to cloop1 improved performance by almost 50%! Core2
	84	# performance is improved too, but nominally...
	85
0ca9a483 AP	86	# May 2011
0ca9a483 AP	87	#
f44cb15f AP	88	# The only code path that was not modified is P4-specific one. Non-P4
	89	# Intel code path optimization is heavily based on submission by Maxim
	90	# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
46f4e1be	91	# some of the ideas even in attempt to optimize the original RC4_INT
f44cb15f AP	92	# code path... Current performance in cycles per processed byte (less
	93	# is better) and improvement coefficients relative to previous
	94	# version of this module are:
0ca9a483	95	#
0772f3b4	96	# Opteron 5.3/+0%(*)
0ca9a483	97	# P4 6.5
0772f3b4	98	# Core2 6.2/+15%(**)
0ca9a483 AP	99	# Westmere 4.2/+60%
	100	# Sandy Bridge 4.2/+120%
	101	# Atom 9.3/+80%
d2e18031 AP	102	# VIA Nano 6.4/+4%
	103	# Ivy Bridge 4.1/+30%
	104	# Bulldozer 4.5/+30%(*)
0ca9a483	105	#
0772f3b4 AP	106	# (*) But corresponding loop has less instructions, which should have
	107	# positive effect on upcoming Bulldozer, which has one less ALU.
	108	# For reference, Intel code runs at 6.8 cpb rate on Opteron.
	109	# (**) Note that Core2 result is ~15% lower than corresponding result
f44cb15f AP	110	# for 32-bit code, meaning that it's possible to improve it,
	111	# but more than likely at the cost of the others (see rc4-586.pl
	112	# to get the idea)...
0ca9a483	113
1aa89a7a RL	114	# $output is the last argument if it looks like a file (it has an extension)
	115	# $flavour is the first argument if it doesn't look like a file
	116	$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m\|\.\w+$\| ? pop : undef;
	117	$flavour = $#ARGV >= 0 && $ARGV[0] !~ m\|\.\| ? shift : undef;
be01f79d AP	118
be01f79d AP	119	$win64=0; $win64=1 if ($flavour =~ /[nm]asm\|mingw64/ \|\| $output =~ /\.asm$/);
20c04a13 AP	120
	121	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	122	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	123	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	124	die "can't locate x86_64-xlate.pl";
	125
1aa89a7a RL	126	open OUT,"\| \"$^X\" \"$xlate\" $flavour \"$output\""
1aa89a7a RL	127	or die "can't call $xlate: $!";
46bf83f0	128	STDOUT=OUT;
5f1841cd AP	129
	130	$dat="%rdi"; # arg1
	131	$len="%rsi"; # arg2
	132	$inp="%rdx"; # arg3
	133	$out="%rcx"; # arg4
	134
0ca9a483	135	{
5f1841cd AP	136	$code=<<___;
5f1841cd AP	137	.text
fe9a5107	138	.extern OPENSSL_ia32cap_P
5f1841cd AP	139
	140	.globl RC4
	141	.type RC4,\@function,4
	142	.align 16
	143	RC4: or $len,$len
	144	jne .Lentry
	145	ret
	146	.Lentry:
2dfb52d3	147	.cfi_startproc
75d448dd	148	push %rbx
2dfb52d3	149	.cfi_push %rbx
5f1841cd	150	push %r12
2dfb52d3	151	.cfi_push %r12
5f1841cd	152	push %r13
2dfb52d3	153	.cfi_push %r13
be01f79d	154	.Lprologue:
0ca9a483 AP	155	mov $len,%r11
	156	mov $inp,%r12
	157	mov $out,%r13
	158	___
	159	my $len="%r11"; # reassign input arguments
	160	my $inp="%r12";
	161	my $out="%r13";
	162
	163	my @XX=("%r10","%rsi");
	164	my @TX=("%rax","%rbx");
	165	my $YY="%rcx";
	166	my $TY="%rdx";
5f1841cd	167
0ca9a483 AP	168	$code.=<<___;
	169	xor $XX[0],$XX[0]
	170	xor $YY,$YY
	171
	172	lea 8($dat),$dat
	173	mov -8($dat),$XX[0]#b
	174	mov -4($dat),$YY#b
5f1841cd AP	175	cmpl \$-1,256($dat)
5f1841cd AP	176	je .LRC4_CHAR
0ca9a483 AP	177	mov OPENSSL_ia32cap_P(%rip),%r8d
0ca9a483 AP	178	xor $TX[1],$TX[1]
5f1841cd	179	inc $XX[0]#b
0ca9a483 AP	180	sub $XX[0],$TX[1]
0ca9a483 AP	181	sub $inp,$out
5f1841cd	182	movl ($dat,$XX[0],4),$TX[0]#d
0ca9a483	183	test \$-16,$len
5f1841cd	184	jz .Lloop1
4bb90087 AP	185	bt \$30,%r8d # Intel CPU?
4bb90087 AP	186	jc .Lintel
0ca9a483 AP	187	and \$7,$TX[1]
	188	lea 1($XX[0]),$XX[1]
	189	jz .Loop8
	190	sub $TX[1],$len
	191	.Loop8_warmup:
	192	add $TX[0]#b,$YY#b
	193	movl ($dat,$YY,4),$TY#d
	194	movl $TX[0]#d,($dat,$YY,4)
	195	movl $TY#d,($dat,$XX[0],4)
	196	add $TY#b,$TX[0]#b
	197	inc $XX[0]#b
	198	movl ($dat,$TX[0],4),$TY#d
	199	movl ($dat,$XX[0],4),$TX[0]#d
	200	xorb ($inp),$TY#b
	201	movb $TY#b,($out,$inp)
	202	lea 1($inp),$inp
	203	dec $TX[1]
	204	jnz .Loop8_warmup
	205
	206	lea 1($XX[0]),$XX[1]
	207	jmp .Loop8
5f1841cd	208	.align 16
0ca9a483	209	.Loop8:
5f1841cd AP	210	___
5f1841cd AP	211	for ($i=0;$i<8;$i++) {
0ca9a483 AP	212	$code.=<<___ if ($i==7);
	213	add \$8,$XX[1]#b
	214	___
5f1841cd AP	215	$code.=<<___;
5f1841cd AP	216	add $TX[0]#b,$YY#b
5f1841cd	217	movl ($dat,$YY,4),$TY#d
5f1841cd	218	movl $TX[0]#d,($dat,$YY,4)
0ca9a483 AP	219	movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
	220	ror \$8,%r8 # ror is redundant when $i=0
	221	movl $TY#d,4*$i($dat,$XX[0],4)
5f1841cd	222	add $TX[0]#b,$TY#b
0ca9a483	223	movb ($dat,$TY,4),%r8b
5f1841cd	224	___
0ca9a483	225	push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
5f1841cd AP	226	}
5f1841cd AP	227	$code.=<<___;
0ca9a483 AP	228	add \$8,$XX[0]#b
0ca9a483 AP	229	ror \$8,%r8
5f1841cd AP	230	sub \$8,$len
5f1841cd AP	231
0ca9a483 AP	232	xor ($inp),%r8
	233	mov %r8,($out,$inp)
	234	lea 8($inp),$inp
5f1841cd AP	235
5f1841cd AP	236	test \$-8,$len
0ca9a483 AP	237	jnz .Loop8
	238	cmp \$0,$len
	239	jne .Lloop1
	240	jmp .Lexit
	241
	242	.align 16
4bb90087	243	.Lintel:
0ca9a483 AP	244	test \$-32,$len
	245	jz .Lloop1
	246	and \$15,$TX[1]
	247	jz .Loop16_is_hot
	248	sub $TX[1],$len
	249	.Loop16_warmup:
	250	add $TX[0]#b,$YY#b
	251	movl ($dat,$YY,4),$TY#d
	252	movl $TX[0]#d,($dat,$YY,4)
	253	movl $TY#d,($dat,$XX[0],4)
	254	add $TY#b,$TX[0]#b
	255	inc $XX[0]#b
	256	movl ($dat,$TX[0],4),$TY#d
	257	movl ($dat,$XX[0],4),$TX[0]#d
	258	xorb ($inp),$TY#b
	259	movb $TY#b,($out,$inp)
	260	lea 1($inp),$inp
	261	dec $TX[1]
	262	jnz .Loop16_warmup
	263
	264	mov $YY,$TX[1]
	265	xor $YY,$YY
	266	mov $TX[1]#b,$YY#b
	267
	268	.Loop16_is_hot:
	269	lea ($dat,$XX[0],4),$XX[1]
	270	___
	271	sub RC4_loop {
	272	my $i=shift;
	273	my $j=$i<0?0:$i;
	274	my $xmm="%xmm".($j&1);
	275
	276	$code.=" add \$16,$XX[0]#b\n" if ($i==15);
	277	$code.=" movdqu ($inp),%xmm2\n" if ($i==15);
	278	$code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
	279	$code.=" movl ($dat,$YY,4),$TY#d\n";
	280	$code.=" pxor %xmm0,%xmm2\n" if ($i==0);
	281	$code.=" psllq \$8,%xmm1\n" if ($i==0);
	282	$code.=" pxor $xmm,$xmm\n" if ($i<=1);
	283	$code.=" movl $TX[0]#d,($dat,$YY,4)\n";
	284	$code.=" add $TY#b,$TX[0]#b\n";
	285	$code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
	286	$code.=" movz $TX[0]#b,$TX[0]#d\n";
	287	$code.=" movl $TY#d,4*$j($XX[1])\n";
	288	$code.=" pxor %xmm1,%xmm2\n" if ($i==0);
	289	$code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
	290	$code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
	291	$code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
	292	$code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
	293	$code.=" lea 16($inp),$inp\n" if ($i==0);
	294	$code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
	295	}
	296	RC4_loop(-1);
	297	$code.=<<___;
	298	jmp .Loop16_enter
	299	.align 16
	300	.Loop16:
	301	___
	302
	303	for ($i=0;$i<16;$i++) {
	304	$code.=".Loop16_enter:\n" if ($i==1);
	305	RC4_loop($i);
	306	push(@TX,shift(@TX)); # "rotate" registers
	307	}
308	$code.=<<___;
309	mov $YY,$TX[1]
310	xor $YY,$YY # keyword to partial register
311	sub \$16,$len
312	mov $TX[1]#b,$YY#b
313	test \$-16,$len
314	jnz .Loop16
315
316	psllq \$8,%xmm1
317	pxor %xmm0,%xmm2
318	pxor %xmm1,%xmm2
319	movdqu %xmm2,($out,$inp)
320	lea 16($inp),$inp
321
5f1841cd AP	322	cmp \$0,$len
5f1841cd AP	323	jne .Lloop1
be01f79d	324	jmp .Lexit
5f1841cd	325
5f1841cd AP	326	.align 16
	327	.Lloop1:
	328	add $TX[0]#b,$YY#b
	329	movl ($dat,$YY,4),$TY#d
	330	movl $TX[0]#d,($dat,$YY,4)
	331	movl $TY#d,($dat,$XX[0],4)
	332	add $TY#b,$TX[0]#b
	333	inc $XX[0]#b
	334	movl ($dat,$TX[0],4),$TY#d
	335	movl ($dat,$XX[0],4),$TX[0]#d
	336	xorb ($inp),$TY#b
0ca9a483 AP	337	movb $TY#b,($out,$inp)
0ca9a483 AP	338	lea 1($inp),$inp
5f1841cd AP	339	dec $len
	340	jnz .Lloop1
	341	jmp .Lexit
	342
	343	.align 16
	344	.LRC4_CHAR:
	345	add \$1,$XX[0]#b
	346	movzb ($dat,$XX[0]),$TX[0]#d
	347	test \$-8,$len
	348	jz .Lcloop1
5f1841cd AP	349	jmp .Lcloop8
	350	.align 16
	351	.Lcloop8:
0ca9a483 AP	352	mov ($inp),%r8d
0ca9a483 AP	353	mov 4($inp),%r9d
5f1841cd AP	354	___
	355	# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
	356	for ($i=0;$i<4;$i++) {
	357	$code.=<<___;
	358	add $TX[0]#b,$YY#b
	359	lea 1($XX[0]),$XX[1]
	360	movzb ($dat,$YY),$TY#d
	361	movzb $XX[1]#b,$XX[1]#d
	362	movzb ($dat,$XX[1]),$TX[1]#d
	363	movb $TX[0]#b,($dat,$YY)
	364	cmp $XX[1],$YY
	365	movb $TY#b,($dat,$XX[0])
	366	jne .Lcmov$i # Intel cmov is sloooow...
	367	mov $TX[0],$TX[1]
	368	.Lcmov$i:
	369	add $TX[0]#b,$TY#b
0ca9a483 AP	370	xor ($dat,$TY),%r8b
0ca9a483 AP	371	ror \$8,%r8d
5f1841cd AP	372	___
	373	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
	374	}
	375	for ($i=4;$i<8;$i++) {
	376	$code.=<<___;
	377	add $TX[0]#b,$YY#b
	378	lea 1($XX[0]),$XX[1]
	379	movzb ($dat,$YY),$TY#d
0ee88365	380	movzb $XX[1]#b,$XX[1]#d
5f1841cd AP	381	movzb ($dat,$XX[1]),$TX[1]#d
	382	movb $TX[0]#b,($dat,$YY)
	383	cmp $XX[1],$YY
	384	movb $TY#b,($dat,$XX[0])
	385	jne .Lcmov$i # Intel cmov is sloooow...
	386	mov $TX[0],$TX[1]
	387	.Lcmov$i:
	388	add $TX[0]#b,$TY#b
0ca9a483 AP	389	xor ($dat,$TY),%r9b
0ca9a483 AP	390	ror \$8,%r9d
5f1841cd AP	391	___
	392	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
	393	}
	394	$code.=<<___;
	395	lea -8($len),$len
0ca9a483	396	mov %r8d,($out)
5f1841cd	397	lea 8($inp),$inp
0ca9a483	398	mov %r9d,4($out)
5f1841cd AP	399	lea 8($out),$out
	400
	401	test \$-8,$len
	402	jnz .Lcloop8
5f1841cd AP	403	cmp \$0,$len
	404	jne .Lcloop1
	405	jmp .Lexit
	406	___
	407	$code.=<<___;
	408	.align 16
	409	.Lcloop1:
	410	add $TX[0]#b,$YY#b
629fd3aa	411	movzb $YY#b,$YY#d
5f1841cd AP	412	movzb ($dat,$YY),$TY#d
	413	movb $TX[0]#b,($dat,$YY)
	414	movb $TY#b,($dat,$XX[0])
	415	add $TX[0]#b,$TY#b
	416	add \$1,$XX[0]#b
de504945 AP	417	movzb $TY#b,$TY#d
de504945 AP	418	movzb $XX[0]#b,$XX[0]#d
5f1841cd AP	419	movzb ($dat,$TY),$TY#d
	420	movzb ($dat,$XX[0]),$TX[0]#d
	421	xorb ($inp),$TY#b
	422	lea 1($inp),$inp
	423	movb $TY#b,($out)
	424	lea 1($out),$out
	425	sub \$1,$len
	426	jnz .Lcloop1
	427	jmp .Lexit
be01f79d AP	428
	429	.align 16
	430	.Lexit:
	431	sub \$1,$XX[0]#b
	432	movl $XX[0]#d,-8($dat)
	433	movl $YY#d,-4($dat)
	434
75d448dd	435	mov (%rsp),%r13
2dfb52d3	436	.cfi_restore %r13
75d448dd	437	mov 8(%rsp),%r12
2dfb52d3	438	.cfi_restore %r12
75d448dd	439	mov 16(%rsp),%rbx
2dfb52d3	440	.cfi_restore %rbx
be01f79d	441	add \$24,%rsp
2dfb52d3	442	.cfi_adjust_cfa_offset -24
be01f79d AP	443	.Lepilogue:
be01f79d AP	444	ret
2dfb52d3	445	.cfi_endproc
5f1841cd AP	446	.size RC4,.-RC4
5f1841cd AP	447	___
0ca9a483	448	}
5f1841cd	449
# Register names used only by the RC4_set_key template below.  $ido is the
# byte counter that walks the 256 table slots (the loops end when its low
# byte wraps); $idx is the running swap index that accumulates the current
# table entry and the current key byte.
# NOTE(review): $dat, $inp and $len are assigned earlier in the file, outside
# this excerpt; from their use here they appear to name the key-schedule
# pointer, the key-data pointer and the key length -- confirm.
9babf392 AP	450	$idx="%r8";
	451	$ido="%r9";
	452
# The template below emits two routines.
#
# RC4_set_key: advances $dat by 8, points $inp just past the end of the key
# and negates $len, so that ($inp,$len) walks the key bytes while $len counts
# up towards zero; %rcx keeps the negated length and is reloaded into $len
# whenever $len reaches zero.  Bit 20 of OPENSSL_ia32cap_P selects the table
# layout: set -> one byte per entry (.Lc* loops), clear -> one dword per
# entry (.Lw* loops).  The first loop of each pair fills the table with
# 0..255; the second swaps entry $ido with entry $idx once per slot.  The
# byte-layout path additionally stores -1 at 256($dat).  Both paths finish
# in .Lexit_key, which zeroes the dwords at -8($dat) and -4($dat).
#
# RC4_options: returns in %rax a pointer into the .Lopts string table:
# +12 ("rc4(8x,char)") when bit 20 of OPENSSL_ia32cap_P is set, otherwise
# +25 ("rc4(16x,int)") when bit 30 is set, otherwise +0 ("rc4(8x,int)").
# The offsets equal the sizes of the preceding NUL-terminated strings
# (12 and 12+13), so they must be kept in step with the .asciz lines.
	453	$code.=<<___;
9babf392 AP	454	.globl RC4_set_key
	455	.type RC4_set_key,\@function,3
	456	.align 16
	457	RC4_set_key:
b2a00f62	458	.cfi_startproc
9babf392 AP	459	lea 8($dat),$dat
	460	lea ($inp,$len),$inp
	461	neg $len
	462	mov $len,%rcx
	463	xor %eax,%eax
	464	xor $ido,$ido
	465	xor %r10,%r10
	466	xor %r11,%r11
	467
	468	mov OPENSSL_ia32cap_P(%rip),$idx#d
4bb90087 AP	469	bt \$20,$idx#d # RC4_CHAR?
4bb90087 AP	470	jc .Lc1stloop
0ca9a483	471	jmp .Lw1stloop
9babf392 AP	472
	473	.align 16
	474	.Lw1stloop:
	475	mov %eax,($dat,%rax,4)
	476	add \$1,%al
	477	jnc .Lw1stloop
	478
	479	xor $ido,$ido
	480	xor $idx,$idx
	481	.align 16
	482	.Lw2ndloop:
	483	mov ($dat,$ido,4),%r10d
	484	add ($inp,$len,1),$idx#b
	485	add %r10b,$idx#b
	486	add \$1,$len
	487	mov ($dat,$idx,4),%r11d
	488	cmovz %rcx,$len
	489	mov %r10d,($dat,$idx,4)
	490	mov %r11d,($dat,$ido,4)
	491	add \$1,$ido#b
	492	jnc .Lw2ndloop
	493	jmp .Lexit_key
	494
	495	.align 16
	496	.Lc1stloop:
	497	mov %al,($dat,%rax)
	498	add \$1,%al
	499	jnc .Lc1stloop
	500
	501	xor $ido,$ido
	502	xor $idx,$idx
	503	.align 16
	504	.Lc2ndloop:
	505	mov ($dat,$ido),%r10b
	506	add ($inp,$len),$idx#b
	507	add %r10b,$idx#b
	508	add \$1,$len
	509	mov ($dat,$idx),%r11b
	510	jnz .Lcnowrap
	511	mov %rcx,$len
	512	.Lcnowrap:
	513	mov %r10b,($dat,$idx)
	514	mov %r11b,($dat,$ido)
	515	add \$1,$ido#b
	516	jnc .Lc2ndloop
	517	movl \$-1,256($dat)
	518
	519	.align 16
	520	.Lexit_key:
	521	xor %eax,%eax
	522	mov %eax,-8($dat)
	523	mov %eax,-4($dat)
	524	ret
b2a00f62	525	.cfi_endproc
9babf392 AP	526	.size RC4_set_key,.-RC4_set_key
	527
	528	.globl RC4_options
be01f79d	529	.type RC4_options,\@abi-omnipotent
9babf392 AP	530	.align 16
9babf392 AP	531	RC4_options:
aa8f38e4	532	lea .Lopts(%rip),%rax
9babf392 AP	533	mov OPENSSL_ia32cap_P(%rip),%edx
9babf392 AP	534	bt \$20,%edx
98628960	535	jc .L8xchar
9babf392 AP	536	bt \$30,%edx
9babf392 AP	537	jnc .Ldone
98628960 AP	538	add \$25,%rax
	539	ret
	540	.L8xchar:
	541	add \$12,%rax
9babf392 AP	542	.Ldone:
	543	ret
	544	.align 64
	545	.Lopts:
	546	.asciz "rc4(8x,int)"
	547	.asciz "rc4(8x,char)"
0ca9a483	548	.asciz "rc4(16x,int)"
20c04a13	549	.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
9babf392 AP	550	.align 64
	551	.size RC4_options,.-RC4_options
	552	___
	553
be01f79d AP	554	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
	555	# CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Structured-exception support, emitted only when $win64 is true ($win64 is
# set outside this excerpt).  The four variables assigned first name the
# registers holding the handler's arguments, in the order of the prototype
# above.
#
# stream_se_handler (referenced from .LSEH_info_RC4): loads context->Rip and
# compares it with .Lprologue and .Lepilogue.  Before .Lprologue it takes the
# frame address from context->Rax; at or after .Lepilogue it takes
# context->Rsp unchanged; in between it steps context->Rsp over the 24-byte
# save area and copies the saved %rbx/%r12/%r13 found there back into the
# CONTEXT.  In every case it then writes back context->Rsp and restores
# context->Rsi/Rdi from offsets 16 and 8 of that frame address before jumping
# to the shared tail.
#
# key_se_handler (referenced from .LSEH_info_RC4_set_key): only restores
# context->Rsi/Rdi from offsets 16 and 8 of context->Rsp, then falls through
# into the shared tail.
#
# .Lcommon_seh_exit (physically inside key_se_handler, shared by both): copies
# 154 quadwords -- the `rep movsq' count annotated as sizeof(CONTEXT) -- from
# context to disp->ContextRecord, calls RtlVirtualUnwind with
# UNW_FLAG_NHANDLER and returns 1 (ExceptionContinueSearch).  Both handlers
# push the same registers and reserve the same 64 bytes on entry, which is
# what allows them to share this one epilogue.
#
# NOTE(review): the .pdata entries rely on .LSEH_begin_* / .LSEH_end_* labels
# and on .Lprologue that are defined outside this excerpt.
	556	if ($win64) {
	557	$rec="%rcx";
	558	$frame="%rdx";
	559	$context="%r8";
	560	$disp="%r9";
	561
	562	$code.=<<___;
	563	.extern __imp_RtlVirtualUnwind
	564	.type stream_se_handler,\@abi-omnipotent
	565	.align 16
	566	stream_se_handler:
	567	push %rsi
	568	push %rdi
	569	push %rbx
	570	push %rbp
	571	push %r12
	572	push %r13
	573	push %r14
	574	push %r15
	575	pushfq
	576	sub \$64,%rsp
	577
	578	mov 120($context),%rax # pull context->Rax
	579	mov 248($context),%rbx # pull context->Rip
	580
	581	lea .Lprologue(%rip),%r10
	582	cmp %r10,%rbx # context->Rip<prologue label
	583	jb .Lin_prologue
	584
	585	mov 152($context),%rax # pull context->Rsp
	586
	587	lea .Lepilogue(%rip),%r10
75d448dd	588	cmp %r10,%rbx # context->Rip>=epilogue label
be01f79d AP	589	jae .Lin_prologue
	590
	591	lea 24(%rax),%rax
	592
75d448dd AP	593	mov -8(%rax),%rbx
	594	mov -16(%rax),%r12
	595	mov -24(%rax),%r13
	596	mov %rbx,144($context) # restore context->Rbx
be01f79d AP	597	mov %r12,216($context) # restore context->R12
	598	mov %r13,224($context) # restore context->R13
	599
	600	.Lin_prologue:
	601	mov 8(%rax),%rdi
	602	mov 16(%rax),%rsi
	603	mov %rax,152($context) # restore context->Rsp
	604	mov %rsi,168($context) # restore context->Rsi
	605	mov %rdi,176($context) # restore context->Rdi
	606
	607	jmp .Lcommon_seh_exit
	608	.size stream_se_handler,.-stream_se_handler
	609
	610	.type key_se_handler,\@abi-omnipotent
	611	.align 16
	612	key_se_handler:
	613	push %rsi
	614	push %rdi
	615	push %rbx
	616	push %rbp
	617	push %r12
	618	push %r13
	619	push %r14
	620	push %r15
	621	pushfq
	622	sub \$64,%rsp
	623
	624	mov 152($context),%rax # pull context->Rsp
	625	mov 8(%rax),%rdi
	626	mov 16(%rax),%rsi
	627	mov %rsi,168($context) # restore context->Rsi
	628	mov %rdi,176($context) # restore context->Rdi
	629
	630	.Lcommon_seh_exit:
	631
	632	mov 40($disp),%rdi # disp->ContextRecord
	633	mov $context,%rsi # context
	634	mov \$154,%ecx # sizeof(CONTEXT)
	635	.long 0xa548f3fc # cld; rep movsq
	636
	637	mov $disp,%rsi
	638	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	639	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	640	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	641	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	642	mov 40(%rsi),%r10 # disp->ContextRecord
	643	lea 56(%rsi),%r11 # &disp->HandlerData
	644	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	645	mov %r10,32(%rsp) # arg5
	646	mov %r11,40(%rsp) # arg6
	647	mov %r12,48(%rsp) # arg7
	648	mov %rcx,56(%rsp) # arg8, (NULL)
	649	call *__imp_RtlVirtualUnwind(%rip)
	650
	651	mov \$1,%eax # ExceptionContinueSearch
	652	add \$64,%rsp
	653	popfq
	654	pop %r15
	655	pop %r14
	656	pop %r13
	657	pop %r12
	658	pop %rbp
	659	pop %rbx
	660	pop %rdi
661	pop %rsi
662	ret
663	.size key_se_handler,.-key_se_handler
664
665	.section .pdata
666	.align 4
667	.rva .LSEH_begin_RC4
668	.rva .LSEH_end_RC4
669	.rva .LSEH_info_RC4
670
671	.rva .LSEH_begin_RC4_set_key
672	.rva .LSEH_end_RC4_set_key
673	.rva .LSEH_info_RC4_set_key
674
675	.section .xdata
676	.align 8
677	.LSEH_info_RC4:
678	.byte 9,0,0,0
679	.rva stream_se_handler
680	.LSEH_info_RC4_set_key:
681	.byte 9,0,0,0
682	.rva key_se_handler
683	___
684	}
685
# reg_part(NAME, WIDTH) -- map a register name to its narrower alias.
#
#   NAME   register as written in the template, e.g. "%rax" or "%r10"
#   WIDTH  selector taken from the "#b" / "#w" / "#d" suffix
#
# Numbered registers simply get WIDTH appended ("%r8" + "b" -> "%r8b").
# Legacy registers are renamed: "b" -> low-byte form ("%rax" -> "%al",
# "%rsi" -> "%sil"), "w" -> 16-bit form ("%rax" -> "%ax"), "d" -> 32-bit
# form ("%rax" -> "%eax").  With any other WIDTH a legacy name is returned
# unchanged.
sub reg_part {
    my ($name, $width) = @_;

    # %r8..%r15 style names: the selector is just a suffix.
    return $name . $width if $name =~ /%r[0-9]+/;

    if ($width eq "b") {
        $name =~ s/%[er]([^x]+)x?/%$1l/;
    }
    elsif ($width eq "w") {
        $name =~ s/%[er](.+)/%$1/;
    }
    elsif ($width eq "d") {
        $name =~ s/%[er](.+)/%e$1/;
    }

    return $name;
}
	694
	695	$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
	696	$code =~ s/\`([^\`]*)\`/eval $1/gem;
5f1841cd AP	697
	698	print $code;
	699
	700	close STDOUT;