[thirdparty/openssl.git] / crypto / bn / asm / ia64.S

.explicit
.text
.asciz	"ia64.S, Version 1.0"
.asciz	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//

// Q.	How much faster does it get?
// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
//	Linux 7.1 2.96-81):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
//
//	And here is similar output but for this assembler
//	implementation:-)
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
//	
//	Yes, you may argue that it's not fair comparison as it's
//	possible to craft the C implementation with BN_UMULT_HIGH
//	inline assembler macro. But of course! Here is the output
//	with the macro:
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
//
//	My code is still way faster, huh:-) And I believe that even
//	higher performance can be achieved. Note that as keys get
//	longer, performance gain is larger. Why? According to the
//	profiler there is another player in the field, namely
//	BN_from_montgomery consuming larger and larger portion of CPU
//	time as keysize decreases. I therefore consider putting effort
//	to assembler implementation of the following routine:
//
//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
//	{
//	int      i,j;
//	BN_ULONG v;
//
//	for (i=0; i<nl; i++)
//		{
//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
//		nrp++;
//		rp++;
//		if (((nrp[-1]+=v)&BN_MASK2) < v)
//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
//		}
//	}
//
//	It might as well be beneficial to implement even combaX
//	variants, as it appears as it can literally unleash the
//	performance (see comment section to bn_mul_comba8 below).
//
//	And finally for your reference the output for 0.9.6a compiled
//	with SGIcc version 0.01.0-12 (keep in mind that for the moment
//	of this writing it's not possible to convince SGIcc to use
//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
//	i.e. for a compiler generated one:-):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
//
//	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
//	system running Redhat Linux 7.1 (very special thanks to Ray
//	McCaffity of Williams Communications for providing an account).
//
// Q.	What's the heck with 'rum 1<<5' at the end of every function?
// A.	Well, by clearing the "upper FP registers written" bit of the
//	User Mask I want to excuse the kernel from preserving upper
//	(f32-f128) FP register bank over process context switch, thus
//	minimizing bus bandwidth consumption during the switch (i.e.
//	after PKI opration completes and the program is off doing
//	something else like bulk symmetric encryption). Having said
//	this, I also want to point out that it might be good idea
//	to compile the whole toolkit (as well as majority of the
//	programs for that matter) with -mfixed-range=f32-f127 command
//	line option. No, it doesn't prevent the compiler from writing
//	to upper bank, but at least discourages to do so. If you don't
//	like the idea you have the option to compile the module with
//	-Drum=nop.m in command line.
//

#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider that the epilogue is short enough as it is to trade tiny
// performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
.global	bn_add_words#
.proc	bn_add_words#
.align	64
.space	32	// makes the loop body aligned at 64-byte boundary
bn_add_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;
{ .mfb	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mib	sub		r10=r35,r0,1
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
					}
	.body
{ .mib	mov		r14=r32			// rp
	mov		r9=pr		};;
{ .mii	mov		r15=r33			// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib	mov		r16=r34			// bp
	mov		pr.rot=1<<16	}

.L_bn_add_words_ctop:	;;
{ .mii	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	add		r39=r37,r34
	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
{ .mfb	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
	(p58)	add		r41=1,r41	} // (p20)
{ .mfb	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_add_words_ctop	};;
.L_bn_add_words_cend:

{ .mii
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,-1
	mov		ar.lc=r3	}
{ .mbb	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_add_words#

//
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
.global	bn_sub_words#
.proc	bn_sub_words#
.align	64
.space	32	// makes the loop body aligned at 64-byte boundary
bn_sub_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;
{ .mfb	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mib	sub		r10=r35,r0,1
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
					}
	.body
{ .mib	mov		r14=r32			// rp
	mov		r9=pr		};;
{ .mii	mov		r15=r33			// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib	mov		r16=r34			// bp
	mov		pr.rot=1<<16	}

.L_bn_sub_words_ctop:	;;
{ .mii	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	sub		r39=r37,r34
	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
{ .mfb	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
	(p58)	add		r41=-1,r41	} // (p20)
{ .mbb	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.b		0x0
	br.ctop.sptk	.L_bn_sub_words_ctop	};;
.L_bn_sub_words_cend:

{ .mii
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,-1
	mov		ar.lc=r3	}
{ .mbb	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sub_words#
#endif

#if 0
#define XMA_TEMPTATION
#endif

#if 1
//
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
.global	bn_mul_words#
.proc	bn_mul_words#
.align	64
.space	32	// makes the loop body aligned at 64-byte boundary
bn_mul_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi	alloc		r2=ar.pfs,4,4,0,8	};;
#endif
{ .mib	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0
(p6)	br.ret.spnt.many	b0		};;

	.save	ar.lc,r3
{ .mii	sub	r10=r34,r0,1
	mov	r3=ar.lc
	mov	r9=pr			};;

	.body
{ .mib	setf.sig	f8=r35	// w
	mov		pr.rot=0x400001<<16
			// ------^----- serves as (p48) at first (p26)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
					}

#ifndef XMA_TEMPTATION

{ .mii	mov		r14=r32	// rp
	mov		r15=r33	// ap
	mov		ar.lc=r10	}
{ .mii	mov		r39=0	// serves as r33 at first (p26)
	mov		ar.ec=12	}

// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2
// cache (i.e. 9 ticks away) as floating point load/store instructions
// bypass L1 cache and L2 latency is actually best-case scenario for
// ldf8. The loop is not scalable and shall run in 2*(n+11) even on
// "wider" IA-64 implementations. It's a trade-off here. n+22 loop
// would give us ~5% in *overall* performance improvement on "wider"
// IA-64, but would hurt Itanium for about same because of longer
// epilogue. As it's a matter of few percents in either case I've
// chosen to trade the scalability for development time (you can see
// this very instruction sequence in bn_mul_add_words loop which in
// turn is scalable).
.L_bn_mul_words_ctop:	;;
{ .mfi	(p25)	getf.sig	r36=f49			// low
	(p21)	xmpy.lu		f45=f37,f8
	(p27)	cmp.ltu		p52,p48=r39,r38	}
{ .mfi	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f38=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii	(p26)	getf.sig	r32=f43			// high
	.pred.rel	"mutex",p48,p52
	(p48)	add		r38=r37,r33		// (p26)
	(p52)	add		r38=r37,r33,1	}	// (p26)
{ .mfb	(p27)	st8		[r14]=r39,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

{ .mii	nop.m		0x0
.pred.rel	"mutex",p49,p53
(p49)	add		r8=r34,r0
(p53)	add		r8=r34,r0,1	}
{ .mfb	nop.m	0x0
	nop.f	0x0
	nop.b	0x0			}

#else	// XMA_TEMPTATION

	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is commented out? Indeed, it looks so
// neat that you find it hard to believe that it's something wrong
// with it, right? The catch is that every iteration depends on the
// result from previous one and the latter isn't available instantly.
// The loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
// that runs in 2*(n+11) where the low latency problem is worked around
// by moving the dependency to one-tick latent interger ALU. Note that
// "distance" between ldf8 and xma is not latency of ldf8, but the
// *difference* between xma and ldf8 latencies.
.L_bn_mul_words_ctop:	;;
{ .mfi	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii	nop.m		0x0
	mov		pr=r9,-1
	mov		ar.lc=r3	}
{ .mfb	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
//.space	0	// makes the loop split at 64-byte boundary
bn_mul_add_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r34,r0	};;
{ .mfb	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mii	sub	r10=r34,r0,1
	mov	r3=ar.lc
	mov	r9=pr			};;

	.body
{ .mib	setf.sig	f8=r35	// w
	mov		pr.rot=0x400001<<16
			// ------^----- serves as (p48) at first (p26)
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
					}
{ .mii	mov		r14=r32	// rp
	mov		r15=r33	// ap
	mov		ar.lc=r10	}
{ .mii	mov		r39=0	// serves as r33 at first (p26)
	mov		r18=r32	// rp copy
	mov		ar.ec=14	}

// This loop spins in 3*(n+13) ticks on Itanium and should spin in
// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
Commit	Line	Data
52c0d300	1	.explicit
4cb73bf8 AP	2	.text
	3	.asciz "ia64.S, Version 1.0"
	4	.asciz "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
	5
	6	//
	7	// ====================================================================
	8	// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
	9	// project.
	10	//
	11	// Rights for redistribution and usage in source and binary forms are
	12	// granted according to the OpenSSL license. Warranty of any kind is
	13	// disclaimed.
	14	// ====================================================================
	15	//
	16
	17	// Q. How much faster does it get?
	18	// A. Here is the output from 'openssl speed rsa dsa' for vanilla
	19	// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
	20	// Linux 7.1 2.96-81):
	21	//
	22	// sign verify sign/s verify/s
	23	// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
	24	// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
	25	// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
	26	// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
	27	// sign verify sign/s verify/s
	28	// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
	29	// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
	30	//
	31	// And here is similar output but for this assembler
	32	// implementation:-)
	33	//
	34	// sign verify sign/s verify/s
	35	// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
	36	// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
	37	// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
	38	// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
	39	// sign verify sign/s verify/s
	40	// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
	41	// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
	42	//
	43	// Yes, you may argue that it's not fair comparison as it's
	44	// possible to craft the C implementation with BN_UMULT_HIGH
	45	// inline assembler macro. But of course! Here is the output
	46	// with the macro:
	47	//
	48	// sign verify sign/s verify/s
	49	// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
	50	// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
	51	// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
	52	// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
	53	// sign verify sign/s verify/s
	54	// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
	55	// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
	56	//
	57	// My code is still way faster, huh:-) And I believe that even
	58	// higher performance can be achieved. Note that as keys get
	59	// longer, performance gain is larger. Why? According to the
	60	// profiler there is another player in the field, namely
	61	// BN_from_montgomery consuming larger and larger portion of CPU
	62	// time as keysize decreases. I therefore consider putting effort
	63	// to assembler implementation of the following routine:
	64	//
	65	// void bn_mul_add_mont (BN_ULONG rp,BN_ULONG np,int nl,BN_ULONG n0)
66	// {
67	// int i,j;
68	// BN_ULONG v;
69	//
70	// for (i=0; i<nl; i++)
71	// {
72	// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
73	// nrp++;
74	// rp++;
75	// if (((nrp[-1]+=v)&BN_MASK2) < v)
76	// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
77	// }
78	// }
79	//
80	// It might as well be beneficial to implement even combaX
81	// variants, as it appears as it can literally unleash the
82	// performance (see comment section to bn_mul_comba8 below).
83	//
84	// And finally for your reference the output for 0.9.6a compiled
85	// with SGIcc version 0.01.0-12 (keep in mind that for the moment
86	// of this writing it's not possible to convince SGIcc to use
87	// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
88	// i.e. for a compiler generated one:-):
89	//
90	// sign verify sign/s verify/s
91	// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
92	// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
93	// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
94	// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
95	// sign verify sign/s verify/s
96	// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
97	// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
98	//
99	// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
100	// system running Redhat Linux 7.1 (very special thanks to Ray
101	// McCaffity of Williams Communications for providing an account).
102	//
103	// Q. What's the heck with 'rum 1<<5' at the end of every function?
104	// A. Well, by clearing the "upper FP registers written" bit of the
105	// User Mask I want to excuse the kernel from preserving upper
106	// (f32-f128) FP register bank over process context switch, thus
107	// minimizing bus bandwidth consumption during the switch (i.e.
108	// after PKI opration completes and the program is off doing
109	// something else like bulk symmetric encryption). Having said
110	// this, I also want to point out that it might be good idea
111	// to compile the whole toolkit (as well as majority of the
112	// programs for that matter) with -mfixed-range=f32-f127 command
113	// line option. No, it doesn't prevent the compiler from writing
114	// to upper bank, but at least discourages to do so. If you don't
115	// like the idea you have the option to compile the module with
116	// -Drum=nop.m in command line.
117	//
118
119	#if 1
120	//
121	// bn_[add\|sub]_words routines.
122	//
123	// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
124	// data reside in L1 cache, i.e. 2 ticks away). It's possible to
125	// compress the epilogue and get down to 2*n+6, but at the cost of
126	// scalability (the neat feature of this implementation is that it
127	// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
128	// I consider that the epilogue is short enough as it is to trade tiny
129	// performance loss on Itanium for scalability.
130	//
131	// BN_ULONG bn_add_words(BN_ULONG rp, BN_ULONG ap, BN_ULONG *bp,int num)
132	//
133	.global bn_add_words#
134	.proc bn_add_words#
135	.align 64
136	.space 32 // makes the loop body aligned at 64-byte boundary
137	bn_add_words:
138	.prologue
139	.fframe 0
140	.save ar.pfs,r2
141	{ .mii alloc r2=ar.pfs,4,12,0,16
142	cmp4.le p6,p0=r35,r0 };;
143	{ .mfb mov r8=r0 // return value
144	(p6) br.ret.spnt.many b0 };;
145
146	.save ar.lc,r3
147	{ .mib sub r10=r35,r0,1
148	mov r3=ar.lc
149	brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
150	}
151	.body
152	{ .mib mov r14=r32 // rp
153	mov r9=pr };;
154	{ .mii mov r15=r33 // ap
155	mov ar.lc=r10
156	mov ar.ec=6 }
157	{ .mib mov r16=r34 // bp
158	mov pr.rot=1<<16 }
159
160	.L_bn_add_words_ctop: ;;
161	{ .mii (p16) ld8 r32=[r16],8 // b=*(bp++)
162	(p18) add r39=r37,r34
163	(p19) cmp.ltu.unc p56,p0=r40,r38 }
164	{ .mfb (p0) nop.m 0x0
165	(p0) nop.f 0x0
166	(p0) nop.b 0x0 }
167	{ .mii (p16) ld8 r35=[r15],8 // a=*(ap++)
168	(p58) cmp.eq.or p57,p0=-1,r41 // (p20)
169	(p58) add r41=1,r41 } // (p20)
170	{ .mfb (p21) st8 [r14]=r42,8 // *(rp++)=r
171	(p0) nop.f 0x0
172	br.ctop.sptk .L_bn_add_words_ctop };;
173	.L_bn_add_words_cend:
174
175	{ .mii
176	(p59) add r8=1,r8 // return value
177	mov pr=r9,-1
178	mov ar.lc=r3 }
179	{ .mbb nop.b 0x0
180	br.ret.sptk.many b0 };;
181	.endp bn_add_words#
182
183	//
184	// BN_ULONG bn_sub_words(BN_ULONG rp, BN_ULONG ap, BN_ULONG *bp,int num)
185	//
186	.global bn_sub_words#
187	.proc bn_sub_words#
188	.align 64
189	.space 32 // makes the loop body aligned at 64-byte boundary
190	bn_sub_words:
191	.prologue
192	.fframe 0
193	.save ar.pfs,r2
194	{ .mii alloc r2=ar.pfs,4,12,0,16
195	cmp4.le p6,p0=r35,r0 };;
196	{ .mfb mov r8=r0 // return value
197	(p6) br.ret.spnt.many b0 };;
198
199	.save ar.lc,r3
200	{ .mib sub r10=r35,r0,1
201	mov r3=ar.lc
202	brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
203	}
204	.body
205	{ .mib mov r14=r32 // rp
206	mov r9=pr };;
207	{ .mii mov r15=r33 // ap
208	mov ar.lc=r10
209	mov ar.ec=6 }
210	{ .mib mov r16=r34 // bp
211	mov pr.rot=1<<16 }
212
213	.L_bn_sub_words_ctop: ;;
214	{ .mii (p16) ld8 r32=[r16],8 // b=*(bp++)
215	(p18) sub r39=r37,r34
216	(p19) cmp.gtu.unc p56,p0=r40,r38 }
217	{ .mfb (p0) nop.m 0x0
218	(p0) nop.f 0x0
219	(p0) nop.b 0x0 }
220	{ .mii (p16) ld8 r35=[r15],8 // a=*(ap++)
221	(p58) cmp.eq.or p57,p0=0,r41 // (p20)
222	(p58) add r41=-1,r41 } // (p20)
223	{ .mbb (p21) st8 [r14]=r42,8 // *(rp++)=r
224	(p0) nop.b 0x0
225	br.ctop.sptk .L_bn_sub_words_ctop };;
226	.L_bn_sub_words_cend:
227
228	{ .mii
229	(p59) add r8=1,r8 // return value
230	mov pr=r9,-1
231	mov ar.lc=r3 }
232	{ .mbb nop.b 0x0
233	br.ret.sptk.many b0 };;
234	.endp bn_sub_words#
235	#endif
236
237	#if 0
238	#define XMA_TEMPTATION
239	#endif
240
241	#if 1
242	//
243	// BN_ULONG bn_mul_words(BN_ULONG rp, BN_ULONG ap, int num, BN_ULONG w)
244	//
245	.global bn_mul_words#
246	.proc bn_mul_words#
247	.align 64
248	.space 32 // makes the loop body aligned at 64-byte boundary
249	bn_mul_words:
250	.prologue
251	.fframe 0
252	.save ar.pfs,r2
253	#ifdef XMA_TEMPTATION
254	{ .mfi alloc r2=ar.pfs,4,0,0,0 };;
255	#else
256	{ .mfi alloc r2=ar.pfs,4,4,0,8 };;
257	#endif
258	{ .mib mov r8=r0 // return value
259	cmp4.le p6,p0=r34,r0
260	(p6) br.ret.spnt.many b0 };;
261
262	.save ar.lc,r3
263	{ .mii sub r10=r34,r0,1
264	mov r3=ar.lc
265	mov r9=pr };;
266
267	.body
268	{ .mib setf.sig f8=r35 // w
269	mov pr.rot=0x400001<<16
270	// ------^----- serves as (p48) at first (p26)
271	brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
272	}
273
274	#ifndef XMA_TEMPTATION
275
276	{ .mii mov r14=r32 // rp
277	mov r15=r33 // ap
278	mov ar.lc=r10 }
279	{ .mii mov r39=0 // serves as r33 at first (p26)
280	mov ar.ec=12 }
281
282	// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2
283	// cache (i.e. 9 ticks away) as floating point load/store instructions
284	// bypass L1 cache and L2 latency is actually best-case scenario for
285	// ldf8. The loop is not scalable and shall run in 2*(n+11) even on
286	// "wider" IA-64 implementations. It's a trade-off here. n+22 loop
287	// would give us ~5% in overall performance improvement on "wider"
288	// IA-64, but would hurt Itanium for about same because of longer
289	// epilogue. As it's a matter of few percents in either case I've
290	// chosen to trade the scalability for development time (you can see
291	// this very instruction sequence in bn_mul_add_words loop which in
292	// turn is scalable).
293	.L_bn_mul_words_ctop: ;;
294	{ .mfi (p25) getf.sig r36=f49 // low
295	(p21) xmpy.lu f45=f37,f8
296	(p27) cmp.ltu p52,p48=r39,r38 }
297	{ .mfi (p16) ldf8 f32=[r15],8
298	(p21) xmpy.hu f38=f37,f8
299	(p0) nop.i 0x0 };;
300	{ .mii (p26) getf.sig r32=f43 // high
52c0d300	301	.pred.rel "mutex",p48,p52
4cb73bf8 AP	302	(p48) add r38=r37,r33 // (p26)
	303	(p52) add r38=r37,r33,1 } // (p26)
	304	{ .mfb (p27) st8 [r14]=r39,8
	305	(p0) nop.f 0x0
	306	br.ctop.sptk .L_bn_mul_words_ctop };;
	307	.L_bn_mul_words_cend:
	308
	309	{ .mii nop.m 0x0
52c0d300	310	.pred.rel "mutex",p49,p53
4cb73bf8 AP	311	(p49) add r8=r34,r0
	312	(p53) add r8=r34,r0,1 }
	313	{ .mfb nop.m 0x0
	314	nop.f 0x0
	315	nop.b 0x0 }
	316
	317	#else // XMA_TEMPTATION
	318
	319	setf.sig f37=r0 // serves as carry at (p18) tick
	320	mov ar.lc=r10
	321	mov ar.ec=5
	322
	323	// Most of you examining this code very likely wonder why in the name
	324	// of Intel the following loop is commented out? Indeed, it looks so
	325	// neat that you find it hard to believe that it's something wrong
	326	// with it, right? The catch is that every iteration depends on the
	327	// result from previous one and the latter isn't available instantly.
	328	// The loop therefore spins at the latency of xma minus 1, or in other
	329	// words at 6*(n+4) ticks:-( Compare to the "production" loop above
	330	// that runs in 2*(n+11) where the low latency problem is worked around
	331	// by moving the dependency to one-tick latent interger ALU. Note that
	332	// "distance" between ldf8 and xma is not latency of ldf8, but the
	333	// difference between xma and ldf8 latencies.
	334	.L_bn_mul_words_ctop: ;;
	335	{ .mfi (p16) ldf8 f32=[r33],8
	336	(p18) xma.hu f38=f34,f8,f39 }
	337	{ .mfb (p20) stf8 [r32]=f37,8
	338	(p18) xma.lu f35=f34,f8,f39
	339	br.ctop.sptk .L_bn_mul_words_ctop };;
	340	.L_bn_mul_words_cend:
	341
	342	getf.sig r8=f41 // the return value
	343
	344	#endif // XMA_TEMPTATION
	345
	346	{ .mii nop.m 0x0
	347	mov pr=r9,-1
	348	mov ar.lc=r3 }
	349	{ .mfb rum 1<<5 // clear um.mfh
	350	nop.f 0x0
	351	br.ret.sptk.many b0 };;
	352	.endp bn_mul_words#
	353	#endif
	354
	355	#if 1
	356	//
	357	// BN_ULONG bn_mul_add_words(BN_ULONG rp, BN_ULONG ap, int num, BN_ULONG w)
	358	//
	359	.global bn_mul_add_words#
	360	.proc bn_mul_add_words#
	361	.align 64
	362	//.space 0 // makes the loop split at 64-byte boundary
	363	bn_mul_add_words:
	364	.prologue
	365	.fframe 0
	366	.save ar.pfs,r2
	367	{ .mii alloc r2=ar.pfs,4,12,0,16
	368	cmp4.le p6,p0=r34,r0 };;
	369	{ .mfb mov r8=r0 // return value
	370	(p6) br.ret.spnt.many b0 };;
	371
	372	.save ar.lc,r3
	373	{ .mii sub r10=r34,r0,1
	374	mov r3=ar.lc
375	mov r9=pr };;
376
377	.body
378	{ .mib setf.sig f8=r35 // w
379	mov pr.rot=0x400001<<16
380	// ------^----- serves as (p48) at first (p26)
381	brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
382	}
383	{ .mii mov r14=r32 // rp
384	mov r15=r33 // ap
385	mov ar.lc=r10 }
386	{ .mii mov r39=0 // serves as r33 at first (p26)
387	mov r18=r32 // rp copy
388	mov ar.ec=14 }
389
390	// This loop spins in 3*(n+13) ticks on Itanium and should spin in
391	// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
392