[thirdparty/linux.git] / arch / alpha / lib / ev6-copy_page.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.
 
   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order. 

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	Without care, this results in many nasty memory traps.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream. Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <asm/export.h>
	.text
	.align 4
/*
 * copy_page: copy one page, 64 bytes (eight quadwords) per loop pass.
 *
 * Register usage, as visible in the code below:
 *	$16	destination pointer (base of every stq); advanced by 64
 *		per pass
 *	$17	source pointer (base of every ldq); advanced by 64 per pass
 *	$18	pass counter: 118 for the main loop, then 10 for the
 *		cleanup loop
 *	$19	write-hint pointer, kept 10*64 bytes ahead of $16
 *	$0-$7	the eight quadwords of the block in flight; $1 and $2 are
 *		also used as scratch addresses for wh64 before the loop
 *
 * In total 118 + 10 = 128 blocks of 64 bytes, i.e. 8192 bytes, are
 * copied.  No stack is used and no register is saved or restored; on
 * return $16 and $17 have each advanced by 8192 bytes.
 *
 * The instructions are laid out in groups of four on purpose (see the
 * "aeration" discussion at the top of this file).  Do not reorder them
 * or add/remove unop/nop padding without re-measuring.
 */
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.
	   The read prefetches are the ldl-into-$31 instructions at source
	   offsets 0, 64, ..., 256; the write hints are the wh64
	   instructions on destination lines 0 through 9.  $1 and $2 only
	   hold the addresses handed to wh64.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118		/* main loop pass count */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)	/* next line to write-hint: dest line 10 */
	nop

	/* Main prefetching/write-hinting loop.
	   Each pass loads 64 bytes from $17 into $0-$7, prefetches the
	   source line 5*64 bytes ahead, write-hints the destination line
	   at $19 (10*64 bytes ahead of $16), and then stores the 64 bytes
	   to $16.  The body is 11 groups of four instructions; the unops
	   are deliberate padding.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)	/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop			
	unop
	unop
	unop

	wh64	($19)		/* write hint, 10 lines ahead of $16 */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18	/* one fewer pass remaining */
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17	/* all loads for this pass are done */
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19	/* advance write-hint pointer */
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16	/* all stores for this pass are done */
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.
	   $17 has advanced by 118 lines, so offsets 320..576 address
	   source lines 123..127.  Every destination line has already been
	   write-hinted (10 before the loop plus 118 inside it).  */
	lda	$18,10		/* cleanup loop pass count */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  Same 64-byte load/store pattern as the
	   main loop, without the padding, prefetch or write hint.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18	/* one fewer pass remaining */
	stq	$1,8($16)
	addq	$17,64,$17	/* all loads for this pass are done */

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16	/* all stores for this pass are done */
	bne	$18, 2b

	ret
	/* NOTE(review): the three instructions below are never reached;
	   presumably they pad out the final group of four -- confirm
	   before removing.  */
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)
Commit	Line	Data
b2441318	1	/* SPDX-License-Identifier: GPL-2.0 */
1da177e4 LT	2	/*
	3	* arch/alpha/lib/ev6-copy_page.S
	4	*
	5	* Copy an entire page.
	6	*/
	7
	8	/* The following comparison of this routine vs the normal copy_page.S
	9	was written by an unnamed ev6 hardware designer and forwarded to me
	10	via Steven Hobbs <hobbs@steven.zko.dec.com>.
	11
	12	First Problem: STQ overflows.
	13	-----------------------------
	14
	15	It would be nice if EV6 handled every resource overflow efficiently,
	16	but for some it doesn't. Including store queue overflows. It causes
	17	a trap and a restart of the pipe.
	18
	19	To get around this we sometimes use (to borrow a term from a VSSAD
	20	researcher) "aeration". The idea is to slow the rate at which the
	21	processor receives valid instructions by inserting nops in the fetch
	22	path. In doing so, you can prevent the overflow and actually make
	23	the code run faster. You can, of course, take advantage of the fact
	24	that the processor can fetch at most 4 aligned instructions per cycle.
	25
	26	I inserted enough nops to force it to take 10 cycles to fetch the
	27	loop code. In theory, EV6 should be able to execute this loop in
	28	9 cycles but I was not able to get it to run that fast -- the initial
	29	conditions were such that I could not reach this optimum rate on
	30	(chaotic) EV6. I wrote the code such that everything would issue
	31	in order.
	32
	33	Second Problem: Dcache index matches.
	34	-------------------------------------
	35
	36	If you are going to use this routine on random aligned pages, there
	37	is a 25% chance that the pages will be at the same dcache indices.
	38	This results in many nasty memory traps without care.
	39
	40	The solution is to schedule the prefetches to avoid the memory
	41	conflicts. I schedule the wh64 prefetches farther ahead of the
	42	read prefetches to avoid this problem.
	43
	44	Third Problem: Needs more prefetching.
	45	--------------------------------------
	46
	47	In order to improve the code I added deeper prefetching to take the
	48	most advantage of EV6's bandwidth.
	49
	50	I also prefetched the read stream. Note that adding the read prefetch
	51	forced me to add another cycle to the inner-most kernel - up to 11
	52	from the original 8 cycles per iteration. We could improve performance
	53	further by unrolling the loop and doing multiple prefetches per cycle.
	54
	55	I think that the code below will be very robust and fast code for the
	56	purposes of copying aligned pages. It is slower when both source and
	57	destination pages are in the dcache, but it is my guess that this is
	58	less important than the dcache miss case. */
	59
00fc0e0d	60	#include <asm/export.h>
1da177e4 LT	61	.text
	62	.align 4
	63	.global copy_page
	64	.ent copy_page
	65	copy_page:
	66	.prologue 0
	67
	68	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
	69	wh64 ($16)
	70	ldl $31,0($17)
	71	ldl $31,64($17)
	72	lda $1,1*64($16)
	73
	74	wh64 ($1)
	75	ldl $31,128($17)
	76	ldl $31,192($17)
	77	lda $1,2*64($16)
	78
	79	wh64 ($1)
	80	ldl $31,256($17)
	81	lda $18,118
	82	lda $1,3*64($16)
	83
	84	wh64 ($1)
	85	nop
	86	lda $1,4*64($16)
	87	lda $2,5*64($16)
	88
	89	wh64 ($1)
	90	wh64 ($2)
	91	lda $1,6*64($16)
	92	lda $2,7*64($16)
	93
	94	wh64 ($1)
	95	wh64 ($2)
	96	lda $1,8*64($16)
	97	lda $2,9*64($16)
	98
	99	wh64 ($1)
	100	wh64 ($2)
	101	lda $19,10*64($16)
	102	nop
	103
	104	/* Main prefetching/write-hinting loop. */
	105	1: ldq $0,0($17)
	106	ldq $1,8($17)
	107	unop
	108	unop
	109
	110	unop
	111	unop
	112	ldq $2,16($17)
	113	ldq $3,24($17)
	114
	115	ldq $4,32($17)
	116	ldq $5,40($17)
	117	unop
	118	unop
	119
	120	unop
	121	unop
	122	ldq $6,48($17)
	123	ldq $7,56($17)
	124
125	ldl $31,320($17)
126	unop
127	unop
128	unop
129
130	/* This gives the extra cycle of aeration above the minimum. */
131	unop
132	unop
133	unop
134	unop
135
136	wh64 ($19)
137	unop
138	unop
139	unop
140
141	stq $0,0($16)
142	subq $18,1,$18
143	stq $1,8($16)
144	unop
145
146	unop
147	stq $2,16($16)
148	addq $17,64,$17
149	stq $3,24($16)
150
151	stq $4,32($16)
152	stq $5,40($16)
153	addq $19,64,$19
154	unop
155
156	stq $6,48($16)
157	stq $7,56($16)
158	addq $16,64,$16
159	bne $18, 1b
160
161	/* Prefetch the final 5 cache lines of the read stream. */
162	lda $18,10
163	ldl $31,320($17)
164	ldl $31,384($17)
165	ldl $31,448($17)
166
167	ldl $31,512($17)
168	ldl $31,576($17)
169	nop
170	nop
171
172	/* Non-prefetching, non-write-hinting cleanup loop for the
173	final 10 cache lines. */
174	2: ldq $0,0($17)
175	ldq $1,8($17)
176	ldq $2,16($17)
177	ldq $3,24($17)
178
179	ldq $4,32($17)
180	ldq $5,40($17)
181	ldq $6,48($17)
182	ldq $7,56($17)
183
184	stq $0,0($16)
185	subq $18,1,$18
186	stq $1,8($16)
187	addq $17,64,$17
188
189	stq $2,16($16)
190	stq $3,24($16)
191	stq $4,32($16)
192	stq $5,40($16)
193
194	stq $6,48($16)
195	stq $7,56($16)
196	addq $16,64,$16
197	bne $18, 2b
198
199	ret
200	nop
201	unop
202	nop
203
204	.end copy_page
00fc0e0d	205	EXPORT_SYMBOL(copy_page)