[thirdparty/gcc.git] / libgcc / config / sh / lib1funcs-Os-4-200.S

/* Copyright (C) 2006-2022 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Moderately Space-optimized libgcc routines for the Renesas SH /
   STMicroelectronics ST40 CPUs.
   Contributed by J"orn Rennecke joern.rennecke@st.com.  */

#include "lib1funcs.h"

#ifdef L_udivsi3_i4i

/* 88 bytes; sh4-200 cycle counts:
   divisor  >= 2G: 11 cycles
   dividend <  2G: 48 cycles
   dividend >= 2G, divisor != 1: 54 cycles
   dividend >= 2G, divisor == 1: 22 cycles */
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* udivsi3_i4i (FPU variant): unsigned 32-bit divide carried out in double
   precision.
     In:  r4 = dividend, r5 = divisor
     Out: r0 = quotient
   r0 doubles as the pointer into the constant pool at L1, r1 is scratch.
   fpscr, fpul, dr0 and dr2 are preserved for the caller.  */
!! args in r4 and r5, result in r0, clobber r1

	.global GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	/* r0 = address of the constant pool at L1.  */
	mova L1,r0
	/* T = divisor is non-negative when read as signed, i.e. below 2G.  */
	cmp/pz r5
	/* Keep the caller fpscr in r1, then load the fpscr word stored at L1.
	   The post-increment leaves r0 pointing at the .double that follows.  */
	sts fpscr,r1
	lds.l @r0+,fpscr
	/* Push fpul; every path below accounts for this slot.  */
	sts.l fpul,@-r15
	bf LOCAL(huge_divisor)
	/* Push the caller fpscr so that r1 becomes free.  */
	mov.l r1,@-r15
	lds r4,fpul
	/* T = dividend is non-negative when read as signed, i.e. below 2G.
	   None of the instructions before the next branch writes T.  */
	cmp/pz r4
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15
	/* dr0 = dividend converted as a signed integer.  */
	float fpul,dr0
	fmov.d dr2,@-r15
	bt LOCAL(dividend_adjusted)
	/* Dividend >= 2G: the signed conversion came out 2^32 too small.  */
	mov #1,r1
	/* dr2 = 4294967296.0 from the constant pool.  */
	fmov.d @r0,dr2
	/* Divisor 1 skips both the correction and the division.
	   NOTE(review): presumably because the full unsigned quotient would
	   not fit the signed result of ftrc - confirm.  */
	cmp/eq r1,r5
	bt LOCAL(div_by_1)
	fadd dr2,dr0
LOCAL(dividend_adjusted):
	/* dr2 = divisor (known to be below 2G here), dr0 = dr0 / dr2.  */
	lds r5,fpul
	float fpul,dr2
	fdiv dr2,dr0
LOCAL(div_by_1):
	/* Pop dr2, convert the result to an integer in fpul, pop dr0.  When
	   reached through the divisor == 1 branch, dr0 still holds the signed
	   conversion of r4.  */
	fmov.d @r15+,dr2
	ftrc dr0,fpul
	fmov.d @r15+,dr0
#else /* !FMOVD_WORKS */
	/* Same algorithm as above, but dr0/dr2 are saved, loaded and restored
	   with pairs of 32-bit fmov.s.  DR00/DR01/DR20/DR21 are defined outside
	   this file; presumably they name the halves of dr0 and dr2 in memory
	   order - confirm in lib1funcs.h.  */
	fmov.s DR01,@-r15
	mov #1,r1
	fmov.s DR00,@-r15
	/* dr0 = dividend converted as a signed integer.  */
	float fpul,dr0
	fmov.s DR21,@-r15
	bt/s LOCAL(dividend_adjusted)
	/* (delay slot) runs on both paths: finishes pushing dr2.  */
	fmov.s DR20,@-r15
	/* Dividend >= 2G: divisor 1 skips the correction and the division.  */
	cmp/eq r1,r5
	bt LOCAL(div_by_1)
	/* dr2 = 4294967296.0, read as two words from the constant pool.  */
	fmov.s @r0+,DR20
	fmov.s @r0,DR21
	fadd dr2,dr0
LOCAL(dividend_adjusted):
	/* dr2 = divisor (known to be below 2G here), dr0 = dr0 / dr2.  */
	lds r5,fpul
	float fpul,dr2
	fdiv dr2,dr0
LOCAL(div_by_1):
	/* Pop dr2, convert the result to an integer in fpul, pop dr0.  */
	fmov.s @r15+,DR20
	fmov.s @r15+,DR21
	ftrc dr0,fpul
	fmov.s @r15+,DR00
	fmov.s @r15+,DR01
#endif /* !FMOVD_WORKS */
	/* Restore the caller fpscr, fetch the quotient, and restore fpul in
	   the delay slot of the return.  */
	lds.l @r15+,fpscr
	sts fpul,r0
	rts
	lds.l @r15+,fpul

#ifdef FMOVD_WORKS
	.p2align 3        ! make double below 8 byte aligned.
#endif
/* Divisor >= 2G: the quotient can only be 0 or 1.  Only fpul has been
   pushed at this point and it has not been modified, so its stack slot is
   simply dropped.  */
LOCAL(huge_divisor):
	/* Put the caller fpscr back from r1.  */
	lds r1,fpscr
	add #4,r15
	/* T = (dividend >= divisor), unsigned; returned as 0 or 1 in r0.  */
	cmp/hs r5,r4
	rts
	movt r0

	.p2align 2
/* Constant pool: the fpscr word loaded on entry, followed by 2^32 as a
   double.  The two fpscr values differ only in bit 20.
   NOTE(review): bit 19 is presumably PR (double precision) and bit 20 SZ
   (64-bit fmov transfers) - confirm against the SH-4 FPSCR layout.  */
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif
	.double 4294967296

	ENDFUNC(GLOBAL(udivsi3_i4i))
#elif !defined (__sh1__)  /* !__SH_FPU_DOUBLE__ */

#if 0
/* With 36 bytes, the following would probably be the most compact
   implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
/* Not assembled: everything up to the matching #endif is excluded by the
   #if 0 above.  The routine is a single loop built from rotcr, rotcl and
   div1 that works in r0-r3; r2 and r3 are pushed on entry and popped again
   on the way out (r2 in the delay slot of the rts).  */
GLOBAL(udivsi3_i4i):
	mov.l r2,@-r15
	mov #0,r1
	div0u
	mov r1,r2
	mov.l r3,@-r15
	mov r1,r3
	sett
	mov r4,r0
LOCAL(loop):
	rotcr r2
	;
	bt/s LOCAL(end)
	cmp/gt r2,r3
	rotcl r0
	bra LOCAL(loop)
	div1 r5,r1
LOCAL(end):
	rotcl r0
	mov.l @r15+,r3
	rts
	mov.l @r15+,r2
#endif /* 0 */

/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
   sh4-200 run times:
   udiv small divisor: 55 cycles
   udiv large divisor: 52 cycles
   sdiv small divisor, positive result: 59 cycles
   sdiv large divisor, positive result: 56 cycles
   sdiv small divisor, negative result: 65 cycles (*)
   sdiv large divisor, negative result: 62 cycles (*)
   (*): r2 is restored in the rts delay slot and has a lingering latency
        of two more cycles.  */
	.balign 4
	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
	FUNC(GLOBAL(sdivsi3_i4i))
/* udivsi3_i4i (integer variant): unsigned r4 / r5 built from div1 steps.
     In:  r4 = dividend, r5 = divisor
     Out: r0 = quotient
   r1 holds the return address, because the bsr calls below overwrite pr;
   the routine therefore leaves through jmp @r1.  r4 and r5 are pushed on
   entry and popped again before returning.  */
GLOBAL(udivsi3_i4i):
	sts pr,r1
	mov.l r4,@-r15
	extu.w r5,r0
	/* T = divisor fits in 16 bits.  */
	cmp/eq r5,r0
	/* r0 = dividend with its 16-bit halves swapped.  */
	swap.w r4,r0
	/* r4 = upper half of the dividend.  */
	shlr16 r4
	bf/s LOCAL(large_divisor)
	/* (delay slot) runs on both paths: start an unsigned div1 sequence.  */
	div0u
	mov.l r5,@-r15
	/* Move the 16-bit divisor into the upper half for the div1 steps.  */
	shll16 r5
/* Divisor below 64K.  Also entered from sdivsi3_i4i with
     r4 = upper half of the dividend, r0 = half-swapped dividend,
     r5 = divisor << 16, r1 = address to jump to when done,
   and with the original r4 and r5 on the stack.
   Two rounds of 16 div1 steps follow; part of each round lives in the
   div6/div7 helpers and in bsr delay slots.
   NOTE(review): the xtrct/swap.w shuffle between the rounds appears to pair
   the remainder of the first round with the lower dividend half and to
   collect both quotient halves in r0 - confirm against the SH div1
   description.  */
LOCAL(sdiv_small_divisor):
	/* Round 1: 1 + (1 + 6) + 1 + (1 + 6) = 16 steps.  */
	div1 r5,r4
	bsr LOCAL(div6)
	div1 r5,r4
	div1 r5,r4
	bsr LOCAL(div6)
	div1 r5,r4
	xtrct r4,r0
	xtrct r0,r4
	/* Round 2: 7 + 1 + (1 + 7) = 16 steps; the delay slot of the first bsr
	   holds the swap.w instead of a div1.  */
	bsr LOCAL(div7)
	swap.w r4,r4
	div1 r5,r4
	bsr LOCAL(div7)
	div1 r5,r4
	xtrct r4,r0
	/* Restore r5 and r4 around the final half swap, then leave through r1.  */
	mov.l @r15+,r5
	swap.w r0,r0
	mov.l @r15+,r4
	jmp @r1
	/* (delay slot) rotate the T bit left by the last div1 into r0.  */
	rotcl r0
/* Helpers: div7 executes seven div1 steps, div6 six; in both the last step
   sits in the delay slot of the rts.  */
LOCAL(div7):
	div1 r5,r4
LOCAL(div6):
	            div1 r5,r4; div1 r5,r4; div1 r5,r4
	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4

/* Helper for the large-divisor path: three rotcl/div1 pairs, the last div1
   in the delay slot of the rts.  */
LOCAL(divx3):
	rotcl r0
	div1 r5,r4
	rotcl r0
	div1 r5,r4
	rotcl r0
	rts
	div1 r5,r4

/* Divisor of 64K or more (unsigned entry): push r5 so that the stack
   matches what the shared tail below pops.  */
LOCAL(large_divisor):
	mov.l r5,@-r15
/* Shared with sdivsi3_i4i.  On entry r0 = half-swapped dividend and
   r4 = upper half of the dividend, so the xor clears the lower half of r0
   and leaves the lower dividend half in its upper 16 bits.  */
LOCAL(sdiv_large_divisor):
	xor r4,r0
	/* Four repetitions of rotcl + (div1 + three rotcl/div1 pairs in divx3)
	   give 16 rotcl/div1 pairs in total.  */
	.rept 4
	rotcl r0
	bsr LOCAL(divx3)
	div1 r5,r4
	.endr
	mov.l @r15+,r5
	mov.l @r15+,r4
	jmp @r1
	/* (delay slot) rotate the T bit left by the last div1 into r0.  */
	rotcl r0
	ENDFUNC(GLOBAL(udivsi3_i4i))

/* sdivsi3_i4i (integer variant): signed r4 / r5.
     In:  r4 = dividend, r5 = divisor
     Out: r0 = quotient
   Negative operands are negated, the unsigned code above does the division,
   and the quotient is negated afterwards when exactly one operand was
   negative.  r4 and r5 are pushed here and popped by the shared tails.
   On the negative-result path r2 is used for the return address and its
   old value is parked in macl, so macl is overwritten on that path.  */
	.global	GLOBAL(sdivsi3_i4i)
GLOBAL(sdivsi3_i4i):
	mov.l r4,@-r15
	/* T = divisor >= 0.  */
	cmp/pz r5
	mov.l r5,@-r15
	bt/s LOCAL(pos_divisor)
	/* (delay slot) runs on both paths: T = dividend >= 0.  */
	cmp/pz r4
	/* Divisor is negative: use its negation.  */
	neg r5,r5
	extu.w r5,r0
	bt/s LOCAL(neg_result)
	/* (delay slot) runs on both paths: T = negated divisor fits in 16 bits;
	   this T is what sdiv_check_divisor branches on.  */
	cmp/eq r5,r0
	/* Both operands negative: negate the dividend too, result is positive.  */
	neg r4,r4
LOCAL(pos_result):
	swap.w r4,r0
	bra LOCAL(sdiv_check_divisor)
	/* (delay slot) r1 = real return address.  */
	sts pr,r1
LOCAL(pos_divisor):
	extu.w r5,r0
	bt/s LOCAL(pos_result)
	/* (delay slot) runs on both paths: T = divisor fits in 16 bits.  */
	cmp/eq r5,r0
	/* Dividend negative, divisor positive: negate the dividend and fall
	   through to the negative-result setup.  */
	neg r4,r4
LOCAL(neg_result):
	/* Make the shared division code "return" to negate_result: r1 gets that
	   address, the real return address goes to r2, and the old r2 is kept
	   in macl.  */
	mova LOCAL(negate_result),r0
	;
	mov r0,r1
	swap.w r4,r0
	lds r2,macl
	sts pr,r2
LOCAL(sdiv_check_divisor):
	/* Same register setup as the unsigned entry; T still holds the
	   "divisor fits in 16 bits" result computed above.  */
	shlr16 r4
	bf/s LOCAL(sdiv_large_divisor)
	/* (delay slot) runs on both paths.  */
	div0u
	bra LOCAL(sdiv_small_divisor)
	/* (delay slot) divisor into the upper half for the small-divisor code.  */
	shll16 r5
	/* NOTE(review): presumably needed because mova can only address a
	   4-byte aligned target - confirm.  */
	.balign 4
LOCAL(negate_result):
	neg r0,r0
	jmp @r2
	/* (delay slot) restore r2 from macl.  */
	sts macl,r2
	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* !__SH_FPU_DOUBLE__ */
#endif /* L_udivsi3_i4i */

#ifdef L_sdivsi3_i4i
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* 48 bytes, 45 cycles on sh4-200  */
/* sdivsi3_i4i (FPU variant): signed 32-bit divide carried out in double
   precision.
     In:  r4 = dividend, r5 = divisor
     Out: r0 = quotient
   r0 is first used as the pointer to the fpscr word at L1, r1 keeps the
   caller fpul.  fpscr, fpul, dr0 and dr2 are preserved for the caller.  */
!! args in r4 and r5, result in r0, clobber r1

	.global GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
	/* Push the caller fpscr and keep the caller fpul in r1.  */
	sts.l fpscr,@-r15
	sts fpul,r1
	/* Load the fpscr word stored at L1.  */
	mova L1,r0
	lds.l @r0+,fpscr
	lds r4,fpul
	/* Save dr0 and dr2, interleaved with the dividend conversion:
	   dr0 = dividend converted as a signed integer, fpul = divisor.  */
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15
	float fpul,dr0
	lds r5,fpul
	fmov.d dr2,@-r15
#else
	/* DR00/DR01/DR20/DR21 are defined outside this file; presumably they
	   name the 32-bit halves of dr0 and dr2 - confirm in lib1funcs.h.  */
	fmov.s DR01,@-r15
	fmov.s DR00,@-r15
	float fpul,dr0
	lds r5,fpul
	fmov.s DR21,@-r15
	fmov.s DR20,@-r15
#endif
	/* dr2 = divisor converted as a signed integer, dr0 = dr0 / dr2.  */
	float fpul,dr2
	fdiv dr2,dr0
	/* Pop dr2.  */
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr2
#else
	fmov.s @r15+,DR20
	fmov.s @r15+,DR21
#endif
	/* Convert the quotient to an integer in fpul before dr0 is popped.  */
	ftrc dr0,fpul
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr0
#else
	fmov.s @r15+,DR00
	fmov.s @r15+,DR01
#endif
	/* Restore the caller fpscr, fetch the quotient, and restore fpul from
	   r1 in the delay slot of the return.  */
	lds.l @r15+,fpscr
	sts fpul,r0
	rts
	lds r1,fpul

	.p2align 2
/* fpscr word loaded on entry; the two values differ only in bit 20.
   NOTE(review): bit 19 is presumably PR (double precision) and bit 20 SZ
   (64-bit fmov transfers) - confirm against the SH-4 FPSCR layout.  */
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif

	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* __SH_FPU_DOUBLE__ */
#endif /* L_sdivsi3_i4i */
Commit	Line	Data
7adcbafe	1	/* Copyright (C) 2006-2022 Free Software Foundation, Inc.
3f1d3526 R	2
	3	This file is free software; you can redistribute it and/or modify it
	4	under the terms of the GNU General Public License as published by the
748086b7	5	Free Software Foundation; either version 3, or (at your option) any
3f1d3526 R	6	later version.
3f1d3526 R	7
3f1d3526 R	8	This file is distributed in the hope that it will be useful, but
	9	WITHOUT ANY WARRANTY; without even the implied warranty of
	10	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	11	General Public License for more details.
	12
748086b7 JJ	13	Under Section 7 of GPL version 3, you are granted additional
	14	permissions described in the GCC Runtime Library Exception, version
	15	3.1, as published by the Free Software Foundation.
	16
	17	You should have received a copy of the GNU General Public License and
	18	a copy of the GCC Runtime Library Exception along with this program;
	19	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	20	<http://www.gnu.org/licenses/>. */
3f1d3526 R	21
	22	/* Moderately Space-optimized libgcc routines for the Renesas SH /
	23	STMicroelectronics ST40 CPUs.
	24	Contributed by J"orn Rennecke joern.rennecke@st.com. */
	25
	26	#include "lib1funcs.h"
	27
	28	#ifdef L_udivsi3_i4i
	29
	30	/* 88 bytes; sh4-200 cycle counts:
	31	divisor >= 2G: 11 cycles
	32	dividend < 2G: 48 cycles
	33	dividend >= 2G: divisor != 1: 54 cycles
	34	dividend >= 2G, divisor == 1: 22 cycles */
	35	#if defined (__SH_FPU_DOUBLE__) \|\| defined (__SH4_SINGLE_ONLY__)
	36	!! args in r4 and r5, result in r0, clobber r1
	37
	38	.global GLOBAL(udivsi3_i4i)
	39	FUNC(GLOBAL(udivsi3_i4i))
	40	GLOBAL(udivsi3_i4i):
	41	mova L1,r0
	42	cmp/pz r5
	43	sts fpscr,r1
	44	lds.l @r0+,fpscr
	45	sts.l fpul,@-r15
	46	bf LOCAL(huge_divisor)
	47	mov.l r1,@-r15
	48	lds r4,fpul
	49	cmp/pz r4
	50	#ifdef FMOVD_WORKS
	51	fmov.d dr0,@-r15
	52	float fpul,dr0
	53	fmov.d dr2,@-r15
	54	bt LOCAL(dividend_adjusted)
	55	mov #1,r1
	56	fmov.d @r0,dr2
	57	cmp/eq r1,r5
	58	bt LOCAL(div_by_1)
	59	fadd dr2,dr0
	60	LOCAL(dividend_adjusted):
	61	lds r5,fpul
	62	float fpul,dr2
	63	fdiv dr2,dr0
	64	LOCAL(div_by_1):
	65	fmov.d @r15+,dr2
	66	ftrc dr0,fpul
	67	fmov.d @r15+,dr0
	68	#else /* !FMOVD_WORKS */
	69	fmov.s DR01,@-r15
	70	mov #1,r1
	71	fmov.s DR00,@-r15
	72	float fpul,dr0
	73	fmov.s DR21,@-r15
	74	bt/s LOCAL(dividend_adjusted)
	75	fmov.s DR20,@-r15
	76	cmp/eq r1,r5
	77	bt LOCAL(div_by_1)
	78	fmov.s @r0+,DR20
	79	fmov.s @r0,DR21
	80	fadd dr2,dr0
	81	LOCAL(dividend_adjusted):
	82	lds r5,fpul
	83	float fpul,dr2
	84	fdiv dr2,dr0
85	LOCAL(div_by_1):
86	fmov.s @r15+,DR20
87	fmov.s @r15+,DR21
88	ftrc dr0,fpul
89	fmov.s @r15+,DR00
90	fmov.s @r15+,DR01
91	#endif /* !FMOVD_WORKS */
92	lds.l @r15+,fpscr
93	sts fpul,r0
94	rts
95	lds.l @r15+,fpul
96
97	#ifdef FMOVD_WORKS
98	.p2align 3 ! make double below 8 byte aligned.
99	#endif
100	LOCAL(huge_divisor):
101	lds r1,fpscr
102	add #4,r15
103	cmp/hs r5,r4
104	rts
105	movt r0
106
107	.p2align 2
108	L1:
109	#ifndef FMOVD_WORKS
110	.long 0x80000
111	#else
112	.long 0x180000
113	#endif
114	.double 4294967296
115
116	ENDFUNC(GLOBAL(udivsi3_i4i))
117	#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
118
119	#if 0
120	/* With 36 bytes, the following would probably be the most compact
121	implementation, but with 139 cycles on an sh4-200, it is extremely slow. */
122	GLOBAL(udivsi3_i4i):
123	mov.l r2,@-r15
124	mov #0,r1
125	div0u
126	mov r1,r2
127	mov.l r3,@-r15
128	mov r1,r3
129	sett
130	mov r4,r0
131	LOCAL(loop):
132	rotcr r2
133	;
134	bt/s LOCAL(end)
135	cmp/gt r2,r3
136	rotcl r0
137	bra LOCAL(loop)
138	div1 r5,r1
139	LOCAL(end):
140	rotcl r0
141	mov.l @r15+,r3
142	rts
143	mov.l @r15+,r2
144	#endif /* 0 */
145
146	/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
147	sh4-200 run times:
148	udiv small divisor: 55 cycles
149	udiv large divisor: 52 cycles
150	sdiv small divisor, positive result: 59 cycles
151	sdiv large divisor, positive result: 56 cycles
152	sdiv small divisor, negative result: 65 cycles (*)
153	sdiv large divisor, negative result: 62 cycles (*)
154	(*): r2 is restored in the rts delay slot and has a lingering latency
155	of two more cycles. */
156	.balign 4
157	.global GLOBAL(udivsi3_i4i)
158	FUNC(GLOBAL(udivsi3_i4i))
159	FUNC(GLOBAL(sdivsi3_i4i))
160	GLOBAL(udivsi3_i4i):
161	sts pr,r1
162	mov.l r4,@-r15
163	extu.w r5,r0
164	cmp/eq r5,r0
165	swap.w r4,r0
166	shlr16 r4
167	bf/s LOCAL(large_divisor)
168	div0u
169	mov.l r5,@-r15
170	shll16 r5
171	LOCAL(sdiv_small_divisor):
172	div1 r5,r4
173	bsr LOCAL(div6)
174	div1 r5,r4
175	div1 r5,r4
176	bsr LOCAL(div6)
177	div1 r5,r4
178	xtrct r4,r0
179	xtrct r0,r4
180	bsr LOCAL(div7)
181	swap.w r4,r4
182	div1 r5,r4
183	bsr LOCAL(div7)
184	div1 r5,r4
185	xtrct r4,r0
186	mov.l @r15+,r5
187	swap.w r0,r0
188	mov.l @r15+,r4
189	jmp @r1
190	rotcl r0
191	LOCAL(div7):
192	div1 r5,r4
193	LOCAL(div6):
194	div1 r5,r4; div1 r5,r4; div1 r5,r4
195	div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
196
197	LOCAL(divx3):
198	rotcl r0
199	div1 r5,r4
200	rotcl r0
201	div1 r5,r4
202	rotcl r0
203	rts
204	div1 r5,r4
205
206	LOCAL(large_divisor):
207	mov.l r5,@-r15
208	LOCAL(sdiv_large_divisor):
209	xor r4,r0
210	.rept 4
211	rotcl r0
212	bsr LOCAL(divx3)
213	div1 r5,r4
214	.endr
215	mov.l @r15+,r5
216	mov.l @r15+,r4
217	jmp @r1
218	rotcl r0
219	ENDFUNC(GLOBAL(udivsi3_i4i))
220
221	.global GLOBAL(sdivsi3_i4i)
222	GLOBAL(sdivsi3_i4i):
223	mov.l r4,@-r15
224	cmp/pz r5
225	mov.l r5,@-r15
226	bt/s LOCAL(pos_divisor)
227	cmp/pz r4
228	neg r5,r5
229	extu.w r5,r0
230	bt/s LOCAL(neg_result)
231	cmp/eq r5,r0
232	neg r4,r4
233	LOCAL(pos_result):
234	swap.w r4,r0
235	bra LOCAL(sdiv_check_divisor)
236	sts pr,r1
237	LOCAL(pos_divisor):
238	extu.w r5,r0
239	bt/s LOCAL(pos_result)
240	cmp/eq r5,r0
241	neg r4,r4
242	LOCAL(neg_result):
243	mova LOCAL(negate_result),r0
244	;
245	mov r0,r1
246	swap.w r4,r0
247	lds r2,macl
248	sts pr,r2
249	LOCAL(sdiv_check_divisor):
250	shlr16 r4
251	bf/s LOCAL(sdiv_large_divisor)
252	div0u
253	bra LOCAL(sdiv_small_divisor)
254	shll16 r5
255	.balign 4
256	LOCAL(negate_result):
257	neg r0,r0
258	jmp @r2
259	sts macl,r2
260	ENDFUNC(GLOBAL(sdivsi3_i4i))
261	#endif /* !__SH_FPU_DOUBLE__ */
262	#endif /* L_udivsi3_i4i */
263
264	#ifdef L_sdivsi3_i4i
265	#if defined (__SH_FPU_DOUBLE__) \|\| defined (__SH4_SINGLE_ONLY__)
266	/* 48 bytes, 45 cycles on sh4-200 */
267	!! args in r4 and r5, result in r0, clobber r1
268
269	.global GLOBAL(sdivsi3_i4i)
270	FUNC(GLOBAL(sdivsi3_i4i))
271	GLOBAL(sdivsi3_i4i):
272	sts.l fpscr,@-r15
273	sts fpul,r1
274	mova L1,r0
275	lds.l @r0+,fpscr
276	lds r4,fpul
277	#ifdef FMOVD_WORKS
278	fmov.d dr0,@-r15
279	float fpul,dr0
280	lds r5,fpul
281	fmov.d dr2,@-r15
282	#else
283	fmov.s DR01,@-r15
284	fmov.s DR00,@-r15
285	float fpul,dr0
286	lds r5,fpul
287	fmov.s DR21,@-r15
288	fmov.s DR20,@-r15
289	#endif
290	float fpul,dr2
291	fdiv dr2,dr0
292	#ifdef FMOVD_WORKS
293	fmov.d @r15+,dr2
294	#else
295	fmov.s @r15+,DR20
296	fmov.s @r15+,DR21
297	#endif
298	ftrc dr0,fpul
299	#ifdef FMOVD_WORKS
300	fmov.d @r15+,dr0
301	#else
302	fmov.s @r15+,DR00
303	fmov.s @r15+,DR01
304	#endif
305	lds.l @r15+,fpscr
306	sts fpul,r0
307	rts
308	lds r1,fpul
309
310	.p2align 2
311	L1:
312	#ifndef FMOVD_WORKS
313	.long 0x80000
314	#else
315	.long 0x180000
316	#endif
317
318	ENDFUNC(GLOBAL(sdivsi3_i4i))
319	#endif /* __SH_FPU_DOUBLE__ */
320	#endif /* L_sdivsi3_i4i */