]>
Commit | Line | Data |
---|---|---|
1 | /* Copyright (C) 2006-2017 Free Software Foundation, Inc. | |
2 | ||
3 | This file is free software; you can redistribute it and/or modify it | |
4 | under the terms of the GNU General Public License as published by the | |
5 | Free Software Foundation; either version 3, or (at your option) any | |
6 | later version. | |
7 | ||
8 | This file is distributed in the hope that it will be useful, but | |
9 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | General Public License for more details. | |
12 | ||
13 | Under Section 7 of GPL version 3, you are granted additional | |
14 | permissions described in the GCC Runtime Library Exception, version | |
15 | 3.1, as published by the Free Software Foundation. | |
16 | ||
17 | You should have received a copy of the GNU General Public License and | |
18 | a copy of the GCC Runtime Library Exception along with this program; | |
19 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
20 | <http://www.gnu.org/licenses/>. */ | |
21 | ||
22 | /* Moderately Space-optimized libgcc routines for the Renesas SH / | |
23 | STMicroelectronics ST40 CPUs. | |
24 | Contributed by J"orn Rennecke joern.rennecke@st.com. */ | |
25 | ||
26 | #include "lib1funcs.h" | |
27 | ||
28 | #ifdef L_udivsi3_i4i | |
29 | ||
30 | /* 88 bytes; sh4-200 cycle counts: | |
31 | divisor >= 2G: 11 cycles | |
32 | dividend < 2G: 48 cycles | |
33 | dividend >= 2G, divisor != 1: 54 cycles | |
34 | dividend >= 2G, divisor == 1: 22 cycles */ | |
35 | #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__) | |
36 | !! args in r4 and r5, result in r0, clobber r1 | |
37 | ||
38 | .global GLOBAL(udivsi3_i4i) | |
39 | FUNC(GLOBAL(udivsi3_i4i)) | |
40 | GLOBAL(udivsi3_i4i): /* unsigned divide: r0 = r4 / r5, computed with a double-precision fdiv */ | |
41 | mova L1,r0 /* r0 = address of the constant block at L1 */ | |
42 | cmp/pz r5 /* T = 1 when the divisor is non-negative as a signed value, i.e. below 2G */ | |
43 | sts fpscr,r1 /* keep the caller's fpscr in r1 */ | |
44 | lds.l @r0+,fpscr /* load the fpscr word stored at L1; r0 now points at the double that follows it */ | |
45 | sts.l fpul,@-r15 /* push the caller's fpul */ | |
46 | bf LOCAL(huge_divisor) /* divisor >= 2G: handled without the FPU */ | |
47 | mov.l r1,@-r15 /* push the caller's fpscr */ | |
48 | lds r4,fpul | |
49 | cmp/pz r4 /* T = 1 when the dividend is below 2G; consumed by the bt or bt/s further down */ | |
50 | #ifdef FMOVD_WORKS | |
51 | fmov.d dr0,@-r15 /* save the caller's dr0 */ | |
52 | float fpul,dr0 /* dr0 = dividend, converted as a signed integer */ | |
53 | fmov.d dr2,@-r15 /* save the caller's dr2 */ | |
54 | bt LOCAL(dividend_adjusted) /* dividend < 2G: the signed conversion is already the right value */ | |
55 | mov #1,r1 | |
56 | fmov.d @r0,dr2 /* dr2 = 4294967296.0, the double stored after L1 */ | |
57 | cmp/eq r1,r5 | |
58 | bt LOCAL(div_by_1) /* divisor == 1: skip both the adjustment and the divide */ | |
59 | fadd dr2,dr0 /* add 2^32 so that dr0 holds the unsigned value of the dividend */ | |
60 | LOCAL(dividend_adjusted): | |
61 | lds r5,fpul | |
62 | float fpul,dr2 /* dr2 = divisor; it is below 2G on this path */ | |
63 | fdiv dr2,dr0 /* dr0 = dividend / divisor */ | |
64 | LOCAL(div_by_1): | |
65 | fmov.d @r15+,dr2 /* restore the caller's dr2 */ | |
66 | ftrc dr0,fpul /* fpul = dr0 truncated to an integer */ | |
67 | fmov.d @r15+,dr0 /* restore the caller's dr0 */ | |
68 | #else /* !FMOVD_WORKS */ | |
69 | fmov.s DR01,@-r15 /* save dr0 through two single moves; DR00 and DR01 are defined outside this chunk, presumably the two halves of dr0 */ | |
70 | mov #1,r1 | |
71 | fmov.s DR00,@-r15 | |
72 | float fpul,dr0 /* dr0 = dividend, converted as a signed integer */ | |
73 | fmov.s DR21,@-r15 /* save dr2 the same way */ | |
74 | bt/s LOCAL(dividend_adjusted) /* dividend < 2G: no adjustment needed */ | |
75 | fmov.s DR20,@-r15 /* delay slot: runs whether or not the branch is taken */ | |
76 | cmp/eq r1,r5 | |
77 | bt LOCAL(div_by_1) /* divisor == 1: skip both the adjustment and the divide */ | |
78 | fmov.s @r0+,DR20 /* load the double stored after L1 into dr2, one word at a time */ | |
79 | fmov.s @r0,DR21 | |
80 | fadd dr2,dr0 /* add 2^32 so that dr0 holds the unsigned value of the dividend */ | |
81 | LOCAL(dividend_adjusted): | |
82 | lds r5,fpul | |
83 | float fpul,dr2 /* dr2 = divisor; it is below 2G on this path */ | |
84 | fdiv dr2,dr0 /* dr0 = dividend / divisor */ | |
85 | LOCAL(div_by_1): | |
86 | fmov.s @r15+,DR20 /* pop dr2 in the reverse order of the pushes above */ | |
87 | fmov.s @r15+,DR21 | |
88 | ftrc dr0,fpul /* fpul = dr0 truncated to an integer */ | |
89 | fmov.s @r15+,DR00 /* pop dr0 */ | |
90 | fmov.s @r15+,DR01 | |
91 | #endif /* !FMOVD_WORKS */ | |
92 | lds.l @r15+,fpscr /* restore the caller's fpscr */ | |
93 | sts fpul,r0 /* r0 = quotient */ | |
94 | rts | |
95 | lds.l @r15+,fpul /* delay slot: restore the caller's fpul */ | |
96 | ||
97 | #ifdef FMOVD_WORKS | |
98 | .p2align 3 ! make double below 8 byte aligned. | |
99 | #endif | |
100 | LOCAL(huge_divisor): /* divisor >= 2G, so the quotient is either 0 or 1 */ | |
101 | lds r1,fpscr /* restore the caller's fpscr, still held in r1 */ | |
102 | add #4,r15 /* drop the pushed fpul; fpul was not modified on this path */ | |
103 | cmp/hs r5,r4 /* T = 1 when r4 >= r5, unsigned */ | |
104 | rts | |
105 | movt r0 /* delay slot: r0 = T */ | |
106 | ||
107 | .p2align 2 | |
108 | L1: /* constants: the fpscr word read at entry, followed by the double 2^32 */ | |
109 | #ifndef FMOVD_WORKS | |
110 | .long 0x80000 /* NOTE(review): presumably selects double-precision mode - confirm against the SH-4 manual */ | |
111 | #else | |
112 | .long 0x180000 /* NOTE(review): presumably also enables 64-bit fmov transfers - confirm against the SH-4 manual */ | |
113 | #endif | |
114 | .double 4294967296 /* 2^32 */ | |
115 | ||
116 | ENDFUNC(GLOBAL(udivsi3_i4i)) | |
117 | #elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */ | |
118 | ||
119 | #if 0 /* disabled: this variant is never assembled */ | |
120 | /* With 36 bytes, the following would probably be the most compact | |
121 | implementation, but with 139 cycles on an sh4-200, it is extremely slow. */ | |
122 | GLOBAL(udivsi3_i4i): | |
123 | mov.l r2,@-r15 /* save r2 */ | |
124 | mov #0,r1 | |
125 | div0u | |
126 | mov r1,r2 /* r2 = 0 */ | |
127 | mov.l r3,@-r15 /* save r3 */ | |
128 | mov r1,r3 /* r3 = 0 */ | |
129 | sett /* T = 1; the first rotcr below rotates it into r2 */ | |
130 | mov r4,r0 /* r0 = dividend */ | |
131 | LOCAL(loop): | |
132 | rotcr r2 /* rotate r2 right through T */ | |
133 | ; | |
134 | bt/s LOCAL(end) /* exit once the bit seeded by sett has been rotated back out into T */ | |
135 | cmp/gt r2,r3 /* delay slot: T = 1 when r3 > r2, signed */ | |
136 | rotcl r0 | |
137 | bra LOCAL(loop) | |
138 | div1 r5,r1 /* delay slot: one division step per iteration */ | |
139 | LOCAL(end): | |
140 | rotcl r0 | |
141 | mov.l @r15+,r3 /* restore r3 */ | |
142 | rts | |
143 | mov.l @r15+,r2 /* delay slot: restore r2 */ | |
144 | #endif /* 0 */ | |
145 | ||
146 | /* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i | |
147 | sh4-200 run times: | |
148 | udiv small divisor: 55 cycles | |
149 | udiv large divisor: 52 cycles | |
150 | sdiv small divisor, positive result: 59 cycles | |
151 | sdiv large divisor, positive result: 56 cycles | |
152 | sdiv small divisor, negative result: 65 cycles (*) | |
153 | sdiv large divisor, negative result: 62 cycles (*) | |
154 | (*): r2 is restored in the rts delay slot and has a lingering latency | |
155 | of two more cycles. */ | |
156 | .balign 4 | |
157 | .global GLOBAL(udivsi3_i4i) | |
158 | FUNC(GLOBAL(udivsi3_i4i)) | |
159 | FUNC(GLOBAL(sdivsi3_i4i)) | |
160 | GLOBAL(udivsi3_i4i): /* unsigned divide: r0 = r4 / r5, built from div1 steps */ | |
161 | sts pr,r1 /* r1 = return address; pr is overwritten by the bsr calls below */ | |
162 | mov.l r4,@-r15 /* save r4; it is popped again before returning */ | |
163 | extu.w r5,r0 | |
164 | cmp/eq r5,r0 /* T = 1 when the divisor fits in 16 bits */ | |
165 | swap.w r4,r0 /* r0 = dividend with its two 16-bit halves exchanged */ | |
166 | shlr16 r4 /* r4 = upper 16 bits of the dividend */ | |
167 | bf/s LOCAL(large_divisor) /* divisor needs more than 16 bits */ | |
168 | div0u /* delay slot: runs on both paths */ | |
169 | mov.l r5,@-r15 /* save r5; it is popped again before returning */ | |
170 | shll16 r5 /* move the 16-bit divisor into the upper half */ | |
171 | LOCAL(sdiv_small_divisor): /* sdivsi3_i4i branches here with r0, r1, r4 and r5 already prepared */ | |
172 | div1 r5,r4 | |
173 | bsr LOCAL(div6) | |
174 | div1 r5,r4 /* delay slot */ | |
175 | div1 r5,r4 | |
176 | bsr LOCAL(div6) | |
177 | div1 r5,r4 /* delay slot */ | |
178 | xtrct r4,r0 | |
179 | xtrct r0,r4 | |
180 | bsr LOCAL(div7) | |
181 | swap.w r4,r4 /* delay slot */ | |
182 | div1 r5,r4 | |
183 | bsr LOCAL(div7) | |
184 | div1 r5,r4 /* delay slot */ | |
185 | xtrct r4,r0 | |
186 | mov.l @r15+,r5 /* restore r5 */ | |
187 | swap.w r0,r0 | |
188 | mov.l @r15+,r4 /* restore r4 */ | |
189 | jmp @r1 /* leave through r1: the saved pr, or negate_result when sdivsi3_i4i set that up */ | |
190 | rotcl r0 /* delay slot */ | |
191 | LOCAL(div7): /* seven div1 steps: one here, then falls through into div6 */ | |
192 | div1 r5,r4 | |
193 | LOCAL(div6): /* six div1 steps, the last one in the rts delay slot */ | |
194 | div1 r5,r4; div1 r5,r4; div1 r5,r4 | |
195 | div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 | |
196 | ||
197 | LOCAL(divx3): /* three rotcl r0 + div1 pairs, the last div1 in the rts delay slot */ | |
198 | rotcl r0 | |
199 | div1 r5,r4 | |
200 | rotcl r0 | |
201 | div1 r5,r4 | |
202 | rotcl r0 | |
203 | rts | |
204 | div1 r5,r4 | |
205 | ||
206 | LOCAL(large_divisor): | |
207 | mov.l r5,@-r15 /* save r5; it is popped again before returning */ | |
208 | LOCAL(sdiv_large_divisor): /* sdivsi3_i4i branches here with r0, r1, r4 and r5 already prepared */ | |
209 | xor r4,r0 /* clear the lower half of r0: the low 16 dividend bits stay in its upper half */ | |
210 | .rept 4 /* each repetition performs four rotcl + div1 pairs, 16 in total */ | |
211 | rotcl r0 | |
212 | bsr LOCAL(divx3) | |
213 | div1 r5,r4 /* delay slot */ | |
214 | .endr | |
215 | mov.l @r15+,r5 /* restore r5 */ | |
216 | mov.l @r15+,r4 /* restore r4 */ | |
217 | jmp @r1 /* leave through r1: the saved pr, or negate_result when sdivsi3_i4i set that up */ | |
218 | rotcl r0 /* delay slot */ | |
219 | ENDFUNC(GLOBAL(udivsi3_i4i)) | |
220 | ||
221 | .global GLOBAL(sdivsi3_i4i) | |
222 | GLOBAL(sdivsi3_i4i): /* signed divide: r0 = r4 / r5; negates negative operands, then reuses the udivsi3_i4i division code */ | |
223 | mov.l r4,@-r15 /* save the original r4; popped by the shared division code */ | |
224 | cmp/pz r5 /* T = 1 when the divisor is >= 0 */ | |
225 | mov.l r5,@-r15 /* save the original r5; popped by the shared division code */ | |
226 | bt/s LOCAL(pos_divisor) | |
227 | cmp/pz r4 /* delay slot: T = 1 when the dividend is >= 0 */ | |
228 | neg r5,r5 /* divisor was negative: negate it */ | |
229 | extu.w r5,r0 | |
230 | bt/s LOCAL(neg_result) /* dividend >= 0 with a negative divisor */ | |
231 | cmp/eq r5,r0 /* delay slot: T = 1 when the negated divisor fits in 16 bits */ | |
232 | neg r4,r4 /* dividend was negative as well: negate it */ | |
233 | LOCAL(pos_result): | |
234 | swap.w r4,r0 /* r0 = dividend with its two 16-bit halves exchanged */ | |
235 | bra LOCAL(sdiv_check_divisor) | |
236 | sts pr,r1 /* delay slot: r1 = return address, so the final jmp @r1 goes straight back to the caller */ | |
237 | LOCAL(pos_divisor): | |
238 | extu.w r5,r0 | |
239 | bt/s LOCAL(pos_result) /* dividend >= 0 as well */ | |
240 | cmp/eq r5,r0 /* delay slot: T = 1 when the divisor fits in 16 bits */ | |
241 | neg r4,r4 /* dividend was negative: negate it */ | |
242 | LOCAL(neg_result): | |
243 | mova LOCAL(negate_result),r0 | |
244 | ; | |
245 | mov r0,r1 /* r1 = address of negate_result, so the final jmp @r1 lands there */ | |
246 | swap.w r4,r0 /* r0 = dividend with its two 16-bit halves exchanged */ | |
247 | lds r2,macl /* park the caller's r2 in macl */ | |
248 | sts pr,r2 /* r2 = return address, used by negate_result */ | |
249 | LOCAL(sdiv_check_divisor): | |
250 | shlr16 r4 /* r4 = upper 16 bits of the dividend */ | |
251 | bf/s LOCAL(sdiv_large_divisor) /* T is still the result of the cmp/eq above */ | |
252 | div0u /* delay slot: runs on both paths */ | |
253 | bra LOCAL(sdiv_small_divisor) | |
254 | shll16 r5 /* delay slot: move the 16-bit divisor into the upper half */ | |
255 | .balign 4 | |
256 | LOCAL(negate_result): /* reached through jmp @r1 when the quotient must be negative */ | |
257 | neg r0,r0 /* negate the quotient */ | |
258 | jmp @r2 /* return to the caller */ | |
259 | sts macl,r2 /* delay slot: restore the caller's r2 */ | |
260 | ENDFUNC(GLOBAL(sdivsi3_i4i)) | |
261 | #endif /* !__SH_FPU_DOUBLE__ */ | |
262 | #endif /* L_udivsi3_i4i */ | |
263 | ||
264 | #ifdef L_sdivsi3_i4i | |
265 | #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__) | |
266 | /* 48 bytes, 45 cycles on sh4-200 */ | |
267 | !! args in r4 and r5, result in r0, clobber r1 | |
268 | ||
269 | .global GLOBAL(sdivsi3_i4i) | |
270 | FUNC(GLOBAL(sdivsi3_i4i)) | |
271 | GLOBAL(sdivsi3_i4i): /* signed divide: r0 = r4 / r5, computed with a double-precision fdiv */ | |
272 | sts.l fpscr,@-r15 /* push the caller's fpscr */ | |
273 | sts fpul,r1 /* keep the caller's fpul in r1 */ | |
274 | mova L1,r0 /* r0 = address of the fpscr word at L1 */ | |
275 | lds.l @r0+,fpscr /* switch fpscr to the value stored at L1 */ | |
276 | lds r4,fpul | |
277 | #ifdef FMOVD_WORKS | |
278 | fmov.d dr0,@-r15 /* save the caller's dr0 */ | |
279 | float fpul,dr0 /* dr0 = dividend */ | |
280 | lds r5,fpul | |
281 | fmov.d dr2,@-r15 /* save the caller's dr2 */ | |
282 | #else | |
283 | fmov.s DR01,@-r15 /* save dr0 through two single moves; DR00 and DR01 are defined outside this chunk, presumably the two halves of dr0 */ | |
284 | fmov.s DR00,@-r15 | |
285 | float fpul,dr0 /* dr0 = dividend */ | |
286 | lds r5,fpul | |
287 | fmov.s DR21,@-r15 /* save dr2 the same way */ | |
288 | fmov.s DR20,@-r15 | |
289 | #endif | |
290 | float fpul,dr2 /* dr2 = divisor */ | |
291 | fdiv dr2,dr0 /* dr0 = dividend / divisor */ | |
292 | #ifdef FMOVD_WORKS | |
293 | fmov.d @r15+,dr2 /* restore the caller's dr2 */ | |
294 | #else | |
295 | fmov.s @r15+,DR20 /* pop dr2 in the reverse order of the pushes above */ | |
296 | fmov.s @r15+,DR21 | |
297 | #endif | |
298 | ftrc dr0,fpul /* fpul = dr0 truncated to an integer */ | |
299 | #ifdef FMOVD_WORKS | |
300 | fmov.d @r15+,dr0 /* restore the caller's dr0 */ | |
301 | #else | |
302 | fmov.s @r15+,DR00 /* pop dr0 */ | |
303 | fmov.s @r15+,DR01 | |
304 | #endif | |
305 | lds.l @r15+,fpscr /* restore the caller's fpscr */ | |
306 | sts fpul,r0 /* r0 = quotient */ | |
307 | rts | |
308 | lds r1,fpul /* delay slot: restore the caller's fpul from r1 */ | |
309 | ||
310 | .p2align 2 | |
311 | L1: /* fpscr word read at entry */ | |
312 | #ifndef FMOVD_WORKS | |
313 | .long 0x80000 /* NOTE(review): presumably selects double-precision mode - confirm against the SH-4 manual */ | |
314 | #else | |
315 | .long 0x180000 /* NOTE(review): presumably also enables 64-bit fmov transfers - confirm against the SH-4 manual */ | |
316 | #endif | |
317 | ||
318 | ENDFUNC(GLOBAL(sdivsi3_i4i)) | |
319 | #endif /* __SH_FPU_DOUBLE__ */ | |
320 | #endif /* L_sdivsi3_i4i */ |