]>
Commit | Line | Data |
---|---|---|
e55e4056 | 1 | /* -*- Mode: Asm -*- */ |
83ffe9cd | 2 | ;; Copyright (C) 2012-2023 Free Software Foundation, Inc. |
e55e4056 GJL |
3 | ;; Contributed by Sean D'Epagnier (sean@depagnier.com) |
4 | ;; Georg-Johann Lay (avr@gjlay.de) | |
5 | ||
6 | ;; This file is free software; you can redistribute it and/or modify it | |
7 | ;; under the terms of the GNU General Public License as published by the | |
8 | ;; Free Software Foundation; either version 3, or (at your option) any | |
9 | ;; later version. | |
10 | ||
11 | ;; In addition to the permissions in the GNU General Public License, the | |
12 | ;; Free Software Foundation gives you unlimited permission to link the | |
13 | ;; compiled version of this file into combinations with other programs, | |
14 | ;; and to distribute those combinations without any restriction coming | |
15 | ;; from the use of this file. (The General Public License restrictions | |
16 | ;; do apply in other respects; for example, they cover modification of | |
17 | ;; the file, and distribution when not linked into a combine | |
18 | ;; executable.) | |
19 | ||
20 | ;; This file is distributed in the hope that it will be useful, but | |
21 | ;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
23 | ;; General Public License for more details. | |
24 | ||
25 | ;; You should have received a copy of the GNU General Public License | |
26 | ;; along with this program; see the file COPYING. If not, write to | |
27 | ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor, | |
28 | ;; Boston, MA 02110-1301, USA. | |
29 | ||
30 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
31 | ;; Fixed point library routines for AVR | |
32 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
33 | ||
c1dd9790 JR |
34 | #if defined __AVR_TINY__ |
35 | #define __zero_reg__ r17 | |
36 | #define __tmp_reg__ r16 | |
37 | #else | |
38 | #define __zero_reg__ r1 | |
39 | #define __tmp_reg__ r0 | |
40 | #endif | |
41 | ||
e55e4056 GJL |
42 | .section .text.libgcc.fixed, "ax", @progbits |
43 | ||
c1dd9790 JR |
44 | #ifndef __AVR_TINY__ |
45 | ||
e55e4056 GJL |
46 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
47 | ;; Conversions to float | |
48 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
49 | ||
#if defined (L_fractqqsf)
;; float __fractqqsf (QQ x)
;; In:  R24 = s.7 fract.   Out: R25:R22 = float (avr-gcc ABI).
;; Widen the byte into the SA (s16.15) register layout, then tail-call
;; the SA -> SF conversion.
DEFUN __fractqqsf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Sign-extend
    lsl     r24                 ; C = sign bit of the QQ input
    sbc     r24, r24            ; r24 = 0x00 or 0xff
    mov     r25, r24
    XJMP    __fractsasf
ENDF  __fractqqsf
#endif  /* L_fractqqsf */

#if defined (L_fractuqqsf)
;; float __fractuqqsf (UQQ x)
;; In:  R24 = .8 fract.   Out: R25:R22 = float.
DEFUN __fractuqqsf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF  __fractuqqsf
#endif  /* L_fractuqqsf */

#if defined (L_fracthqsf)
;; float __fracthqsf (HQ x)
;; In:  R25:R24 = s.15 fract.   Out: R25:R22 = float.
DEFUN __fracthqsf
    ;; Move in place for SA -> SF conversion
    wmov    22, 24              ; r23:r22 = r25:r24
    ;; Sign-extend (the copy of the high byte in r25 is still intact)
    lsl     r25                 ; C = sign bit
    sbc     r24, r24            ; r24 = 0x00 or 0xff
    mov     r25, r24
    XJMP    __fractsasf
ENDF  __fracthqsf
#endif  /* L_fracthqsf */

#if defined (L_fractuhqsf)
;; float __fractuhqsf (UHQ x)
;; In:  R25:R24 = .16 fract.   Out: R25:R22 = float.
DEFUN __fractuhqsf
    ;; Move in place for USA -> SF conversion
    wmov    22, 24              ; r23:r22 = r25:r24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF  __fractuhqsf
#endif  /* L_fractuhqsf */

#if defined (L_fracthasf)
;; float __fracthasf (HA x)
;; In:  R25:R24 = s8.7 accum.   Out: R25:R22 = float.
DEFUN __fracthasf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Sign-extend from the (still intact) original high byte
    lsl     r25
    sbc     r25, r25
    XJMP    __fractsasf
ENDF  __fracthasf
#endif  /* L_fracthasf */

#if defined (L_fractuhasf)
;; float __fractuhasf (UHA x)
;; In:  R25:R24 = 8.8 accum.   Out: R25:R22 = float.
DEFUN __fractuhasf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Zero-extend
    clr     r25
    XJMP    __fractusasf
ENDF  __fractuhasf
#endif  /* L_fractuhasf */
122 | ||
123 | ||
#if defined (L_fractsqsf)
;; float __fractsqsf (SQ x)
;; Convert s.31 fract in R25:R22 to float: int32 -> float, then scale
;; by 2^-31 by adjusting the float's exponent field directly.
;; exp_lo / exp_hi are exponent-field helper macros defined elsewhere
;; in this library.
DEFUN __fractsqsf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^31 to move the
    ;; decimal point into place.  0.0 must be skipped: its high byte is 0
    ;; and decrementing its exponent field would corrupt it.
    tst     r25
    breq    0f
    subi    r24, exp_lo (31)
    sbci    r25, exp_hi (31)
0:  ret
ENDF  __fractsqsf
#endif  /* L_fractsqsf */

#if defined (L_fractusqsf)
;; float __fractusqsf (USQ x)
;; Convert .32 fract in R25:R22 to float: uint32 -> float, scale by 2^-32.
DEFUN __fractusqsf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^32 to move the
    ;; decimal point into place (skip 0.0, see __fractsqsf)
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (32)
    ret
ENDF  __fractusqsf
#endif  /* L_fractusqsf */

#if defined (L_fractsasf)
;; float __fractsasf (SA x)
;; Convert s16.15 accum in R25:R22 to float: int32 -> float, scale by 2^-15.
;; Also the tail target of the narrower signed conversions above.
DEFUN __fractsasf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^15 to move the
    ;; decimal point into place (skip 0.0, see __fractsqsf)
    tst     r25
    breq    0f
    subi    r24, exp_lo (15)
    sbci    r25, exp_hi (15)
0:  ret
ENDF  __fractsasf
#endif  /* L_fractsasf */

#if defined (L_fractusasf)
;; float __fractusasf (USA x)
;; Convert 16.16 accum in R25:R22 to float: uint32 -> float, scale by 2^-16.
;; Also the tail target of the narrower unsigned conversions above.
DEFUN __fractusasf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^16 to move the
    ;; decimal point into place (skip 0.0, see __fractsqsf)
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (16)
    ret
ENDF  __fractusasf
#endif  /* L_fractusasf */
171 | ||
172 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
173 | ;; Conversions from float | |
174 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
00892272 | 175 | |
e55e4056 GJL |
#if defined (L_fractsfqq)
;; QQ __fractsfqq (float x)
;; Scale the float up by adjusting its exponent field, convert to a
;; 32-bit integer, and pick the byte that holds the s.7 result.
DEFUN __fractsfqq
    ;; Multiply with 2^{24+7} to get a QQ result in r25
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XCALL   __fixsfsi
    mov     r24, r25
    ret
ENDF __fractsfqq
#endif  /* L_fractsfqq */

#if defined (L_fractsfuqq)
;; UQQ __fractsfuqq (float x)
DEFUN __fractsfuqq
    ;; Multiply with 2^{24+8} to get a UQQ result in r25
    subi    r25, exp_hi (-32)
    XCALL   __fixunssfsi
    mov     r24, r25
    ret
ENDF __fractsfuqq
#endif  /* L_fractsfuqq */

#if defined (L_fractsfha)
;; HA __fractsfha (float x)
DEFUN __fractsfha
    ;; Multiply with 2^{16+7} to get a HA result in r25:r24
    subi    r24, exp_lo (-23)
    sbci    r25, exp_hi (-23)
    XJMP    __fixsfsi
ENDF __fractsfha
#endif  /* L_fractsfha */

#if defined (L_fractsfuha)
;; UHA __fractsfuha (float x)
DEFUN __fractsfuha
    ;; Multiply with 2^24 to get a UHA result in r25:r24
    subi    r25, exp_hi (-24)
    XJMP    __fixunssfsi
ENDF __fractsfuha
#endif  /* L_fractsfuha */

#if defined (L_fractsfhq)
;; HQ __fractsfhq (float x)  -- also aliased as SQ conversion:
;; FALIAS emits the extra entry point (macro defined elsewhere in libgcc).
FALIAS __fractsfsq

DEFUN __fractsfhq
    ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
    ;; resp. with 2^31 to get a SQ result in r25:r22
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XJMP    __fixsfsi
ENDF __fractsfhq
#endif  /* L_fractsfhq */

#if defined (L_fractsfuhq)
;; UHQ __fractsfuhq (float x)  -- also aliased as USQ conversion.
FALIAS __fractsfusq

DEFUN __fractsfuhq
    ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
    ;; resp. with 2^32 to get a USQ result in r25:r22
    subi    r25, exp_hi (-32)
    XJMP    __fixunssfsi
ENDF __fractsfuhq
#endif  /* L_fractsfuhq */

#if defined (L_fractsfsa)
;; SA __fractsfsa (float x)
DEFUN __fractsfsa
    ;; Multiply with 2^15 to get a SA result in r25:r22
    subi    r24, exp_lo (-15)
    sbci    r25, exp_hi (-15)
    XJMP    __fixsfsi
ENDF __fractsfsa
#endif  /* L_fractsfsa */

#if defined (L_fractsfusa)
;; USA __fractsfusa (float x)
DEFUN __fractsfusa
    ;; Multiply with 2^16 to get a USA result in r25:r22
    subi    r25, exp_hi (-16)
    XJMP    __fixunssfsi
ENDF __fractsfusa
#endif  /* L_fractsfusa */
253 | ||
254 | ||
255 | ;; For multiplication the functions here are called directly from | |
256 | ;; avr-fixed.md instead of using the standard libcall mechanisms. | |
257 | ;; This can make better code because GCC knows exactly which | |
258 | ;; of the call-used registers (not all of them) are clobbered. */ | |
259 | ||
260 | /******************************************************* | |
261 | Fractional Multiplication 8 x 8 without MUL | |
262 | *******************************************************/ | |
263 | ||
#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
;;; R23 = R24 * R25        (s.7 fract multiply, no-MUL variant)
;;; Clobbers: __tmp_reg__, R22, R24, R25
;;; Rounding: ???
DEFUN __mulqq3
    XCALL   __fmuls
    ;; TR 18037 requires that  (-1) * (-1)  does not overflow
    ;; The only input that can produce -1 is (-1)^2.
    dec     r23                 ; only 0x80 -> 0x7f sets V here ...
    brvs    0f                  ; ... i.e. result was -1: keep 0x7f (saturate)
    inc     r23                 ; otherwise restore the original value
0:  ret
ENDF __mulqq3
#endif /* L_mulqq3 && ! HAVE_MUL */

/*******************************************************
    Fractional Multiply  .16 x .16  with and without MUL
*******************************************************/

#if defined (L_mulhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
DEFUN   __mulhq3
    XCALL   __mulhisi3          ; 32-bit product, 30 fractional bits
    ;; Shift result into place (30 -> 31 fractional bits)
    lsl     r23
    rol     r24
    rol     r25
    brvs    1f                  ; shift flipped the sign: (-1)^2 case
    ;; Round on the topmost dropped bit
    sbrc    r23, 7
    adiw    r24, 1
    ret
1:  ;; Overflow.  TR 18037 requires (-1)^2 not to overflow
    ldi     r24, lo8 (0x7fff)
    ldi     r25, hi8 (0x7fff)
    ret
ENDF __mulhq3
#endif /* defined (L_mulhq3) */

#if defined (L_muluhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB < error <= 0.5 LSB
DEFUN   __muluhq3
    XCALL   __umulhisi3         ; 32-bit product in r25:r22
    ;; Round on the topmost dropped bit
    sbrc    r23, 7
    adiw    r24, 1
    ret
ENDF    __muluhq3
#endif  /* L_muluhq3 */
323 | ||
324 | ||
325 | /******************************************************* | |
326 | Fixed Multiply 8.8 x 8.8 with and without MUL | |
327 | *******************************************************/ | |
328 | ||
#if defined (L_mulha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
DEFUN   __mulha3
    XCALL   __mulhisi3          ; product has 2*7 = 14 fractional bits
    ;; Left-shift once so the result lies on a byte boundary,
    ;; then share the move/round tail with the unsigned version.
    lsl     r22
    rol     r23
    rol     r24
    XJMP    __muluha3_round
ENDF    __mulha3
#endif  /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB < error <= 0.5 LSB
DEFUN   __muluha3
    XCALL   __umulhisi3         ; 8.8 x 8.8 -> 16.16 product in r25:r22
    XJMP    __muluha3_round
ENDF    __muluha3
#endif  /* L_muluha3 */

#if defined (L_muluha3_round)
;;; Common tail: take the middle 16 bits of the 32-bit product in
;;; r25:r22 and round on the bit just below the result's LSB.
DEFUN   __muluha3_round
    ;; Shift result into place
    mov     r25, r24
    mov     r24, r23
    ;; Round
    sbrc    r22, 7
    adiw    r24, 1
    ret
ENDF    __muluha3_round
#endif  /* L_muluha3_round */
369 | ||
370 | ||
371 | /******************************************************* | |
372 | Fixed Multiplication 16.16 x 16.16 | |
373 | *******************************************************/ | |
374 | ||
e13d9d5a GJL |
375 | ;; Bits outside the result (below LSB), used in the signed version |
376 | #define GUARD __tmp_reg__ | |
377 | ||
e55e4056 GJL |
378 | #if defined (__AVR_HAVE_MUL__) |
379 | ||
380 | ;; Multiplier | |
381 | #define A0 16 | |
382 | #define A1 A0+1 | |
383 | #define A2 A1+1 | |
384 | #define A3 A2+1 | |
385 | ||
386 | ;; Multiplicand | |
387 | #define B0 20 | |
388 | #define B1 B0+1 | |
389 | #define B2 B1+1 | |
390 | #define B3 B2+1 | |
391 | ||
392 | ;; Result | |
393 | #define C0 24 | |
394 | #define C1 C0+1 | |
395 | #define C2 C1+1 | |
396 | #define C3 C2+1 | |
397 | ||
#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Unsigned 16.16 multiply, entry point with rounding enabled (T = 1).
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0:  -1.0 LSB < error <= 0 LSB
;;; Rounding, T = 1:  -0.5 LSB < error <= 0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Don't ignore these LSBs in order to tame rounding error.
    ;; Use C2/C3 for these LSBs.

    clr C0
    clr C1
    mul A0, B0  $  movw C2, r0

    mul A1, B0  $  add C3, r0  $  adc C0, r1
    mul A0, B1  $  add C3, r0  $  adc C0, r1  $  rol C1

    ;; Round if T = 1.  Store guarding bits outside the result for rounding
    ;; and left-shift by the signed version (function below).
    brtc 0f
    sbrc C3, 7
    adiw C0, 1
0:  push C3

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul A0, B2  $  add C0, r0  $  adc C1, r1  $  sbc  C2, C2
    mul A1, B1  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    mul A2, B0  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    neg C2                      ; C2 = number of carries out of C1

    mul A0, B3  $  add C1, r0  $  adc C2, r1  $  sbc  C3, C3
    mul A1, B2  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul A2, B1  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul A3, B0  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    neg C3                      ; C3 = number of carries out of C2

    mul A1, B3  $  add C2, r0  $  adc C3, r1
    mul A2, B2  $  add C2, r0  $  adc C3, r1
    mul A3, B1  $  add C2, r0  $  adc C3, r1

    mul A2, B3  $  add C3, r0
    mul A3, B2  $  add C3, r0

    ;; Guard bits used in the signed version below.
    pop GUARD
    clr __zero_reg__            ; MUL clobbered r1; restore the ABI zero reg
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */

#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
;;; Signed s16.15 multiply: unsigned multiply without rounding (T = 0),
;;; then correct the high part a posteriori for negative operands and
;;; round after the 1-bit fractional adjustment.
DEFUN __mulsa3
    clt                         ; T = 0: no rounding inside the helper
    XCALL __mulusa3_round
    ;; A posteriori sign extension of the operands
    tst B3
    brpl 1f
    sub C2, A0                  ; B < 0:  high part -= A
    sbc C3, A1
1:  sbrs A3, 7
    rjmp 2f
    sub C2, B0                  ; A < 0:  high part -= B
    sbc C3, B1
2:
    ;; Shift 1 bit left to adjust for 15 fractional bits
    lsl GUARD
    rol C0
    rol C1
    rol C2
    rol C3
    ;; Round last digit
    lsl GUARD                   ; C = topmost guard bit
    adc C0, __zero_reg__
    adc C1, __zero_reg__
    adc C2, __zero_reg__
    adc C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */
488 | ||
489 | #undef A0 | |
490 | #undef A1 | |
491 | #undef A2 | |
492 | #undef A3 | |
493 | #undef B0 | |
494 | #undef B1 | |
495 | #undef B2 | |
496 | #undef B3 | |
497 | #undef C0 | |
498 | #undef C1 | |
499 | #undef C2 | |
500 | #undef C3 | |
501 | ||
502 | #else /* __AVR_HAVE_MUL__ */ | |
503 | ||
504 | #define A0 18 | |
505 | #define A1 A0+1 | |
506 | #define A2 A0+2 | |
507 | #define A3 A0+3 | |
508 | ||
509 | #define B0 22 | |
510 | #define B1 B0+1 | |
511 | #define B2 B0+2 | |
512 | #define B3 B0+3 | |
513 | ||
514 | #define C0 22 | |
515 | #define C1 C0+1 | |
516 | #define C2 C0+2 | |
517 | #define C3 C0+3 | |
518 | ||
519 | ;; __tmp_reg__ | |
520 | #define CC0 0 | |
521 | ;; __zero_reg__ | |
522 | #define CC1 1 | |
523 | #define CC2 16 | |
524 | #define CC3 17 | |
525 | ||
526 | #define AA0 26 | |
527 | #define AA1 AA0+1 | |
528 | #define AA2 30 | |
529 | #define AA3 AA2+1 | |
530 | ||
#if defined (L_mulsa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB <= error <= 1 LSB
;;; Signed s16.15 multiply on top of the unsigned shift-add routine:
;;; B0/B1/B3 overlap the result registers, so save them across the call.
DEFUN __mulsa3
    push    B0
    push    B1
    push    B3
    clt                         ; T = 0: helper must not round
    XCALL   __mulusa3_round
    pop     r30                 ; = saved B3
    ;; sign-extend B
    bst     r30, 7              ; T = sign of B
    brtc    1f
    ;; A1, A0 survived in R27:R26
    sub     C2, AA0             ; B < 0:  high part -= A
    sbc     C3, AA1
1:
    pop     AA1                 ;; B1
    pop     AA0                 ;; B0
    ;; sign-extend A.  A3 survived in R31
    bst     AA3, 7              ; T = sign of A
    brtc    2f
    sub     C2, AA0             ; A < 0:  high part -= B
    sbc     C3, AA1
2:
    ;; Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD               ; C = topmost guard bit
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif  /* L_mulsa3 */
573 | ||
#if defined (L_mulusa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB <= error <= 1 LSB
;;; Unsigned 16.16 shift-add multiply (no MUL instruction available).
DEFUN __mulusa3
    set                         ; T = 1: round the last digit
    ;; Fallthru
ENDF  __mulusa3

;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Round if T = 1
;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    push    CC2                 ; CC2/CC3 are call-saved r16/r17
    push    CC3
    ; clear result
    clr     __tmp_reg__
    wmov    CC2, CC0            ; CC3:CC2 = __zero_reg__:__tmp_reg__ = 0
    ; save multiplicand
    wmov    AA0, A0
    wmov    AA2, A2
    rjmp    3f

    ;; Loop the integral part

1:  ;; CC += A * 2^n;  n >= 0
    add CC0,A0  $  adc CC1,A1  $  adc CC2,A2  $  adc CC3,A3

2:  ;; A <<= 1
    lsl A0      $  rol A1      $  rol A2      $  rol A3

3:  ;; IBIT(B) >>= 1
    ;; Carry = n-th bit of B;  n >= 0
    lsr     B3
    ror     B2
    brcs    1b
    ;; Carry = 0 here: sbci subtracts 0 and only updates Z
    sbci    B3, 0
    brne    2b                  ; more integral bits left

    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
    wmov    A0, AA0
    wmov    A2, AA2
    rjmp    5f

4:  ;; CC += A:Guard * 2^n;  n < 0
    add B3,B2  $  adc CC0,A0  $  adc CC1,A1  $  adc CC2,A2  $  adc CC3,A3
5:
    ;; A:Guard >>= 1
    lsr A3  $  ror A2  $  ror A1  $  ror A0  $  ror B2

    ;; FBIT(B) <<= 1
    ;; Carry = n-th bit of B;  n < 0
    lsl     B0
    rol     B1
    brcs    4b
    sbci    B0, 0
    brne    5b                  ; more fractional bits left

    ;; Save guard bits and set carry for rounding
    push    B3
    lsl     B3                  ; C = topmost guard bit
    ;; Move result into place
    wmov    C2, CC2
    wmov    C0, CC0
    clr     __zero_reg__        ; was used as loop accumulator byte CC1
    brtc    6f
    ;; Round iff T = 1
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
6:
    pop     GUARD               ; guard bits for the signed version
    ;; Epilogue
    pop     CC3
    pop     CC2
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */
656 | ||
657 | #undef A0 | |
658 | #undef A1 | |
659 | #undef A2 | |
660 | #undef A3 | |
661 | #undef B0 | |
662 | #undef B1 | |
663 | #undef B2 | |
664 | #undef B3 | |
665 | #undef C0 | |
666 | #undef C1 | |
667 | #undef C2 | |
668 | #undef C3 | |
669 | #undef AA0 | |
670 | #undef AA1 | |
671 | #undef AA2 | |
672 | #undef AA3 | |
673 | #undef CC0 | |
674 | #undef CC1 | |
675 | #undef CC2 | |
676 | #undef CC3 | |
677 | ||
678 | #endif /* __AVR_HAVE_MUL__ */ | |
679 | ||
e13d9d5a GJL |
680 | #undef GUARD |
681 | ||
e68a4ef6 GJL |
682 | /*********************************************************** |
683 | Fixed unsigned saturated Multiplication 8.8 x 8.8 | |
684 | ***********************************************************/ | |
685 | ||
686 | #define C0 22 | |
687 | #define C1 C0+1 | |
688 | #define C2 C0+2 | |
689 | #define C3 C0+3 | |
690 | #define SS __tmp_reg__ | |
691 | ||
#if defined (L_usmuluha3)
;; UHA __usmuluha3 (UHA, UHA)  -- unsigned saturating 8.8 multiply.
DEFUN __usmuluha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __umulhisi3         ; C3:C0 = 32-bit product (16.16)
    tst     C3                  ; integral part >= 256: overflow
    brne    .Lmax
    ;; Round, target is in C1..C2
    lsl     C0                  ; C = guard bit below the result LSB
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brcs    .Lmax               ; rounding carried out of 16 bits
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Saturate to the UHA maximum 0xffff
    ldi     C2, 0xff
    ldi     C3, 0xff
    ret
ENDF  __usmuluha3
#endif /* L_usmuluha3 */
719 | ||
720 | /*********************************************************** | |
721 | Fixed signed saturated Multiplication s8.7 x s8.7 | |
722 | ***********************************************************/ | |
723 | ||
#if defined (L_ssmulha3)
;; HA __ssmulha3 (HA, HA)  -- signed saturating s8.7 multiply.
DEFUN __ssmulha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __mulhisi3          ; C3:C0 = 32-bit product, 14 fractional bits
    ;; Adjust decimal point (14 -> 15 fractional bits)
    lsl     C0
    rol     C1
    rol     C2
    brvs    .LsatC3.3           ; shift flipped the sign of C2
    ;; The 9 MSBs must be the same
    rol     C3                  ; C = sign bit
    sbc     SS, SS              ; SS = 0x00 / 0xff from the sign
    cp      C3, SS
    brne    .LsatSS             ; integral overflow: saturate by sign
    ;; Round
    lsl     C0                  ; C = guard bit below the result LSB
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brvs    .Lmax               ; 0x7fff + 1: signed overflow of C2
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Load 0x7fff
    clr     C3
.LsatC3.3:
    ;; C3 < 0  --> 0x8000
    ;; C3 >= 0 --> 0x7fff
    mov     SS, C3
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x8000
    ;; SS = 0   -->  0x7fff
    ldi     C3, 0x7f
    ldi     C2, 0xff
    sbrc    SS, 7
    adiw    C2, 1               ; 0x7fff + 1 = 0x8000
    ret
ENDF  __ssmulha3
#endif /*  L_ssmulha3 */
770 | ||
771 | #undef C0 | |
772 | #undef C1 | |
773 | #undef C2 | |
774 | #undef C3 | |
775 | #undef SS | |
776 | ||
777 | /*********************************************************** | |
778 | Fixed unsigned saturated Multiplication 16.16 x 16.16 | |
779 | ***********************************************************/ | |
780 | ||
781 | #define C0 18 | |
782 | #define C1 C0+1 | |
783 | #define C2 C0+2 | |
784 | #define C3 C0+3 | |
785 | #define C4 C0+4 | |
786 | #define C5 C0+5 | |
787 | #define C6 C0+6 | |
788 | #define C7 C0+7 | |
789 | #define SS __tmp_reg__ | |
790 | ||
#if defined (L_usmulusa3)
;; R22[4] = R22[4] *{usat} R18[4]   (unsigned saturating 16.16 multiply)
;; Ordinary ABI function
DEFUN __usmulusa3
    ;; Widening multiply: C7..C0 (R25..R18) = 64-bit product (32.32)
    XCALL   __umulsidi3
    or      C7, C6              ; integral part must fit in 16 bits
    brne    .Lmax
    ;; Round, target is in C2..C5
    lsl     C1                  ; C = guard bit below the result LSB
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brcs    .Lmax               ; rounding carried out of 32 bits
    ;; Move result into place (order matters: high pair first)
    wmov    C6, C4
    wmov    C4, C2
    ret
.Lmax:
    ;; Saturate to the USA maximum 0xffffffff
    ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C4, C6
    ret
ENDF  __usmulusa3
#endif /* L_usmulusa3 */
818 | ||
819 | /*********************************************************** | |
820 | Fixed signed saturated Multiplication s16.15 x s16.15 | |
821 | ***********************************************************/ | |
822 | ||
#if defined (L_ssmulsa3)
;; R22[4] = R22[4] *{ssat} R18[4]   (signed saturating s16.15 multiply)
;; Ordinary ABI function
DEFUN __ssmulsa3
    ;; Widening multiply: C7..C0 = 64-bit product, 30 fractional bits
    XCALL   __mulsidi3
    ;; Adjust decimal point (30 -> 31 fractional bits)
    lsl     C1
    rol     C2
    rol     C3
    rol     C4
    rol     C5
    brvs    .LsatC7.7           ; shift flipped the sign of C5
    ;; The 17 MSBs must be the same
    rol     C6                  ; shift the sign through the high pair
    rol     C7
    sbc     SS, SS              ; SS = 0x00 / 0xff from the sign
    cp      C6, SS
    cpc     C7, SS
    brne    .LsatSS             ; integral overflow: saturate by sign
    ;; Round
    lsl     C1                  ; C = guard bit below the result LSB
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brvs    .Lmax               ; 0x7fffffff + 1: signed overflow
    ;; Move result into place (order matters: high pair first)
    wmov    C6, C4
    wmov    C4, C2
    ret

.Lmax:
    ;; Load 0x7fffffff
    clr     C7
.LsatC7.7:
    ;; C7 < 0  --> 0x80000000
    ;; C7 >= 0 --> 0x7fffffff
    lsl     C7
    sbc     SS, SS              ; SS = 0xff iff the product was negative
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x80000000
    ;; SS = 0   -->  0x7fffffff
    com     SS
    mov     C4, SS
    mov     C5, C4
    wmov    C6, C4              ; also sets C7 = C5
    subi    C7, 0x80            ; 0xff -> 0x7f  resp.  0x00 -> 0x80
    ret
ENDF __ssmulsa3
#endif /* L_ssmulsa3 */
875 | ||
876 | #undef C0 | |
877 | #undef C1 | |
878 | #undef C2 | |
879 | #undef C3 | |
880 | #undef C4 | |
881 | #undef C5 | |
882 | #undef C6 | |
883 | #undef C7 | |
884 | #undef SS | |
885 | ||
e55e4056 GJL |
886 | /******************************************************* |
887 | Fractional Division 8 / 8 | |
888 | *******************************************************/ | |
889 | ||
890 | #define r_divd r25 /* dividend */ | |
891 | #define r_quo r24 /* quotient */ | |
892 | #define r_div r22 /* divisor */ | |
e13d9d5a | 893 | #define r_sign __tmp_reg__ |
e55e4056 GJL |
894 | |
#if defined (L_divqq3)
;; QQ __divqq3 (QQ num, QQ den)
;; r24 (r_quo) = r25 (r_divd) / r22 (r_div),  s.7 fract division.
;; Sign-magnitude: divide the absolute values, fix the sign afterwards.
DEFUN __divqq3
    mov     r_sign, r_divd
    eor     r_sign, r_div       ; bit 7 of r_sign = sign of the result
    sbrc    r_div, 7
    neg     r_div               ; divisor  := |divisor|
    sbrc    r_divd, 7
    neg     r_divd              ; dividend := |dividend|
    XCALL   __divqq_helper      ; unsigned .8 division
    lsr     r_quo               ; .8 -> s.7
    sbrc    r_sign, 7           ; negate result if needed
    neg     r_quo
    ret
ENDF __divqq3
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
;; UQQ __udivuqq3 (UQQ num, UQQ den)
;; r24 (r_quo) = r25 (r_divd) / r22 (r_div),  .8 fract division.
DEFUN __udivuqq3
    cp      r_divd, r_div
    brsh    0f
    XJMP    __divqq_helper
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
0:  ldi     r_quo, 0xff
    ret
ENDF __udivuqq3
#endif  /* L_udivuqq3 */

#if defined (L_divqq_helper)
;; Unsigned restoring division, 8 quotient bits.
;; Precondition: r_divd < r_div.  Uses __zero_reg__ as a one-hot loop
;; counter (restored to 0 when the loop terminates).
DEFUN __divqq_helper
    clr     r_quo               ; clear quotient
    inc     __zero_reg__        ; init loop counter, used per shift
__udivuqq3_loop:
    lsl     r_divd              ; shift dividend
    brcs    0f                  ; dividend overflow
    cp      r_divd,r_div        ; compare dividend & divisor
    brcc    0f                  ; dividend >= divisor
    rol     r_quo               ; shift quotient (with CARRY)
    rjmp    __udivuqq3_cont
0:
    sub     r_divd,r_div        ; restore dividend
    lsl     r_quo               ; shift quotient (without CARRY)
__udivuqq3_cont:
    lsl     __zero_reg__        ; shift loop-counter bit
    brne    __udivuqq3_loop
    com     r_quo               ; complement result
                                ; because C flag was complemented in loop
    ret
ENDF __divqq_helper
#endif  /* L_divqq_helper */
e55e4056 GJL |
945 | |
946 | #undef r_divd | |
947 | #undef r_quo | |
948 | #undef r_div | |
e13d9d5a | 949 | #undef r_sign |
e55e4056 GJL |
950 | |
951 | ||
952 | /******************************************************* | |
953 | Fractional Division 16 / 16 | |
954 | *******************************************************/ | |
955 | #define r_divdL 26 /* dividend Low */ | |
956 | #define r_divdH 27 /* dividend Hig */ | |
957 | #define r_quoL 24 /* quotient Low */ | |
958 | #define r_quoH 25 /* quotient High */ | |
959 | #define r_divL 22 /* divisor */ | |
960 | #define r_divH 23 /* divisor */ | |
961 | #define r_cnt 21 | |
962 | ||
#if defined (L_divhq3)
;; HQ __divhq3 (HQ num, HQ den)
;; (r25:r24) = (r27:r26) / (r23:r22),  s.15 fract division.
;; Sign-magnitude: divide the absolute values, fix the sign afterwards.
;; Clobbers: r0 (holds the result's sign byte across the division).
DEFUN __divhq3
    mov     r0, r_divdH
    eor     r0, r_divH          ; bit 7 of r0 = sign of the result
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL              ; divisor  := |divisor|
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL             ; dividend := |dividend|
2:
    cp      r_divdL, r_divL
    cpc     r_divdH, r_divH
    breq    __divhq3_minus1     ; if equal return -1
    XCALL   __udivuhq3          ; unsigned .16 division
    lsr     r_quoH              ; .16 -> s.15
    ror     r_quoL
    ;; Negate the result iff the operand signs differed.  Test the sign
    ;; byte saved in r0 -- the N flag left behind by ROR reflects a
    ;; quotient bit, not the sign, so a BRPL here would be wrong
    ;; (cf. the identical tail of __divha3).
    sbrs    r0, 7               ; negate result if needed
    ret
    NEG2    r_quoL
    ret
__divhq3_minus1:
    ldi     r_quoH, 0x80
    clr     r_quoL
    ret
ENDF __divhq3
#endif  /* defined (L_divhq3) */
992 | ||
#if defined (L_udivuhq3)
;; UHQ __udivuhq3 (UHQ num, UHQ den)
;; (r25:r24) = (r27:r26) / (r23:r22),  .16 fract division.
DEFUN __udivuhq3
    sub     r_quoH,r_quoH       ; clear quotient and carry
    ;; FALLTHRU
ENDF __udivuhq3

;; Common restoring-division loop, 16 quotient bits.  Entered with the
;; first dividend bit in CARRY; also the tail of __udivuha3 after it has
;; rearranged the 8.8 operands into this layout.
DEFUN __udivuha3_common
    clr     r_quoL              ; clear quotient
    ldi     r_cnt,16            ; init loop counter
__udivuhq3_loop:
    rol     r_divdL             ; shift dividend (with CARRY)
    rol     r_divdH
    brcs    __udivuhq3_ep       ; dividend overflow
    cp      r_divdL,r_divL      ; compare dividend & divisor
    cpc     r_divdH,r_divH
    brcc    __udivuhq3_ep       ; dividend >= divisor
    rol     r_quoL              ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
__udivuhq3_ep:
    sub     r_divdL,r_divL      ; restore dividend
    sbc     r_divdH,r_divH
    lsl     r_quoL              ; shift quotient (without CARRY)
__udivuhq3_cont:
    rol     r_quoH              ; shift quotient
    dec     r_cnt               ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL              ; complement result
    com     r_quoH              ; because C flag was complemented in loop
    ret
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */
1024 | ||
1025 | /******************************************************* | |
1026 | Fixed Division 8.8 / 8.8 | |
1027 | *******************************************************/ | |
#if defined (L_divha3)
;; (r_quoH:r_quoL) = (r_divdH:r_divdL) / (r_divH:r_divL)
;; Signed 8.8 division: divide the magnitudes unsigned,
;; then apply the sign of the result.
DEFUN __divha3
    mov     r0, r_divdH         ; r0.7 = sign of result
    eor     r0, r_divH          ;      = sign(dividend) ^ sign(divisor)
    sbrs    r_divH, 7           ; divisor non-negative?  skip negation
    rjmp    1f
    NEG2    r_divL              ; divisor = |divisor|
1:
    sbrs    r_divdH, 7          ; dividend non-negative?  skip negation
    rjmp    2f
    NEG2    r_divdL             ; dividend = |dividend|
2:
    XCALL   __udivuha3          ; unsigned division of the magnitudes
    lsr     r_quoH              ; adjust to 7 fractional bits
    ror     r_quoL
    sbrs    r0, 7               ; negate result if needed
    ret
    NEG2    r_quoL
    ret
ENDF __divha3
#endif /* defined (L_divha3) */
1049 | ||
#if defined (L_udivuha3)
;; (r_quoH:r_quoL) = (r_divdH:r_divdL) / (r_divH:r_divL)
;; Unsigned 8.8 accum division: rearrange the dividend so that the
;; purely fractional core loop (__udivuha3_common) yields an
;; 8.8-aligned quotient.
DEFUN __udivuha3
    mov     r_quoH, r_divdL     ; stash dividend low byte in quotient high
    mov     r_divdL, r_divdH    ; dividend := its high (integral) byte
    clr     r_divdH
    lsl     r_quoH              ; shift quotient into carry
    XJMP    __udivuha3_common   ; same as fractional after rearrange
ENDF __udivuha3
#endif /* defined (L_udivuha3) */
1059 | ||
1060 | #undef r_divdL | |
1061 | #undef r_divdH | |
1062 | #undef r_quoL | |
1063 | #undef r_quoH | |
1064 | #undef r_divL | |
1065 | #undef r_divH | |
1066 | #undef r_cnt | |
1067 | ||
1068 | /******************************************************* | |
1069 | Fixed Division 16.16 / 16.16 | |
1070 | *******************************************************/ | |
1071 | ||
1072 | #define r_arg1L 24 /* arg1 gets passed already in place */ | |
1073 | #define r_arg1H 25 | |
1074 | #define r_arg1HL 26 | |
1075 | #define r_arg1HH 27 | |
1076 | #define r_divdL 26 /* dividend Low */ | |
1077 | #define r_divdH 27 | |
1078 | #define r_divdHL 30 | |
1079 | #define r_divdHH 31 /* dividend High */ | |
1080 | #define r_quoL 22 /* quotient Low */ | |
1081 | #define r_quoH 23 | |
1082 | #define r_quoHL 24 | |
1083 | #define r_quoHH 25 /* quotient High */ | |
1084 | #define r_divL 18 /* divisor Low */ | |
1085 | #define r_divH 19 | |
1086 | #define r_divHL 20 | |
1087 | #define r_divHH 21 /* divisor High */ | |
1088 | #define r_cnt __zero_reg__ /* loop count (0 after the loop!) */ | |
1089 | ||
#if defined (L_divsa3)
;; (r_quoHH..r_quoL) = (r_arg1HH..r_arg1L) / (r_divHH..r_divL)
;; Signed 16.16 division: divide the magnitudes unsigned,
;; then apply the sign of the result.
DEFUN __divsa3
    mov     r0, r_arg1HH        ; r0.7 = sign of result
    eor     r0, r_divHH         ;      = sign(dividend) ^ sign(divisor)
    sbrs    r_divHH, 7          ; divisor non-negative?  skip negation
    rjmp    1f
    NEG4    r_divL              ; divisor = |divisor|
1:
    sbrs    r_arg1HH, 7         ; dividend non-negative?  skip negation
    rjmp    2f
    NEG4    r_arg1L             ; dividend = |dividend|
2:
    XCALL   __udivusa3          ; unsigned division of the magnitudes
    lsr     r_quoHH             ; adjust to 15 fractional bits
    ror     r_quoHL
    ror     r_quoH
    ror     r_quoL
    sbrs    r0, 7               ; negate result if needed
    ret
    ;; negate r_quoL
    XJMP    __negsi2
ENDF __divsa3
#endif /* defined (L_divsa3) */
1113 | ||
#if defined (L_udivusa3)
;; (r_quoHH..r_quoL) = dividend / divisor
;; Unsigned 16.16 division: 32-step restoring shift-and-subtract loop.
;; Note: the dividend's low word (R25:R24) doubles as the quotient's
;; high word; it is shifted out as quotient bits shift in.
DEFUN __udivusa3
    ldi     r_divdHL, 32        ; init loop counter: 32 result bits
    mov     r_cnt, r_divdHL
    clr     r_divdHL            ; clear dividend high word
    clr     r_divdHH
    wmov    r_quoL, r_divdHL    ; clear quotient low word
    lsl     r_quoHL             ; shift quotient into carry
    rol     r_quoHH
__udivusa3_loop:
    rol     r_divdL             ; shift dividend (with CARRY)
    rol     r_divdH
    rol     r_divdHL
    rol     r_divdHH
    brcs    __udivusa3_ep       ; dividend overflow
    cp      r_divdL,r_divL      ; compare dividend & divisor
    cpc     r_divdH,r_divH
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep       ; dividend >= divisor
    rol     r_quoL              ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
__udivusa3_ep:
    sub     r_divdL,r_divL      ; restore dividend
    sbc     r_divdH,r_divH
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL              ; shift quotient (without CARRY)
__udivusa3_cont:
    rol     r_quoH              ; shift quotient
    rol     r_quoHL
    rol     r_quoHH
    dec     r_cnt               ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL              ; complement result
    com     r_quoH              ; because C flag was complemented in loop
    com     r_quoHL
    com     r_quoHH
    ret
ENDF __udivusa3
#endif /* defined (L_udivusa3) */
1155 | ||
1156 | #undef r_arg1L | |
1157 | #undef r_arg1H | |
1158 | #undef r_arg1HL | |
1159 | #undef r_arg1HH | |
1160 | #undef r_divdL | |
1161 | #undef r_divdH | |
1162 | #undef r_divdHL | |
1163 | #undef r_divdHH | |
1164 | #undef r_quoL | |
1165 | #undef r_quoH | |
1166 | #undef r_quoHL | |
1167 | #undef r_quoHH | |
1168 | #undef r_divL | |
1169 | #undef r_divH | |
1170 | #undef r_divHL | |
1171 | #undef r_divHH | |
1172 | #undef r_cnt | |
51526856 | 1173 | |
85d768f3 GJL |
1174 | \f |
1175 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1176 | ;; Saturation, 1 Byte | |
1177 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1178 | ||
1179 | ;; First Argument and Return Register | |
1180 | #define A0 24 | |
1181 | ||
#if defined (L_ssabs_1)
;; A0 = saturated absolute value of the signed 8-bit value A0.
;; |0x80| overflows and is clamped to 0x7f.
DEFUN __ssabs_1
    sbrs    A0, 7               ; already non-negative: done
    ret
    neg     A0
    sbrc    A0,7                ; still negative => input was 0x80
    dec     A0                  ; saturate: 0x80 -> 0x7f
    ret
ENDF __ssabs_1
#endif /* L_ssabs_1 */
1192 | ||
1193 | #undef A0 | |
1194 | ||
1195 | ||
51526856 GJL |
1196 | \f |
1197 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1198 | ;; Saturation, 2 Bytes | |
1199 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1200 | ||
1201 | ;; First Argument and Return Register | |
1202 | #define A0 24 | |
1203 | #define A1 A0+1 | |
1204 | ||
#if defined (L_ssneg_2)
;; A1:A0 = saturated negation of the signed 16-bit value A1:A0.
;; Negating 0x8000 overflows (V set); clamp to 0x7fff.
DEFUN __ssneg_2
    NEG2    A0
    brvc    0f                  ; no overflow: done
    sbiw    A0, 1               ; saturate: 0x8000 -> 0x7fff
0:  ret
ENDF __ssneg_2
#endif /* L_ssneg_2 */
1213 | ||
#if defined (L_ssabs_2)
;; A1:A0 = saturated absolute value of the signed 16-bit value A1:A0.
DEFUN __ssabs_2
    sbrs    A1, 7               ; non-negative: done
    ret
    XJMP    __ssneg_2           ; negative: saturating negate
ENDF __ssabs_2
#endif /* L_ssabs_2 */
1221 | ||
1222 | #undef A0 | |
1223 | #undef A1 | |
1224 | ||
1225 | ||
1226 | \f | |
1227 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1228 | ;; Saturation, 4 Bytes | |
1229 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1230 | ||
1231 | ;; First Argument and Return Register | |
1232 | #define A0 22 | |
1233 | #define A1 A0+1 | |
1234 | #define A2 A0+2 | |
1235 | #define A3 A0+3 | |
1236 | ||
#if defined (L_ssneg_4)
;; A3..A0 = saturated negation of the signed 32-bit value A3..A0.
;; Negating 0x80000000 overflows (V set); clamp to 0x7fffffff.
DEFUN __ssneg_4
    XCALL   __negsi2
    brvc    0f                  ; no overflow: done
    ldi     A3, 0x7f            ; saturate: 0x80000000 -> 0x7fffffff
    ldi     A2, 0xff
    ldi     A1, 0xff
    ldi     A0, 0xff
0:  ret
ENDF __ssneg_4
#endif /* L_ssneg_4 */
1248 | ||
#if defined (L_ssabs_4)
;; A3..A0 = saturated absolute value of the signed 32-bit value A3..A0.
DEFUN __ssabs_4
    sbrs    A3, 7               ; non-negative: done
    ret
    XJMP    __ssneg_4           ; negative: saturating negate
ENDF __ssabs_4
#endif /* L_ssabs_4 */
1256 | ||
1257 | #undef A0 | |
1258 | #undef A1 | |
1259 | #undef A2 | |
1260 | #undef A3 | |
1261 | ||
1262 | ||
1263 | \f | |
1264 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1265 | ;; Saturation, 8 Bytes | |
1266 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1267 | ||
1268 | ;; First Argument and Return Register | |
1269 | #define A0 18 | |
1270 | #define A1 A0+1 | |
1271 | #define A2 A0+2 | |
1272 | #define A3 A0+3 | |
1273 | #define A4 A0+4 | |
1274 | #define A5 A0+5 | |
1275 | #define A6 A0+6 | |
1276 | #define A7 A0+7 | |
1277 | ||
#if defined (L_clr_8)
FALIAS __usneguta2
FALIAS __usneguda2
FALIAS __usnegudq2

;; Clear Carry and all 8 result bytes A7...A0.
;; (Also serves as unsigned-saturated negate: result is always 0.)
DEFUN __clr_8
    ;; Clear Carry and set Z
    sub     A7, A7
    ;; FALLTHRU
ENDF __clr_8
;; Propagate Carry to all Bytes, Carry unaltered:
;; A[] = 0x00...00 if C was 0, 0xff...ff if C was 1.
DEFUN __sbc_8
    sbc     A7, A7              ; A7 = 0 - C
    sbc     A6, A6
    wmov    A4, A6              ; replicate into the remaining bytes
    wmov    A2, A6
    wmov    A0, A6
    ret
ENDF __sbc_8
#endif /* L_clr_8 */
1299 | ||
#if defined (L_ssneg_8)
FALIAS __ssnegta2
FALIAS __ssnegda2
FALIAS __ssnegdq2

;; A[] = saturated negation of the signed 64-bit value A7...A0.
;; Negating INT64_MIN overflows (V set); clamp to INT64_MAX.
DEFUN __ssneg_8
    XCALL   __negdi2
    brvc    0f                  ; no overflow: done
    ;; A[] = 0x7fffffffffffffff
    sec                         ; carry = 1
    XCALL   __sbc_8             ; A[] = 0xff...ff
    ldi     A7, 0x7f
0:  ret
ENDF __ssneg_8
#endif /* L_ssneg_8 */
1315 | ||
#if defined (L_ssabs_8)
FALIAS __ssabsta2
FALIAS __ssabsda2
FALIAS __ssabsdq2

;; A[] = saturated absolute value of the signed 64-bit value A7...A0.
DEFUN __ssabs_8
    sbrs    A7, 7               ; non-negative: done
    ret
    XJMP    __ssneg_8           ; negative: saturating negate
ENDF __ssabs_8
#endif /* L_ssabs_8 */
1327 | ||
1328 | ;; Second Argument | |
1329 | #define B0 10 | |
1330 | #define B1 B0+1 | |
1331 | #define B2 B0+2 | |
1332 | #define B3 B0+3 | |
1333 | #define B4 B0+4 | |
1334 | #define B5 B0+5 | |
1335 | #define B6 B0+6 | |
1336 | #define B7 B0+7 | |
1337 | ||
#if defined (L_usadd_8)
FALIAS __usadduta3
FALIAS __usadduda3
FALIAS __usaddudq3

;; A[] = unsigned saturated 64-bit addition A[] + B[].
;; On carry-out, clamp to 0xffffffffffffffff.
DEFUN __usadd_8
    XCALL   __adddi3
    brcs    0f                  ; carry => unsigned overflow
    ret
0:  ;; A[] = 0xffffffffffffffff  (carry is still set for __sbc_8)
    XJMP    __sbc_8
ENDF __usadd_8
#endif /* L_usadd_8 */
1351 | ||
#if defined (L_ussub_8)
FALIAS __ussubuta3
FALIAS __ussubuda3
FALIAS __ussubudq3

;; A[] = unsigned saturated 64-bit subtraction A[] - B[].
;; On borrow, clamp to 0.
DEFUN __ussub_8
    XCALL   __subdi3
    brcs    0f                  ; borrow => unsigned underflow
    ret
0:  ;; A[] = 0
    XJMP    __clr_8
ENDF __ussub_8
#endif /* L_ussub_8 */
1365 | ||
#if defined (L_ssadd_8)
FALIAS __ssaddta3
FALIAS __ssaddda3
FALIAS __ssadddq3

;; A[] = signed saturated 64-bit addition A[] + B[].
DEFUN __ssadd_8
    XCALL   __adddi3
    brvc    0f                  ; no signed overflow: done
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
    cpi     B7, 0x80            ; C = (B >= 0)
    XCALL   __sbc_8             ; A[] = C ? 0xff... : 0x00...
    subi    A7, 0x80            ; flip top bit: 0xff->0x7f, 0x00->0x80
0:  ret
ENDF __ssadd_8
#endif /* L_ssadd_8 */
1381 | ||
#if defined (L_sssub_8)
FALIAS __sssubta3
FALIAS __sssubda3
FALIAS __sssubdq3

;; A[] = signed saturated 64-bit subtraction A[] - B[].
DEFUN __sssub_8
    XCALL   __subdi3
    brvc    0f                  ; no signed overflow: done
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
    ldi     A7, 0x7f
    cp      A7, B7              ; C = (B < 0)
    XCALL   __sbc_8             ; A[] = C ? 0xff... : 0x00...
    subi    A7, 0x80            ; flip top bit: 0xff->0x7f, 0x00->0x80
0:  ret
ENDF __sssub_8
#endif /* L_sssub_8 */
1398 | ||
1399 | #undef A0 | |
1400 | #undef A1 | |
1401 | #undef A2 | |
1402 | #undef A3 | |
1403 | #undef A4 | |
1404 | #undef A5 | |
1405 | #undef A6 | |
1406 | #undef A7 | |
1407 | #undef B0 | |
1408 | #undef B1 | |
1409 | #undef B2 | |
1410 | #undef B3 | |
1411 | #undef B4 | |
1412 | #undef B5 | |
1413 | #undef B6 | |
1414 | #undef B7 | |
85d768f3 GJL |
1415 | |
1416 | \f | |
1417 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1418 | ;; Rounding Helpers | |
1419 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1420 | ||
#ifdef L_mask1

#define AA 24
#define CC 25

;; R25 = 1 << (R24 & 7)
;; CC = 1 << (AA & 7)
;; Branch-free power-of-two from the low 3 bits of AA.
;; Clobbers: None
DEFUN __mask1
    ;; CC = 2 ^ (2 * AA.1)
    ldi     CC, 1 << 2
    sbrs    AA, 1
    ldi     CC, 1 << 0
    ;; CC *= 2 ^ AA.0
    sbrc    AA, 0
    lsl     CC
    ;; CC *= 2 ^ (4 * AA.2)
    sbrc    AA, 2
    swap    CC                  ; swap nibbles = shift by 4 here
    ret
ENDF __mask1

#undef AA
#undef CC
#endif /* L_mask1 */
1446 | ||
1447 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1448 | ||
1449 | ;; The rounding point. Any bits smaller than | |
1450 | ;; 2^{-RP} will be cleared. | |
1451 | #define RP R24 | |
1452 | ||
1453 | #define A0 22 | |
1454 | #define A1 A0 + 1 | |
1455 | ||
1456 | #define C0 24 | |
1457 | #define C1 C0 + 1 | |
1458 | ||
1459 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1460 | ;; Rounding, 1 Byte | |
1461 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1462 | ||
#ifdef L_roundqq3

;; R24 = round (R22, R24)
;; Round the signed 8-bit fract A0 to RP fractional bits:
;; add 2^{-RP-1} (round to nearest), saturate on overflow,
;; then clear all bits below the rounding point.
;; Clobbers: R22, __tmp_reg__
DEFUN __roundqq3
    mov     __tmp_reg__, C1     ; preserve R25
    subi    RP, __QQ_FBIT__ - 1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1              ; C0 = 2^{-RP-1} = rounding addend
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brvc    0f
    ldi     C0, 0x7f            ; signed overflow: saturate to maximum
    rjmp    9f
0:  ;; Mask out bits beyond RP
    lsl     C0                  ; C0 = 2^{-RP}
    neg     C0                  ; C0 = -2^{-RP}: mask of bits to keep
    and     C0, A0
9:  mov     C1, __tmp_reg__     ; restore R25
    ret
ENDF __roundqq3
#endif /* L_roundqq3 */
1487 | ||
#ifdef L_rounduqq3

;; R24 = round (R22, R24)
;; Round the unsigned 8-bit fract A0 to RP fractional bits:
;; add 2^{-RP-1} (round to nearest), saturate on carry-out,
;; then clear all bits below the rounding point.
;; Clobbers: R22, __tmp_reg__
DEFUN __rounduqq3
    mov     __tmp_reg__, C1     ; preserve R25
    subi    RP, __UQQ_FBIT__ - 1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1              ; C0 = 2^{-RP-1} = rounding addend
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brcc    0f
    ldi     C0, 0xff            ; unsigned overflow: saturate to maximum
    rjmp    9f
0:  ;; Mask out bits beyond RP
    lsl     C0                  ; C0 = 2^{-RP}
    neg     C0                  ; C0 = -2^{-RP}: mask of bits to keep
    and     C0, A0
9:  mov     C1, __tmp_reg__     ; restore R25
    ret
ENDF __rounduqq3
#endif /* L_rounduqq3 */
1512 | ||
1513 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1514 | ;; Rounding, 2 Bytes | |
1515 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1516 | ||
#ifdef L_addmask_2

;; [ R25:R24 = 1 << (R24 & 15)
;;   R23:R22 += 1 << (R24 & 15) ]
;; SREG is set according to the addition
DEFUN __addmask_2
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 3          ; C = (RP.3 clear)
    sbc     C0, C0              ; C0 = RP.3 ? 0x00 : 0xff
    ;; Move the mask bit from C1 down into C0 if RP.3 is clear,
    ;; else keep it in C1 and clear C0.
    and     C0, C1
    eor     C1, C0
    ;; Finally, add the power-of-two: A[] += C[]
    add     A0, C0
    adc     A1, C1
    ret
ENDF __addmask_2
#endif /* L_addmask_2 */
1536 | ||
#ifdef L_round_s2

;; R25:R24 = round (R23:R22, R24)
;; Signed 16-bit rounding (HQ entry adjusts the rounding point
;; for the differing fractional-bit count, then falls through).
;; Clobbers: R23, R22
DEFUN __roundhq3
    subi    RP, __HQ_FBIT__ - __HA_FBIT__   ; translate HQ point to HA
ENDF __roundhq3
DEFUN __roundha3
    subi    RP, __HA_FBIT__ - 1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_s2_const    ; saturate / mask, signed flavor
ENDF __roundha3

#endif /* L_round_s2 */
1554 | ||
#ifdef L_round_u2

;; R25:R24 = round (R23:R22, R24)
;; Unsigned 16-bit rounding (UHQ entry adjusts the rounding point
;; for the differing fractional-bit count, then falls through).
;; Clobbers: R23, R22
DEFUN __rounduhq3
    subi    RP, __UHQ_FBIT__ - __UHA_FBIT__ ; translate UHQ point to UHA
ENDF __rounduhq3
DEFUN __rounduha3
    subi    RP, __UHA_FBIT__ - 1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_u2_const    ; saturate / mask, unsigned flavor
ENDF __rounduha3

#endif /* L_round_u2 */
1572 | ||
1573 | ||
#ifdef L_round_2_const

;; Helpers for 2 byte wide rounding.
;; Entered right after __addmask_2 with SREG reflecting the addition
;; and C[] = 2^{-RP-1} (the rounding addend).

DEFUN __round_s2_const
    brvc    2f                  ; no signed overflow: mask
    ldi     C1, 0x7f            ; saturate to signed maximum 0x7fff
    rjmp    1f
    ;; FALLTHRU (Barrier)
ENDF __round_s2_const

DEFUN __round_u2_const
    brcc    2f                  ; no unsigned overflow: mask
    ldi     C1, 0xff            ; saturate to unsigned maximum 0xffff
1:
    ldi     C0, 0xff
    rjmp    9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    ;; C[] = -2^{-RP}: mask of the bits to keep
    NEG2    C0
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
9:  ret
ENDF __round_u2_const

#endif /* L_round_2_const */
1606 | ||
1607 | #undef A0 | |
1608 | #undef A1 | |
1609 | #undef C0 | |
1610 | #undef C1 | |
1611 | ||
1612 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1613 | ;; Rounding, 4 Bytes | |
1614 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1615 | ||
1616 | #define A0 18 | |
1617 | #define A1 A0 + 1 | |
1618 | #define A2 A0 + 2 | |
1619 | #define A3 A0 + 3 | |
1620 | ||
1621 | #define C0 22 | |
1622 | #define C1 C0 + 1 | |
1623 | #define C2 C0 + 2 | |
1624 | #define C3 C0 + 3 | |
1625 | ||
#ifdef L_addmask_4

;; [ R25:R22 = 1 << (R24 & 31)
;;   R21:R18 += 1 << (R24 & 31) ]
;; SREG is set according to the addition
DEFUN __addmask_4
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 4          ; C = (RP.4 clear)
    sbc     C0, C0              ; C1:C0 = RP.4 ? 0x0000 : 0xffff
    sbc     C1, C1
    ;; Move C3 down into C2 if RP.3 is not set
    cpi     RP, 1 << 3
    sbc     C2, C2
    and     C2, C3
    eor     C3, C2
    ;; Move C3:C2 down into C1:C0 if RP.4 is not set
    and     C0, C2 $ eor C2, C0
    and     C1, C3 $ eor C3, C1
    ;; Finally, add the power-of-two: A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    ret
ENDF __addmask_4
#endif /* L_addmask_4 */
1653 | ||
#ifdef L_round_s4

;; R25:R22 = round (R21:R18, R24)
;; Signed 32-bit rounding (SQ entry adjusts the rounding point
;; for the differing fractional-bit count, then falls through).
;; Clobbers: R18...R21
DEFUN __roundsq3
    subi    RP, __SQ_FBIT__ - __SA_FBIT__   ; translate SQ point to SA
ENDF __roundsq3
DEFUN __roundsa3
    subi    RP, __SA_FBIT__ - 1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_s4_const    ; saturate / mask, signed flavor
ENDF __roundsa3

#endif /* L_round_s4 */
1671 | ||
#ifdef L_round_u4

;; R25:R22 = round (R21:R18, R24)
;; Unsigned 32-bit rounding (USQ entry adjusts the rounding point
;; for the differing fractional-bit count, then falls through).
;; Clobbers: R18...R21
DEFUN __roundusq3
    subi    RP, __USQ_FBIT__ - __USA_FBIT__ ; translate USQ point to USA
ENDF __roundusq3
DEFUN __roundusa3
    subi    RP, __USA_FBIT__ - 1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_u4_const    ; saturate / mask, unsigned flavor
ENDF __roundusa3

#endif /* L_round_u4 */
1689 | ||
1690 | ||
#ifdef L_round_4_const

;; Helpers for 4 byte wide rounding.
;; Entered right after __addmask_4 with SREG reflecting the addition
;; and C[] = 2^{-RP-1} (the rounding addend).

DEFUN __round_s4_const
    brvc    2f                  ; no signed overflow: mask
    ldi     C3, 0x7f            ; saturate to signed maximum 0x7fffffff
    rjmp    1f
    ;; FALLTHRU (Barrier)
ENDF __round_s4_const

DEFUN __round_u4_const
    brcc    2f                  ; no unsigned overflow: mask
    ldi     C3, 0xff            ; saturate to unsigned maximum 0xffffffff
1:
    ldi     C2, 0xff
    ldi     C1, 0xff
    ldi     C0, 0xff
    rjmp    9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    rol     C2
    rol     C3
    XCALL   __negsi2            ; C[] = -2^{-RP}: mask of bits to keep
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
9:  ret
ENDF __round_u4_const

#endif /* L_round_4_const */
1728 | ||
1729 | #undef A0 | |
1730 | #undef A1 | |
1731 | #undef A2 | |
1732 | #undef A3 | |
1733 | #undef C0 | |
1734 | #undef C1 | |
1735 | #undef C2 | |
1736 | #undef C3 | |
1737 | ||
1738 | #undef RP | |
1739 | ||
1740 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1741 | ;; Rounding, 8 Bytes | |
1742 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
1743 | ||
1744 | #define RP 16 | |
1745 | #define FBITm1 31 | |
1746 | ||
1747 | #define C0 18 | |
1748 | #define C1 C0 + 1 | |
1749 | #define C2 C0 + 2 | |
1750 | #define C3 C0 + 3 | |
1751 | #define C4 C0 + 4 | |
1752 | #define C5 C0 + 5 | |
1753 | #define C6 C0 + 6 | |
1754 | #define C7 C0 + 7 | |
1755 | ||
1756 | #define A0 16 | |
1757 | #define A1 17 | |
1758 | #define A2 26 | |
1759 | #define A3 27 | |
1760 | #define A4 28 | |
1761 | #define A5 29 | |
1762 | #define A6 30 | |
1763 | #define A7 31 | |
1764 | ||
1765 | ||
#ifdef L_rounddq3
;; R25:R18 = round (R25:R18, R16)
;; Signed DQ rounding: T = 0 selects signed saturation in __round_x8.
;; Clobbers: ABI
DEFUN __rounddq3
    ldi     FBITm1, __DQ_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __rounddq3
#endif /* L_rounddq3 */

#ifdef L_roundudq3
;; R25:R18 = round (R25:R18, R16)
;; Unsigned UDQ rounding: T = 1 selects unsigned saturation.
;; Clobbers: ABI
DEFUN __roundudq3
    ldi     FBITm1, __UDQ_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __roundudq3
#endif /* L_roundudq3 */

#ifdef L_roundda3
;; R25:R18 = round (R25:R18, R16)
;; Signed DA rounding: T = 0 selects signed saturation.
;; Clobbers: ABI
DEFUN __roundda3
    ldi     FBITm1, __DA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __roundda3
#endif /* L_roundda3 */

#ifdef L_rounduda3
;; R25:R18 = round (R25:R18, R16)
;; Unsigned UDA rounding: T = 1 selects unsigned saturation.
;; Clobbers: ABI
DEFUN __rounduda3
    ldi     FBITm1, __UDA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __rounduda3
#endif /* L_rounduda3 */

#ifdef L_roundta3
;; R25:R18 = round (R25:R18, R16)
;; Signed TA rounding: T = 0 selects signed saturation.
;; Clobbers: ABI
DEFUN __roundta3
    ldi     FBITm1, __TA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __roundta3
#endif /* L_roundta3 */

#ifdef L_rounduta3
;; R25:R18 = round (R25:R18, R16)
;; Unsigned UTA rounding: T = 1 selects unsigned saturation.
;; Clobbers: ABI
DEFUN __rounduta3
    ldi     FBITm1, __UTA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __rounduta3
#endif /* L_rounduta3 */
1825 | ||
1826 | ||
#ifdef L_round_x8
;; Common worker for the 8-byte rounding wrappers above.
;; In:  C[] = R25:R18 = 64-bit fixed-point value
;;      RP = R16 = rounding point, FBITm1 = R31 = FBIT - 1
;;      T = 0: signed saturation, T = 1: unsigned saturation
;; Out: C[] rounded to RP fractional bits, saturated on overflow.
;; Note the register aliasing: A0 is R16 = RP, so the input's byte 0
;; must be parked on the stack while R16 serves as shift count.
;; Clobbers: ABI
DEFUN __round_x8
    push    r16
    push    r17
    push    r28
    push    r29
    ;; Compute log2 of addend from rounding point
    sub     RP, FBITm1
    neg     RP                  ; RP = FBIT-1 - RP
    ;; Move input to work register A[]
    push    C0                  ; A0 (= R16) still holds RP: park byte 0
    mov     A1, C1
    wmov    A2, C2
    wmov    A4, C4
    wmov    A6, C6
    ;; C[] = 1 << (FBIT-1 - RP)
    XCALL   __clr_8             ; C[] = 0
    inc     C0                  ; C[] = 1
    XCALL   __ashldi3           ; C[] <<= R16
    pop     A0                  ; retrieve input byte 0
    ;; A[] += C[]               ; add rounding addend 2^{FBIT-1-RP}
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    adc     A4, C4
    adc     A5, C5
    adc     A6, C6
    adc     A7, C7
    brts    1f
    ;; Signed
    brvc    3f                  ; no signed overflow: mask
    ;; Signed overflow: A[] = 0x7f...
    brvs    2f
1:  ;; Unsigned
    brcc    3f                  ; no unsigned overflow: mask
    ;; Unsigned overflow: A[] = 0xff...
2:  ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C0, C6
    wmov    C2, C6
    wmov    C4, C6
    bld     C7, 7               ; T=0 (signed): clear sign bit -> 0x7f...
    rjmp    9f
3:
    ;; C[] = -C[] - C[]         ; = -2*C[]: mask of the bits to keep
    push    A0                  ; R16 is the shift count again
    ldi     r16, 1
    XCALL   __ashldi3
    pop     A0
    XCALL   __negdi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
    and     C4, A4
    and     C5, A5
    and     C6, A6
    and     C7, A7
9:  ;; Epilogue
    pop     r29
    pop     r28
    pop     r17
    pop     r16
    ret
ENDF __round_x8

#endif /* L_round_x8 */
1896 | ||
1897 | #undef A0 | |
1898 | #undef A1 | |
1899 | #undef A2 | |
1900 | #undef A3 | |
1901 | #undef A4 | |
1902 | #undef A5 | |
1903 | #undef A6 | |
1904 | #undef A7 | |
1905 | ||
1906 | #undef C0 | |
1907 | #undef C1 | |
1908 | #undef C2 | |
1909 | #undef C3 | |
1910 | #undef C4 | |
1911 | #undef C5 | |
1912 | #undef C6 | |
1913 | #undef C7 | |
1914 | ||
1915 | #undef RP | |
1916 | #undef FBITm1 | |
1917 | ||
1918 | ||
1919 | ;; Supply implementations / symbols for the bit-banging functions | |
1920 | ;; __builtin_avr_bitsfx and __builtin_avr_fxbits | |
#ifdef L_ret
;; Trivial function that just returns; provides a symbol the
;; bit-banging built-ins can reference (see comment above).
DEFUN __ret
    ret
ENDF __ret
#endif /* L_ret */
c1dd9790 JR |
1926 | |
1927 | #endif /* if not __AVR_TINY__ */ |