/* Assembly functions for the Xtensa version of libgcc1.
   Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009
   Free Software Foundation, Inc.
   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
#include "xtensa-config.h"

/* Define macros for the ABS and ADDX* instructions to handle cases
   where they are not included in the Xtensa processor configuration.  */
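
/* Illustrative note (an assumption about the elided macro bodies, kept as a
   comment only): when the processor configuration lacks ADDX*, a use such as
   "do_addx2 dst, as, at, tmp" can be open-coded roughly as

	slli	tmp, as, 1
	add	dst, tmp, at

   and likewise with shift amounts 2 and 3 for do_addx4 and do_addx8; do_abs
   similarly falls back to a short neg/movgez sequence when ABS is absent.  */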
	.macro	do_abs dst, src, tmp
	movgez	\tmp, \src, \src

	.macro	do_addx2 dst, as, at, tmp

	.macro	do_addx4 dst, as, at, tmp

	.macro	do_addx8 dst, as, at, tmp

/* Define macros for leaf function entry and return, supporting either the
   standard register windowed ABI or the non-windowed call0 ABI.  These
   macros do not allocate any extra stack space, so they only work for
   leaf functions that do not need to spill anything to the stack.  */
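
/* Illustrative sketch (an assumption about the elided macro bodies): with
   the windowed ABI, leaf_entry is expected to expand to an ENTRY instruction
   and leaf_return to RETW; with the call0 ABI there is no window rotation,
   so leaf_entry emits nothing here and leaf_return is a plain RET:

	leaf_entry	sp, 16
	...
	leaf_return
*/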
	.macro	leaf_entry reg, size
#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__

	.type	__mulsi3, @function

#elif XCHAL_HAVE_MUL16
#elif XCHAL_HAVE_MAC16
#else /* !MUL32 && !MUL16 && !MAC16 */
	/* Multiply one bit at a time, but unroll the loop 4x to better
	   exploit the addx instructions and avoid overhead.
	   Peel the first iteration to save a cycle on init.  */
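
	/* Illustrative C equivalent of the shift-and-add multiply used
	   here (a comment-only sketch, not part of the build):

	     unsigned r = 0;
	     while (b != 0)
	       {
	         if (b & 1)
	           r += a;
	         a <<= 1;
	         b >>= 1;
	       }

	   The assembly below consumes four bits of the smaller operand per
	   iteration by using the addx2/addx4/addx8 forms.  */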
	/* Avoid negative numbers.  */
	xor	a5, a2, a3	/* Top bit is 1 if one input is negative.  */

	/* Swap so the second argument is smaller.  */
	movgez	a4, a2, a7	/* a4 = max (a2, a3) */
	movltz	a3, a2, a7	/* a3 = min (a2, a3) */

	do_addx2 a7, a4, a2, a7
	do_addx4 a7, a4, a2, a7
	do_addx8 a7, a4, a2, a7
	bgeui	a3, 16, .Lmult_main_loop

	do_addx2 a7, a4, a2, a7
	do_addx4 a7, a4, a2, a7
	do_addx8 a7, a4, a2, a7
	bgeui	a3, 16, .Lmult_main_loop

#endif /* !MUL32 && !MUL16 && !MAC16 */

	.size	__mulsi3, . - __mulsi3

#endif /* L_mulsi3 */

#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 1

	.type	__umulsidi3, @function

#if __XTENSA_CALL0_ABI__
	/* This is not really a leaf function; allocate enough stack space
	   to allow CALL12s to a helper function.  */

#endif /* __XTENSA_EB__ */
	/* This code is taken from the mulsf3 routine in ieee754-sf.S.
	   See more comments there.  */
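
	/* Background note (added comment; the factorization is standard):
	   splitting each 32-bit input into 16-bit halves, x = xh*2^16 + xl
	   and y = yh*2^16 + yl, the 64-bit product is

	     x*y = (xh*yh << 32) + ((xh*yl + xl*yh) << 16) + xl*yl

	   The do_mul invocations below compute these four 16x16 partial
	   products (pp0..pp3) and sum them, propagating the carry from the
	   middle terms into the high word.  */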
#if XCHAL_HAVE_MUL32_HIGH
#else /* ! MUL32_HIGH */

#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
	/* a0 and a8 will be clobbered by calling the multiply function
	   but a8 is not used here and need not be saved.  */

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
	/* Get the high halves of the inputs into registers.  */

#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
	/* Clear the high halves of the inputs.  This does not matter
	   for MUL16 because the high bits are ignored.  */

#endif /* MUL16 || MUL32 */

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mul16u	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MUL32

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mull	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MAC16

/* The preprocessor insists on inserting a space when concatenating after
   a period in the definition of do_mul below.  These macros are a workaround
   using underscores instead of periods when doing the concatenation.  */
#define umul_aa_ll umul.aa.ll
#define umul_aa_lh umul.aa.lh
#define umul_aa_hl umul.aa.hl
#define umul_aa_hh umul.aa.hh

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \

#else /* no multiply hardware */

#define set_arg_l(dst, src) \
	extui	dst, src, 0, 16
#define set_arg_h(dst, src) \

#if __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a13, xreg); \
	set_arg_ ## yhalf (a14, yreg); \
	call0	.Lmul_mulsi3; \

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a14, xreg); \
	set_arg_ ## yhalf (a15, yreg); \
	call12	.Lmul_mulsi3; \

#endif /* __XTENSA_CALL0_ABI__ */

#endif /* no multiply hardware */

	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
	do_mul(a6, a2, l, a3, h)	/* pp 1 */
	do_mul(a11, a2, h, a3, l)	/* pp 2 */

	/* Shift the high half of a9/a6 into position in a9.  Note that
	   this value can be safely incremented without any carry-outs.  */

	/* Compute the low word into a6.  */
	do_mul(a11, a2, l, a3, l)	/* pp 0 */

	/* Compute the high word into wh.  */
	do_mul(wh, a2, h, a3, h)	/* pp 3 */

#endif /* !MUL32_HIGH */

#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
	/* Restore the original return address.  */
#if __XTENSA_CALL0_ABI__
	/* For Xtensa processors with no multiply hardware, this simplified
	   version of _mulsi3 is used for multiplying 16-bit chunks of
	   the floating-point mantissas.  When using CALL0, this function
	   uses a custom ABI: the inputs are passed in a13 and a14, the
	   result is returned in a12, and a8 and a15 are clobbered.  */
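
	/* Illustrative C equivalent of one unrolled group of the
	   mul_mulsi3_body macro below (a comment-only sketch):

	     if (src1 & 1) dst += src2;
	     if (src1 & 2) dst += src2 << 1;
	     if (src1 & 4) dst += src2 << 2;
	     if (src1 & 8) dst += src2 << 3;

	   after which src1 and src2 are stepped by four bit positions
	   (that loop control is not shown in this excerpt).  */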
	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
1:	add	\tmp1, \src2, \dst
	extui	\tmp2, \src1, 0, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx2 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 1, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx4 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 2, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx8 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 3, 1
	movnez	\dst, \tmp1, \tmp2

#if __XTENSA_CALL0_ABI__
	mul_mulsi3_body a12, a13, a14, a15, a8
	/* The result will be written into a2, so save that argument in a4.  */
	mul_mulsi3_body a2, a4, a3, a5, a6
#endif /* XCHAL_NO_MUL */

	.size	__umulsidi3, . - __umulsidi3

#endif /* L_umulsidi3 */
/* Define a macro for the NSAU (unsigned normalize shift amount)
   instruction, which computes the number of leading zero bits,
   to handle cases where it is not included in the Xtensa processor
   configuration.  */
	.macro	do_nsau cnt, val, tmp, a
	extui	\tmp, \a, 16, 16
	extui	\tmp, \a, 24, 8
	movi	\tmp, __nsau_data
#endif /* !XCHAL_HAVE_NSA */

	.type	__nsau_data, @object
	.byte	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
	.byte	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
#endif /* !XCHAL_HAVE_NSA */
	.size	__nsau_data, . - __nsau_data
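
/* Illustrative C equivalent of the table-based do_nsau path above
   (a comment-only sketch, not part of the build):

     unsigned nsau (unsigned x)
     {
       unsigned n = 0;
       if ((x >> 16) == 0) { n += 16; x <<= 16; }
       if ((x >> 24) == 0) { n += 8;  x <<= 8;  }
       return n + __nsau_data[x >> 24];
     }

   For x == 0 this gives 16 + 8 + __nsau_data[0] = 32, matching the
   definition of the NSAU instruction.  */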
	.type	__clzsi2, @function
	do_nsau	a2, a2, a3, a4
	.size	__clzsi2, . - __clzsi2

#endif /* L_clzsi2 */

	.type	__ctzsi2, @function
	do_nsau	a2, a3, a4, a5
	.size	__ctzsi2, . - __ctzsi2

#endif /* L_ctzsi2 */

	.type	__ffssi2, @function
	do_nsau	a2, a3, a4, a5
	.size	__ffssi2, . - __ffssi2

#endif /* L_ffssi2 */

	.type	__udivsi3, @function
	bltui	a3, 2, .Lle_one	/* check if the divisor <= 1 */

	mov	a6, a2		/* keep dividend in a6 */
	do_nsau	a5, a6, a2, a7	/* dividend_shift = nsau (dividend) */
	do_nsau	a4, a3, a2, a7	/* divisor_shift = nsau (divisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
	sll	a3, a3		/* divisor <<= count */
	movi	a2, 0		/* quotient = 0 */
	/* test-subtract-and-shift loop; one quotient bit on each iteration */
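
	/* Illustrative C equivalent of the loop below (a comment-only
	   sketch, not part of the build):

	     while (count-- > 0)
	       {
	         if (dividend >= divisor)
	           {
	             dividend -= divisor;
	             quotient += 1;
	           }
	         quotient <<= 1;
	         divisor >>= 1;
	       }
	     if (dividend >= divisor)
	       quotient += 1;

	   The final quotient bit is handled after the loop ends.  */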
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
	bltu	a6, a3, .Lzerobit
#if !XCHAL_HAVE_LOOPS
#endif /* !XCHAL_HAVE_LOOPS */

	bltu	a6, a3, .Lreturn
	addi	a2, a2, 1	/* increment quotient if dividend >= divisor */

	beqz	a3, .Lerror	/* if divisor == 1, return the dividend */

	/* return dividend >= divisor */
	bltu	a6, a3, .Lreturn0

	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */

#endif /* XCHAL_HAVE_DIV32 */
	.size	__udivsi3, . - __udivsi3

#endif /* L_udivsi3 */

	.type	__divsi3, @function
	xor	a7, a2, a3	/* sign = dividend ^ divisor */
	do_abs	a6, a2, a4	/* udividend = abs (dividend) */
	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
	do_nsau	a5, a6, a2, a8	/* udividend_shift = nsau (udividend) */
	do_nsau	a4, a3, a2, a8	/* udivisor_shift = nsau (udivisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
	sll	a3, a3		/* udivisor <<= count */
	movi	a2, 0		/* quotient = 0 */
	/* test-subtract-and-shift loop; one quotient bit on each iteration */
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
	bltu	a6, a3, .Lzerobit
#if !XCHAL_HAVE_LOOPS
#endif /* !XCHAL_HAVE_LOOPS */

	bltu	a6, a3, .Lreturn
	addi	a2, a2, 1	/* increment if udividend >= udivisor */

	movltz	a2, a5, a7	/* return (sign < 0) ? -quotient : quotient */

	neg	a2, a6		/* if udivisor == 1, then return... */
	movgez	a2, a6, a7	/* (sign < 0) ? -udividend : udividend */

	bltu	a6, a3, .Lreturn0	/* if dividend < divisor, return 0 */
	movltz	a2, a4, a7	/* else return (sign < 0) ? -1 : 1 */

	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */

#endif /* XCHAL_HAVE_DIV32 */
	.size	__divsi3, . - __divsi3

#endif /* L_divsi3 */

	.type	__umodsi3, @function
	bltui	a3, 2, .Lle_one	/* check if the divisor is <= 1 */

	do_nsau	a5, a2, a6, a7	/* dividend_shift = nsau (dividend) */
	do_nsau	a4, a3, a6, a7	/* divisor_shift = nsau (divisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
	sll	a3, a3		/* divisor <<= count */
	/* test-subtract-and-shift loop */
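
	/* Illustrative C equivalent of the remainder loop below
	   (a comment-only sketch, not part of the build):

	     while (count-- > 0)
	       {
	         if (dividend >= divisor)
	           dividend -= divisor;
	         divisor >>= 1;
	       }
	     if (dividend >= divisor)
	       dividend -= divisor;

	   The remainder is whatever is left in the dividend register.  */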
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
	bltu	a2, a3, .Lzerobit
#if !XCHAL_HAVE_LOOPS
#endif /* !XCHAL_HAVE_LOOPS */

	bltu	a2, a3, .Lreturn
	sub	a2, a2, a3	/* subtract once more if dividend >= divisor */

	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */

#endif /* XCHAL_HAVE_DIV32 */
	.size	__umodsi3, . - __umodsi3

#endif /* L_umodsi3 */

	.type	__modsi3, @function
	mov	a7, a2		/* save original (signed) dividend */
	do_abs	a2, a2, a4	/* udividend = abs (dividend) */
	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
	do_nsau	a5, a2, a6, a8	/* udividend_shift = nsau (udividend) */
	do_nsau	a4, a3, a6, a8	/* udivisor_shift = nsau (udivisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
	sll	a3, a3		/* udivisor <<= count */
	/* test-subtract-and-shift loop */
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
	bltu	a2, a3, .Lzerobit
#if !XCHAL_HAVE_LOOPS
#endif /* !XCHAL_HAVE_LOOPS */

	bltu	a2, a3, .Lreturn
	sub	a2, a2, a3	/* subtract again if udividend >= udivisor */

	neg	a2, a2		/* if (dividend < 0), return -udividend */

	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */

#endif /* XCHAL_HAVE_DIV32 */
	.size	__modsi3, . - __modsi3

#endif /* L_modsi3 */
#endif /* __XTENSA_EB__ */

	.type	__ashldi3, @function
	bgei	a4, 32, .Llow_only
	.size	__ashldi3, . - __ashldi3

#endif /* L_ashldi3 */

	.type	__ashrdi3, @function
	bgei	a4, 32, .Lhigh_only
	.size	__ashrdi3, . - __ashrdi3

#endif /* L_ashrdi3 */

	.type	__lshrdi3, @function
	bgei	a4, 32, .Lhigh_only1
	.size	__lshrdi3, . - __lshrdi3

#endif /* L_lshrdi3 */

#include "ieee754-df.S"
#include "ieee754-sf.S"