/* -*- Mode: Asm -*- */
;; Copyright (C) 2012-2013 Free Software Foundation, Inc.
;; Contributed by Sean D'Epagnier (sean@depagnier.com)
;;                Georg-Johann Lay (avr@gjlay.de)

;; This file is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by the
;; Free Software Foundation; either version 3, or (at your option) any
;; later version.

;; In addition to the permissions in the GNU General Public License, the
;; Free Software Foundation gives you unlimited permission to link the
;; compiled version of this file into combinations with other programs,
;; and to distribute those combinations without any restriction coming
;; from the use of this file.  (The General Public License restrictions
;; do apply in other respects; for example, they cover modification of
;; the file, and distribution when not linked into a combine
;; executable.)

;; This file is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fixed point library routines for AVR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.section .text.libgcc.fixed, "ax", @progbits

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions to float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractqqsf)
DEFUN __fractqqsf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Sign-extend
    lsl     r24
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fractqqsf
#endif  /* L_fractqqsf */

#if defined (L_fractuqqsf)
DEFUN __fractuqqsf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuqqsf
#endif  /* L_fractuqqsf */

#if defined (L_fracthqsf)
DEFUN __fracthqsf
    ;; Move in place for SA -> SF conversion
    wmov    22, 24
    ;; Sign-extend
    lsl     r25
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fracthqsf
#endif  /* L_fracthqsf */

#if defined (L_fractuhqsf)
DEFUN __fractuhqsf
    ;; Move in place for USA -> SF conversion
    wmov    22, 24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhqsf
#endif  /* L_fractuhqsf */

#if defined (L_fracthasf)
DEFUN __fracthasf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Sign-extend
    lsl     r25
    sbc     r25, r25
    XJMP    __fractsasf
ENDF __fracthasf
#endif  /* L_fracthasf */

#if defined (L_fractuhasf)
DEFUN __fractuhasf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Zero-extend
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhasf
#endif  /* L_fractuhasf */

#if defined (L_fractsqsf)
DEFUN __fractsqsf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^31 to move the
    ;; binary point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (31)
    sbci    r25, exp_hi (31)
0:  ret
ENDF __fractsqsf
#endif  /* L_fractsqsf */

#if defined (L_fractusqsf)
DEFUN __fractusqsf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^32 to move the
    ;; binary point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (32)
    ret
ENDF __fractusqsf
#endif  /* L_fractusqsf */

#if defined (L_fractsasf)
DEFUN __fractsasf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^15 to move the
    ;; binary point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (15)
    sbci    r25, exp_hi (15)
0:  ret
ENDF __fractsasf
#endif  /* L_fractsasf */

#if defined (L_fractusasf)
DEFUN __fractusasf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^16 to move the
    ;; binary point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (16)
    ret
ENDF __fractusasf
#endif  /* L_fractusasf */
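
;; At the C level these entry points implement the TR 18037 fixed-point
;; to float conversions.  A minimal usage sketch, assuming avr-gcc's
;; usual mode-to-type mapping (QQ = short _Fract, SA = _Accum); that
;; mapping is an assumption here, not something this file defines:
;;
;;      short _Fract q = -0.25hr;
;;      float f = q;            /* compiles to a call to __fractqqsf */
;;
;; All of the helpers above funnel into __fractsasf / __fractusasf:
;; the payload is converted as a plain integer, and then the number of
;; fractional bits (15 resp. 16) is subtracted from the float's
;; exponent, which divides by 2^15 resp. 2^16 without a real division.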
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions from float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractsfqq)
DEFUN __fractsfqq
    ;; Multiply with 2^{24+7} to get a QQ result in r25
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XCALL   __fixsfsi
    mov     r24, r25
    ret
ENDF __fractsfqq
#endif  /* L_fractsfqq */

#if defined (L_fractsfuqq)
DEFUN __fractsfuqq
    ;; Multiply with 2^{24+8} to get a UQQ result in r25
    subi    r25, exp_hi (-32)
    XCALL   __fixunssfsi
    mov     r24, r25
    ret
ENDF __fractsfuqq
#endif  /* L_fractsfuqq */

#if defined (L_fractsfha)
DEFUN __fractsfha
    ;; Multiply with 2^{16+7} to get a HA result in r25:r24
    subi    r24, exp_lo (-23)
    sbci    r25, exp_hi (-23)
    XJMP    __fixsfsi
ENDF __fractsfha
#endif  /* L_fractsfha */

#if defined (L_fractsfuha)
DEFUN __fractsfuha
    ;; Multiply with 2^24 to get a UHA result in r25:r24
    subi    r25, exp_hi (-24)
    XJMP    __fixunssfsi
ENDF __fractsfuha
#endif  /* L_fractsfuha */

#if defined (L_fractsfhq)
FALIAS __fractsfsq

DEFUN __fractsfhq
    ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
    ;; resp. with 2^31 to get a SQ result in r25:r22
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XJMP    __fixsfsi
ENDF __fractsfhq
#endif  /* L_fractsfhq */

#if defined (L_fractsfuhq)
FALIAS __fractsfusq

DEFUN __fractsfuhq
    ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
    ;; resp. with 2^32 to get a USQ result in r25:r22
    subi    r25, exp_hi (-32)
    XJMP    __fixunssfsi
ENDF __fractsfuhq
#endif  /* L_fractsfuhq */

#if defined (L_fractsfsa)
DEFUN __fractsfsa
    ;; Multiply with 2^15 to get a SA result in r25:r22
    subi    r24, exp_lo (-15)
    sbci    r25, exp_hi (-15)
    XJMP    __fixsfsi
ENDF __fractsfsa
#endif  /* L_fractsfsa */

#if defined (L_fractsfusa)
DEFUN __fractsfusa
    ;; Multiply with 2^16 to get a USA result in r25:r22
    subi    r25, exp_hi (-16)
    XJMP    __fixunssfsi
ENDF __fractsfusa
#endif  /* L_fractsfusa */

;; For multiplication the functions here are called directly from
;; avr-fixed.md instead of using the standard libcall mechanisms.
;; This can make better code because GCC knows exactly which
;; of the call-used registers (not all of them) are clobbered.
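
;; The float -> fixed direction uses the same exponent trick in
;; reverse: subtracting exp_lo/exp_hi of -n adds n to the float's
;; biased exponent, i.e. multiplies by 2^n, after which the ordinary
;; float-to-integer routine yields the fixed-point bit pattern.
;; A minimal C-level sketch, under the same assumed type mapping:
;;
;;      float f = 0.75f;
;;      _Fract q = f;           /* compiles to a call to __fractsfhq */
;;
;; __fractsfhq computes (long) (f * 2^31); the caller keeps the high
;; word R25:R24, which is the s.15 result.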
/*******************************************************
    Fractional Multiplication  8 x 8  without MUL
*******************************************************/

#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
;;; R23 = R24 * R25
;;; Clobbers: __tmp_reg__, R22, R24, R25
;;; Rounding:  ???
DEFUN __mulqq3
    XCALL   __fmuls
    ;; TR 18037 requires that  (-1) * (-1)  does not overflow
    ;; The only input that can produce  -1  is  (-1)^2.
    dec     r23
    brvs    0f
    inc     r23
0:  ret
ENDF __mulqq3
#endif /* L_mulqq3 && ! HAVE_MUL */

/*******************************************************
    Fractional Multiply  .16 x .16  with and without MUL
*******************************************************/

#if defined (L_mulhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulhq3
    XCALL   __mulhisi3
    ;; Shift result into place
    lsl     r23
    rol     r24
    rol     r25
    brvs    1f
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
1:  ;; Overflow.  TR 18037 requires  (-1)^2  not to overflow
    ldi     r24, lo8 (0x7fff)
    ldi     r25, hi8 (0x7fff)
    ret
ENDF __mulhq3
#endif /* defined (L_mulhq3) */

#if defined (L_muluhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB < error <= 0.5 LSB
DEFUN __muluhq3
    XCALL   __umulhisi3
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
ENDF __muluhq3
#endif  /* L_muluhq3 */
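
;; A C model of the signed .16 x .16 multiply above (a sketch for
;; documentation, not the register-level algorithm):
;;
;;      int16_t mulhq3 (int16_t a, int16_t b)
;;      {
;;          int32_t p = (int32_t) a * b;        /* s.30 product */
;;          if (a == INT16_MIN && b == INT16_MIN)
;;              return INT16_MAX;               /* (-1)^2, TR 18037 */
;;          return (int16_t) ((p + (1L << 14)) >> 15);  /* round */
;;      }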
/*******************************************************
    Fixed  Multiply  8.8 x 8.8  with and without MUL
*******************************************************/

#if defined (L_mulha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulha3
    XCALL   __mulhisi3
    lsl     r22
    rol     r23
    rol     r24
    XJMP    __muluha3_round
ENDF __mulha3
#endif  /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB < error <= 0.5 LSB
DEFUN __muluha3
    XCALL   __umulhisi3
    XJMP    __muluha3_round
ENDF __muluha3
#endif  /* L_muluha3 */

#if defined (L_muluha3_round)
DEFUN __muluha3_round
    ;; Shift result into place
    mov     r25, r24
    mov     r24, r23
    ;; Round
    sbrc    r22, 7
    adiw    r24, 1
    ret
ENDF __muluha3_round
#endif  /* L_muluha3_round */

/*******************************************************
    Fixed  Multiplication  16.16 x 16.16
*******************************************************/

;; Bits outside the result (below LSB), used in the signed version
#define GUARD __tmp_reg__

#if defined (__AVR_HAVE_MUL__)

;; Multiplier
#define A0  16
#define A1  A0+1
#define A2  A1+1
#define A3  A2+1

;; Multiplicand
#define B0  20
#define B1  B0+1
#define B2  B1+1
#define B3  B2+1

;; Result
#define C0  24
#define C1  C0+1
#define C2  C1+1
#define C3  C2+1

#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0:  -1.0 LSB < error <= 0   LSB
;;; Rounding, T = 1:  -0.5 LSB < error <= 0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Don't ignore these LSBs in order to tame rounding error.
    ;; Use C2/C3 for these LSBs.

    clr C0
    clr C1
    mul A0, B0  $  movw C2, r0

    mul A1, B0  $  add C3, r0  $  adc C0, r1
    mul A0, B1  $  add C3, r0  $  adc C0, r1  $  rol C1

    ;; Round if T = 1.  Store guarding bits outside the result for rounding
    ;; and left-shift by the signed version (function below).
    brtc 0f
    sbrc C3, 7
    adiw C0, 1
0:  push C3

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul  A0, B2  $  add C0, r0  $  adc C1, r1  $  sbc  C2, C2
    mul  A1, B1  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    mul  A2, B0  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    neg  C2

    mul  A0, B3  $  add C1, r0  $  adc C2, r1  $  sbc  C3, C3
    mul  A1, B2  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul  A2, B1  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul  A3, B0  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    neg  C3

    mul  A1, B3  $  add C2, r0  $  adc C3, r1
    mul  A2, B2  $  add C2, r0  $  adc C3, r1
    mul  A3, B1  $  add C2, r0  $  adc C3, r1

    mul  A2, B3  $  add C3, r0
    mul  A3, B2  $  add C3, r0

    ;; Guard bits used in the signed version below.
    pop  GUARD
    clr  __zero_reg__
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */

#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulsa3
    clt
    XCALL   __mulusa3_round
    ;; A posteriori sign extension of the operands
    tst     B3
    brpl    1f
    sub     C2, A0
    sbc     C3, A1
1:  sbrs    A3, 7
    rjmp    2f
    sub     C2, B0
    sbc     C3, B1
2:  ;; Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3

#else /* __AVR_HAVE_MUL__ */

#define A0  18
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3

#define B0  22
#define B1  B0+1
#define B2  B0+2
#define B3  B0+3

#define C0  22
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3

;; __tmp_reg__
#define CC0  0
;; __zero_reg__
#define CC1  1
#define CC2  16
#define CC3  17

#define AA0  26
#define AA1  AA0+1
#define AA2  30
#define AA3  AA2+1

#if defined (L_mulsa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB <= error <= 1 LSB
DEFUN __mulsa3
    push    B0
    push    B1
    push    B3
    clt
    XCALL   __mulusa3_round
    pop     r30
    ;; sign-extend B
    bst     r30, 7
    brtc    1f
    ;; A1, A0 survived in R27:R26
    sub     C2, AA0
    sbc     C3, AA1
1:  pop     AA1     ;; B1
    pop     AA0     ;; B0
    ;; sign-extend A.  A3 survived in R31
    bst     AA3, 7
    brtc    2f
    sub     C2, AA0
    sbc     C3, AA1
2:  ;; Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif  /* L_mulsa3 */

#if defined (L_mulusa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB <= error <= 1 LSB
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Round if T = 1
;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    push    CC2
    push    CC3
    ; clear result
    clr     __tmp_reg__
    wmov    CC2, CC0
    ; save multiplicand
    wmov    AA0, A0
    wmov    AA2, A2
    rjmp    3f

    ;; Loop the integral part
1:  ;; CC += A * 2^n;  n >= 0
    add  CC0,A0  $  adc CC1,A1  $  adc CC2,A2  $  adc CC3,A3

2:  ;; A <<= 1
    lsl  A0      $  rol A1      $  rol A2      $  rol A3

3:  ;; IBIT(B) >>= 1
    ;; Carry = n-th bit of B;  n >= 0
    lsr     B3
    ror     B2
    brcs    1b
    sbci    B3, 0
    brne    2b

    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
    wmov    A0, AA0
    wmov    A2, AA2
    rjmp    5f

4:  ;; CC += A:Guard * 2^n;  n < 0
    add  B3,B2  $  adc CC0,A0  $  adc CC1,A1  $  adc CC2,A2  $  adc CC3,A3

5:  ;; A:Guard >>= 1
    lsr  A3  $  ror A2  $  ror A1  $  ror A0  $  ror B2

    ;; FBIT(B) <<= 1
    ;; Carry = n-th bit of B;  n < 0
    lsl     B0
    rol     B1
    brcs    4b
    sbci    B0, 0
    brne    5b

    ;; Save guard bits and set carry for rounding
    push    B3
    lsl     B3
    ;; Move result into place
    wmov    C2, CC2
    wmov    C0, CC0
    clr     __zero_reg__
    brtc    6f
    ;; Round iff T = 1
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
6:  pop     GUARD
    ;; Epilogue
    pop     CC3
    pop     CC2
    ret
ENDF __mulusa3_round
#endif  /* L_mulusa3 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef AA0
#undef AA1
#undef AA2
#undef AA3
#undef CC0
#undef CC1
#undef CC2
#undef CC3

#endif /* __AVR_HAVE_MUL__ */

#undef GUARD
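
;; A C model of the signed 16.16 (more precisely s16.15, SA) multiply
;; (a sketch; the routines above track only some of the guard bits,
;; hence the slightly looser error bounds stated in their headers):
;;
;;      int32_t mulsa3 (int32_t a, int32_t b)
;;      {
;;          int64_t p = (int64_t) a * b;        /* s32.30 product */
;;          return (int32_t) ((p + ((int64_t) 1 << 14)) >> 15);
;;      }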
/*******************************************************
      Fractional Division 8 / 8
*******************************************************/

#define r_divd  r25    /* dividend */
#define r_quo   r24    /* quotient */
#define r_div   r22    /* divisor */
#define r_sign  __tmp_reg__

#if defined (L_divqq3)
DEFUN __divqq3
    mov     r_sign, r_divd
    eor     r_sign, r_div
    sbrc    r_div, 7
    neg     r_div
    sbrc    r_divd, 7
    neg     r_divd
    XCALL   __divqq_helper
    lsr     r_quo           ; adjust to 7 fractional bits
    sbrc    r_sign, 7       ; negate result if needed
    neg     r_quo
    ret
ENDF __divqq3
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
DEFUN __udivuqq3
    cp      r_divd, r_div
    brsh    0f
    XJMP    __divqq_helper
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
0:  ldi     r_quo, 0xff
    ret
ENDF __udivuqq3
#endif  /* L_udivuqq3 */

#if defined (L_divqq_helper)
DEFUN __divqq_helper
    clr     r_quo           ; clear quotient
    inc     __zero_reg__    ; init loop counter, used per shift
__udivuqq3_loop:
    lsl     r_divd          ; shift dividend
    brcs    0f              ; dividend overflow
    cp      r_divd,r_div    ; compare dividend & divisor
    brcc    0f              ; dividend >= divisor
    rol     r_quo           ; shift quotient (with CARRY)
    rjmp    __udivuqq3_cont
0:
    sub     r_divd,r_div    ; restore dividend
    lsl     r_quo           ; shift quotient (without CARRY)
__udivuqq3_cont:
    lsl     __zero_reg__    ; shift loop-counter bit
    brne    __udivuqq3_loop
    com     r_quo           ; complement result
                            ; because C flag was complemented in loop
    ret
ENDF __divqq_helper
#endif  /* L_divqq_helper */

#undef r_divd
#undef r_quo
#undef r_div
#undef r_sign
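
;; A C model of the signed s.7 division (a sketch; the quotient only
;; fits when |a| < |b|; the unsigned entry point __udivuqq3 saturates
;; to 1 - eps instead):
;;
;;      int8_t divqq3 (int8_t a, int8_t b)
;;      {
;;          return (int8_t) (((int16_t) a << 7) / b);
;;      }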
/*******************************************************
    Fractional Division 16 / 16
*******************************************************/

#define r_divdL 26     /* dividend Low */
#define r_divdH 27     /* dividend High */
#define r_quoL  24     /* quotient Low */
#define r_quoH  25     /* quotient High */
#define r_divL  22     /* divisor Low */
#define r_divH  23     /* divisor High */
#define r_cnt   21

#if defined (L_divhq3)
DEFUN __divhq3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    cp      r_divdL, r_divL
    cpc     r_divdH, r_divH
    breq    __divhq3_minus1 ; if equal return -1
    XCALL   __udivuhq3
    lsr     r_quoH
    ror     r_quoL
    brpl    9f
    ;; negate result if needed
    NEG2    r_quoL
9:
    ret
__divhq3_minus1:
    ldi     r_quoH, 0x80
    clr     r_quoL
    ret
ENDF __divhq3
#endif  /* defined (L_divhq3) */

#if defined (L_udivuhq3)
DEFUN __udivuhq3
    sub     r_quoH,r_quoH   ; clear quotient and carry
    ;; FALLTHRU
ENDF __udivuhq3

DEFUN __udivuha3_common
    clr     r_quoL          ; clear quotient
    ldi     r_cnt,16        ; init loop counter
__udivuhq3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    brcs    __udivuhq3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    brcc    __udivuhq3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
__udivuhq3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivuhq3_cont:
    rol     r_quoH          ; shift quotient
    dec     r_cnt           ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    ret
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */

/*******************************************************
    Fixed Division 8.8 / 8.8
*******************************************************/

#if defined (L_divha3)
DEFUN __divha3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    XCALL   __udivuha3
    lsr     r_quoH          ; adjust to 7 fractional bits
    ror     r_quoL
    sbrs    r0, 7           ; negate result if needed
    ret
    NEG2    r_quoL
    ret
ENDF __divha3
#endif  /* defined (L_divha3) */

#if defined (L_udivuha3)
DEFUN __udivuha3
    mov     r_quoH, r_divdL
    mov     r_divdL, r_divdH
    clr     r_divdH
    lsl     r_quoH          ; shift quotient into carry
    XJMP    __udivuha3_common ; same as fractional after rearrange
ENDF __udivuha3
#endif  /* defined (L_udivuha3) */

#undef r_divdL
#undef r_divdH
#undef r_quoL
#undef r_quoH
#undef r_divL
#undef r_divH
#undef r_cnt
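
;; A C model of the s.15 division above (a sketch on the raw bit
;; patterns; the equal-magnitude case is special-cased to -1 by the
;; code above, as its comment notes):
;;
;;      int16_t divhq3 (int16_t a, int16_t b)
;;      {
;;          return (int16_t) (((int32_t) a << 15) / b);
;;      }
;;
;; The 8.8 variant __divha3 is the same idea with 7 fractional bits:
;; ((int32_t) a << 7) / b on the raw s8.7 representations.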
/*******************************************************
    Fixed Division 16.16 / 16.16
*******************************************************/

#define r_arg1L  24    /* arg1 gets passed already in place */
#define r_arg1H  25
#define r_arg1HL 26
#define r_arg1HH 27
#define r_divdL  26    /* dividend Low */
#define r_divdH  27
#define r_divdHL 30
#define r_divdHH 31    /* dividend High */
#define r_quoL   22    /* quotient Low */
#define r_quoH   23
#define r_quoHL  24
#define r_quoHH  25    /* quotient High */
#define r_divL   18    /* divisor Low */
#define r_divH   19
#define r_divHL  20
#define r_divHH  21    /* divisor High */
#define r_cnt __zero_reg__  /* loop count (0 after the loop!) */

#if defined (L_divsa3)
DEFUN __divsa3
    mov     r0, r_arg1HH
    eor     r0, r_divHH
    sbrs    r_divHH, 7
    rjmp    1f
    NEG4    r_divL
1:
    sbrs    r_arg1HH, 7
    rjmp    2f
    NEG4    r_arg1L
2:
    XCALL   __udivusa3
    lsr     r_quoHH         ; adjust to 15 fractional bits
    ror     r_quoHL
    ror     r_quoH
    ror     r_quoL
    sbrs    r0, 7           ; negate result if needed
    ret
    ;; negate r_quoL
    XJMP    __negsi2
ENDF __divsa3
#endif  /* defined (L_divsa3) */

#if defined (L_udivusa3)
DEFUN __udivusa3
    ldi     r_divdHL, 32    ; init loop counter
    mov     r_cnt, r_divdHL
    clr     r_divdHL
    clr     r_divdHH
    wmov    r_quoL, r_divdHL
    lsl     r_quoHL         ; shift quotient into carry
    rol     r_quoHH
__udivusa3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    rol     r_divdHL
    rol     r_divdHH
    brcs    __udivusa3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
__udivusa3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivusa3_cont:
    rol     r_quoH          ; shift quotient
    rol     r_quoHL
    rol     r_quoHH
    dec     r_cnt           ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    com     r_quoHL
    com     r_quoHH
    ret
ENDF __udivusa3
#endif  /* defined (L_udivusa3) */

#undef r_arg1L
#undef r_arg1H
#undef r_arg1HL
#undef r_arg1HH
#undef r_divdL
#undef r_divdH
#undef r_divdHL
#undef r_divdHH
#undef r_quoL
#undef r_quoH
#undef r_quoHL
#undef r_quoHH
#undef r_divL
#undef r_divH
#undef r_divHL
#undef r_divHH
#undef r_cnt
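
;; A C model of the s16.15 division (a sketch on the raw bit patterns,
;; truncating toward zero like the code above):
;;
;;      int32_t divsa3 (int32_t a, int32_t b)
;;      {
;;          return (int32_t) (((int64_t) a << 15) / b);
;;      }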
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24
#define A1  A0+1

#if defined (L_ssneg_2)
DEFUN __ssneg_2
    NEG2    A0
    brvc    0f
    sbiw    A0, 1
0:  ret
ENDF __ssneg_2
#endif /* L_ssneg_2 */

#if defined (L_ssabs_2)
DEFUN __ssabs_2
    sbrs    A1, 7
    ret
    XJMP    __ssneg_2
ENDF __ssabs_2
#endif /* L_ssabs_2 */

#undef A0
#undef A1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  22
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3

#if defined (L_ssneg_4)
DEFUN __ssneg_4
    XCALL   __negsi2
    brvc    0f
    ldi     A3, 0x7f
    ldi     A2, 0xff
    ldi     A1, 0xff
    ldi     A0, 0xff
0:  ret
ENDF __ssneg_4
#endif /* L_ssneg_4 */

#if defined (L_ssabs_4)
DEFUN __ssabs_4
    sbrs    A3, 7
    ret
    XJMP    __ssneg_4
ENDF __ssabs_4
#endif /* L_ssabs_4 */

#undef A0
#undef A1
#undef A2
#undef A3

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  18
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3
#define A4  A0+4
#define A5  A0+5
#define A6  A0+6
#define A7  A0+7

#if defined (L_clr_8)
FALIAS __usneguta2
FALIAS __usneguda2
FALIAS __usnegudq2

;; Clear Carry and all Bytes
DEFUN __clr_8
    ;; Clear Carry and set Z
    sub     A7, A7
    ;; FALLTHRU
ENDF  __clr_8
;; Propagate Carry to all Bytes, Carry unaltered
DEFUN __sbc_8
    sbc     A7, A7
    sbc     A6, A6
    wmov    A4, A6
    wmov    A2, A6
    wmov    A0, A6
    ret
ENDF __sbc_8
#endif /* L_clr_8 */

#if defined (L_ssneg_8)
FALIAS __ssnegta2
FALIAS __ssnegda2
FALIAS __ssnegdq2

DEFUN __ssneg_8
    XCALL   __negdi2
    brvc    0f
    ;; A[] = 0x7fffffffffffffff
    sec
    XCALL   __sbc_8
    ldi     A7, 0x7f
0:  ret
ENDF __ssneg_8
#endif /* L_ssneg_8 */

#if defined (L_ssabs_8)
FALIAS __ssabsta2
FALIAS __ssabsda2
FALIAS __ssabsdq2

DEFUN __ssabs_8
    sbrs    A7, 7
    ret
    XJMP    __ssneg_8
ENDF __ssabs_8
#endif /* L_ssabs_8 */

;; Second Argument
#define B0  10
#define B1  B0+1
#define B2  B0+2
#define B3  B0+3
#define B4  B0+4
#define B5  B0+5
#define B6  B0+6
#define B7  B0+7

#if defined (L_usadd_8)
FALIAS __usadduta3
FALIAS __usadduda3
FALIAS __usaddudq3

DEFUN __usadd_8
    XCALL   __adddi3
    brcs    0f
    ret
0:  ;; A[] = 0xffffffffffffffff
    XJMP    __sbc_8
ENDF __usadd_8
#endif /* L_usadd_8 */

#if defined (L_ussub_8)
FALIAS __ussubuta3
FALIAS __ussubuda3
FALIAS __ussubudq3

DEFUN __ussub_8
    XCALL   __subdi3
    brcs    0f
    ret
0:  ;; A[] = 0
    XJMP    __clr_8
ENDF __ussub_8
#endif /* L_ussub_8 */

#if defined (L_ssadd_8)
FALIAS __ssaddta3
FALIAS __ssaddda3
FALIAS __ssadddq3

DEFUN __ssadd_8
    XCALL   __adddi3
    brvc    0f
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
    cpi     B7, 0x80
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __ssadd_8
#endif /* L_ssadd_8 */

#if defined (L_sssub_8)
FALIAS __sssubta3
FALIAS __sssubda3
FALIAS __sssubdq3

DEFUN __sssub_8
    XCALL   __subdi3
    brvc    0f
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
    ldi     A7, 0x7f
    cp      A7, B7
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __sssub_8
#endif /* L_sssub_8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7
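
;; A C model of the 2-byte saturated negate above (a sketch; the wider
;; and the unsigned variants follow the same pattern, clamping to the
;; type's extremes instead of wrapping on overflow):
;;
;;      int16_t ssneg_2 (int16_t a)
;;      {
;;          return (a == INT16_MIN) ? INT16_MAX : (int16_t) -a;
;;      }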