[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / adddf3.S

/* Copyright (C) 2008-2024 Free Software Foundation, Inc.
   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
		on behalf of Synopsys Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "arc-ieee-754.h"
#if 0 /* DEBUG */
	.global __adddf3
	.balign 4
/* DEBUG-only wrapper (this whole region is under "#if 0").
   Calls __adddf3_c and then __adddf3_asm with the same operands in r0-r3
   and calls abort if the two results in r0/r1 differ.  __adddf3_c is not
   defined in this file; presumably it is a reference implementation.  */
__adddf3:
	/* Save the return address and all four argument words.  */
	push_s blink
	push_s r2
	push_s r3
	push_s r0
	bl.d __adddf3_c
	push_s r1		/* (delay slot) last argument word saved */
	/* Saved words, top of stack first: r1, r0, r3, r2, blink.
	   Reload the second operand and store the first result into the
	   two slots that held it.  */
	ld_s r2,[sp,12]
	ld_s r3,[sp,8]
	st_s r0,[sp,12]
	st_s r1,[sp,8]
	pop_s r1
	bl.d __adddf3_asm
	pop_s r0		/* (delay slot) first operand restored */
	/* r3 / r2 receive the r1 / r0 words returned by __adddf3_c.  */
	pop_s r3
	pop_s r2
	pop_s blink
	/* Return only if both result words agree.  */
	cmp r0,r2
	cmp.eq r1,r3
	jeq_s [blink]
	bl abort
	.global __subdf3
	.balign 4
/* DEBUG-only wrapper, same scheme as the __adddf3 wrapper:
   calls __subdf3_c and then __subdf3_asm with the same operands in r0-r3
   and calls abort if the two results in r0/r1 differ.  */
__subdf3:
	/* Save the return address and all four argument words.  */
	push_s blink
	push_s r2
	push_s r3
	push_s r0
	bl.d __subdf3_c
	push_s r1		/* (delay slot) last argument word saved */
	/* Saved words, top of stack first: r1, r0, r3, r2, blink.
	   Reload the second operand and store the first result into the
	   two slots that held it.  */
	ld_s r2,[sp,12]
	ld_s r3,[sp,8]
	st_s r0,[sp,12]
	st_s r1,[sp,8]
	pop_s r1
	bl.d __subdf3_asm
	pop_s r0		/* (delay slot) first operand restored */
	/* r3 / r2 receive the r1 / r0 words returned by __subdf3_c.  */
	pop_s r3
	pop_s r2
	pop_s blink
	/* Return only if both result words agree.  */
	cmp r0,r2
	cmp.eq r1,r3
	jeq_s [blink]
	bl abort
/* With DEBUG enabled the real routines further down are assembled under
   the _asm names that the wrappers above call.  */
#define __adddf3 __adddf3_asm
#define __subdf3 __subdf3_asm
#endif /* DEBUG */
/* N.B. This is optimized for ARC700.
  ARC600 has very different scheduling / instruction selection criteria.  */

/* inputs: DBL0, DBL1 (r0-r3)
   output: DBL0 (r0, r1)
   clobber: r2-r10, r12, flags
   All NaN highword bits must be 1.  NaN low word is random.  */

	.balign 4
	.global __adddf3
	.global __subdf3
	.long 0x7ff00000 ; exponent mask
	FUNC(__adddf3)
	FUNC(__subdf3)
/* Entry points.  __subdf3 toggles the sign of DBL1 and falls into
   __adddf3, which classifies the operands and dispatches.
   Values set up here and used by the later paths:
     r9  = 0x7ff00000 (the literal above)
     r4  = DBL0H with the sign bit cleared
     r6  = exponent field of DBL1H
     r10 = DBL0H ^ DBL1H (negative iff the signs differ)
     r12 = (r4 - r6) >> 20, the exponent difference.  */
__subdf3:
	bxor_l DBL1H,DBL1H,31	/* a - b == a + (-b) */
__adddf3:
	/* Loads the exponent mask literal.  NOTE(review): the -8 offset
	   assumes the literal sits 8 bytes before this instruction, i.e.
	   that the bxor_l above is a 4-byte encoding -- re-check if the
	   prologue is ever changed.  */
	ld r9,[pcl,-8]
	bmsk r4,DBL0H,30
	xor r10,DBL0H,DBL1H
	and r6,DBL1H,r9
	sub.f r12,r4,r6		/* borrow iff DBL0 exponent < DBL1 exponent */
	asr_s r12,r12,20
	blo .Ldbl1_gt		/* tests the borrow of the sub.f above */
	brhs r4,r9,.Linf_nan	/* DBL0 is infinity or NaN */
	brhs r12,32,.Large_shift
	brne r12,0,.Lsmall_shift
	brge r10,0,.Ladd_same_exp ; r12 == 0
	/* Falls through: equal exponents, different signs.  */

/* After subtracting, we need to normalize; when shifting to place the
  leading 1 into position for the implicit 1 and adding that to DBL0H,
  we increment the exponent.  Thus, we have to subtract one more than
  the shift count from the exponent beforehand.  Iff the exponent drops thus
  below zero (before adding in the fraction with the leading one), we have
  generated a denormal number.  Denormal handling is basically reducing the
  shift count so that we produce a zero exponent instead; however, this way
  the shift count can become zero (if we started out with exponent 1).
  Therefore, a simple min operation is not good enough, since we don't
  want to handle a zero normalizing shift in the main path.
  On the plus side, we don't need to check for denorm input, the result
  of subtracting these looks just the same as denormals generated during
  subtraction.  */
	/* Equal exponents, different signs: subtract the smaller magnitude
	   from the larger one.  */
	bmsk r7,DBL1H,30	/* r7 = DBL1H with the sign bit cleared */
	cmp r4,r7
	cmp.eq DBL0L,DBL1L	/* high words equal: decide on the low words */
	blo .L_rsub_same_exp	/* |DBL0| < |DBL1|: subtract the other way */
	sub.f DBL0L,DBL0L,DBL1L
	bmsk r12,DBL0H,19
	bic DBL1H,DBL0H,r12	/* DBL1H = sign and exponent of DBL0 */
	sbc.f r4,r4,r7		/* r4:DBL0L = |DBL0| - |DBL1| */
	beq_l .Large_cancel	/* high word cancelled completely */
	norm DBL1L,r4
	b.d .Lsub_done_same_exp
	sub r12,DBL1L,9		/* (delay slot) exponent decrement */

	.balign 4
/* DBL0 is infinity or NaN (r4 >= 0x7ff00000) and its exponent is not
   smaller than that of DBL1.  The result is returned in DBL0; DBL0L is
   left untouched.  */
.Linf_nan:
	; If both inputs are inf, but with different signs, the result is NaN.
	asr r12,r10,31		/* all ones iff the signs differ */
	or_s DBL1H,DBL1H,r12
	j_s.d [blink]
	/* (delay slot) NOTE(review): .eq appears to test the flags left by
	   the entry sub.f r12,r4,r6, i.e. r4 == exponent field of DBL1,
	   which on this path means DBL0 is infinity and DBL1 is infinity
	   or NaN -- confirm that nothing in between alters Z.  */
	or.eq DBL0H,DBL0H,DBL1H

	.balign 4
/* Equal exponents, different signs, |DBL0| < |DBL1|: compute
   |DBL1| - |DBL0|; the result takes the sign and exponent of DBL1.
   In: r4 = DBL0H without sign, r7 = DBL1H without sign.  */
.L_rsub_same_exp:
	rsub.f DBL0L,DBL0L,DBL1L	/* DBL0L = DBL1L - DBL0L */
	bmsk r12,DBL1H,19
	bic_s DBL1H,DBL1H,r12	/* DBL1H = sign and exponent of DBL1 */
	sbc.f r4,r7,r4		/* r4:DBL0L = |DBL1| - |DBL0| */
	beq_l .Large_cancel	/* high word cancelled completely */
	norm DBL1L,r4

	sub r12,DBL1L,9
/* In: r4:DBL0L = unnormalized difference with r4 != 0,
       DBL1H = sign and exponent for the result,
       DBL1L = norm result for r4, r12 = DBL1L - 9.  */
.Lsub_done_same_exp:
	asl_s r12,r12,20	/* exponent decrement, placed in the exponent field */
	sub_s DBL1L,DBL1L,10	/* normalizing left-shift count */
	sub DBL0H,DBL1H,r12
	xor.f 0,DBL0H,DBL1H	/* sign bit changed: exponent dropped below zero */
	bmi .Ldenorm
.Lpast_denorm:
	/* Shift r4:DBL0L left by DBL1L and add the high part to DBL0H.
	   NOTE(review): the lsr by the negated count relies on the shifter
	   using only the low bits of the count -- confirm against the ISA.  */
	neg_s r12,DBL1L
	lsr r7,DBL0L,r12	/* low-word bits that move into the high word */
	asl r12,r4,DBL1L
	asl_s DBL0L,DBL0L,DBL1L
	add_s r12,r12,r7
	j_s.d [blink]
	add_l DBL0H,DBL0H,r12	/* (delay slot) */
	.balign 4
/* Equal exponents, equal signs (reached from the entry dispatch with
   r12 == 0 and r10 >= 0).  In: r6 = exponent field of DBL1,
   r4 = DBL0H without sign, r9 = 0x7ff00000.  */
.Ladd_same_exp:
	/* This is a special case because we can't test for need to shift
	   down by checking if bit 20 of DBL0H changes.  OTOH, here we know
	   that we always need to shift down.  */
	; The implicit 1 of DBL0 is not shifted together with the
	;  fraction, thus effectively doubled, compensating for not setting
	;  implicit1 for DBL1
	add_s r12,DBL0L,DBL1L
	lsr.f 0,r12,2 ; round to even
	breq r6,0,.Ldenorm_add	/* zero exponent field in both operands */
	adc.f DBL0L,DBL0L,DBL1L	/* low-word sum plus the carry from the lsr.f */
	sub r7,DBL1H,DBL0H
	sub1 r7,r7,r9 ; boost exponent by 2/2
	rrc DBL0L,DBL0L		/* halve the low word; the carry enters at the top */
	asr.f r7,r7 ; DBL1.fraction/2 - DBL0.fraction/2 ; exp++
	add.cs.f DBL0L,DBL0L,0x80000000
	add_l DBL0H,DBL0H,r7 ; DBL0.implicit1 not shifted for DBL1.implicit1
	add.cs DBL0H,DBL0H,1
	bic.f 0,r9,DBL0H ; check for overflow -> infinity.
	jne_l [blink]
	/* Exponent field is all ones: return infinity with a zero fraction.  */
	and DBL0H,DBL0H,0xfff00000
	j_s.d [blink]
	mov_s DBL0L,0		/* (delay slot) */
	.balign 4
/* DBL0 has the larger exponent and the exponent difference r12 is >= 32.
   In: r6 = exponent field of DBL1, r10 = sign difference.  */
.Large_shift:
	brhs r12,55,.Lret_dbl0	/* difference >= 55: DBL0 is returned unchanged */
	bmsk_s DBL1H,DBL1H,19	/* keep only the fraction bits of DBL1H */
	brne r6,0,.Lno_denorm_large_shift
	/* DBL1 has a zero exponent field: no implicit 1 is set and the
	   shift count is reduced by one.  */
	brhi.d r12,33,.Lfixed_denorm_large_shift
	sub_s r12,r12,1		/* (delay slot) */
	breq r12,31, .Lfixed_denorm_small_shift
.Lshift32:
	/* Shift by exactly 32: the old low word becomes the guard word.  */
	mov_s r12,DBL1L
	mov_s DBL1L,DBL1H
	brlt.d r10,0,.Lsub	/* signs differ: subtract */
	mov_s DBL1H,0		/* (delay slot) */
	b_s .Ladd
/* Same-sign addition with a zero exponent field, reached from
   .Ladd_same_exp.  In: r12 = DBL0L + DBL1L, r4 = DBL0H without sign.  */
.Ldenorm_add:
	cmp_s r12,DBL1L		/* carry iff the low-word sum wrapped around */
	mov_s DBL0L,r12
	j_s.d [blink]
	adc DBL0H,r4,DBL1H	/* (delay slot) the sign bit comes from DBL1H */

/* Result is DBL0 as passed in.  */
.Lret_dbl0:
	j_s [blink]
	.balign 4
/* DBL0 has the larger exponent; the exponent difference r12 is 1..31.
   Aligns DBL1 to DBL0 by shifting its fraction right by r12 bits.  */
.Lsmall_shift:
	breq.d r6,0,.Ldenorm_small_shift	/* DBL1 exponent field is zero */
	bmsk_s DBL1H,DBL1H,19	/* (delay slot) keep only the fraction bits */
	bset_s DBL1H,DBL1H,20	/* make the leading 1 explicit */
.Lfixed_denorm_small_shift:
	/* 64-bit right shift of DBL1H:DBL1L by r12; the bits shifted out
	   are collected in r12 as guard bits.  r4, r5 and r8 are scratch.
	   NOTE(review): asl by the negated count relies on the shifter
	   using only the low bits of the count -- confirm against the ISA.  */
	neg r8,r12
	asl r4,DBL1H,r8		/* high-word bits that move into the low word */
	lsr_l DBL1H,DBL1H,r12
	lsr r5,DBL1L,r12
	asl r12,DBL1L,r8	/* guard bits */
	brge.d r10,0,.Ladd	/* same sign: add */
	or DBL1L,r4,r5		/* (delay slot) */
	/* Signs differ: falls through into .Lsub.  */
/* subtract, abs(DBL0) > abs(DBL1) */
/* DBL0H, DBL0L: original values
   DBL1H, DBL1L: fraction with explicit leading 1, shifted into place
   r4:  orig. DBL0H & 0x7fffffff
   r6:  orig. DBL1H & 0x7ff00000
   r9:  0x7ff00000
   r10: orig. DBL0H ^ DBL1H
   r12: guard bits
   Note: r4 and r6 are both overwritten below before they are read; on
   the paths through .Lfixed_denorm_small_shift and
   .Lfixed_denorm_large_shift r4 has already been reused as scratch.  */
	.balign 4
.Lsub:
	neg.f r12,r12		/* 0 - guard bits; the borrow feeds the sbc.f below */
	mov_s r7,DBL1H
	bmsk r5,DBL0H,19	/* r5 = fraction bits of DBL0H */
	sbc.f DBL0L,DBL0L,DBL1L
	bic DBL1H,DBL0H,r5	/* DBL1H = sign and exponent of DBL0 */
	bset r5,r5,20		/* make the leading 1 of DBL0 explicit */
	sbc.f r4,r5,r7		/* r4:DBL0L:r12 = difference */
	beq_l .Large_cancel_sub	/* high word cancelled completely */
	norm DBL1L,r4
	bmsk r6,DBL1H,30	/* r6 = exponent field without the sign */
/* Common tail, also entered from .Lrsub.
   In: r4:DBL0L:r12 = difference with r4 != 0, DBL1L = norm result for r4,
       DBL1H = sign and exponent of the result, r6 = its exponent field.  */
.Lsub_done:
	sub_s DBL1L,DBL1L,9
	breq DBL1L,1,.Lsub_done_noshift	/* no normalizing shift needed */
	asl r5,DBL1L,20		/* exponent decrement */
	sub_s DBL1L,DBL1L,1	/* normalizing left-shift count */
	brlo r6,r5,.Ldenorm_sub	/* exponent would drop below zero */
	sub DBL0H,DBL1H,r5
.Lpast_denorm_sub:
	/* Shift r4:DBL0L:r12 left by DBL1L, round on what is left of the
	   guard bits and add the fraction to DBL0H.
	   NOTE(review): the add1.f / add.ne.f pair presumably implements
	   round-to-nearest-even (carry = top remaining guard bit unless
	   the result LSB and all lower guard bits are zero) -- confirm.  */
	neg_s DBL1H,DBL1L
	lsr r6,r12,DBL1H	/* guard bits that move into the low word */
	asl_s r12,r12,DBL1L	/* guard bits that remain */
	and r8,r6,1
	add1.f 0,r8,r12
	add.ne.f r12,r12,r12
	asl r8,DBL0L,DBL1L
	lsr r12,DBL0L,DBL1H	/* low-word bits that move into the high word */
	adc.f DBL0L,r8,r6
	asl r5,r4,DBL1L
	add_s DBL0H,DBL0H,r12
	j_s.d [blink]
	adc DBL0H,DBL0H,r5	/* (delay slot) */

	.balign 4
/* Large shift (>= 32) of DBL1 when its exponent field is non-zero.
   In: DBL1H already masked to its fraction bits, r12 = shift count.  */
.Lno_denorm_large_shift:
	breq.d r12,32,.Lshift32
	bset_l DBL1H,DBL1H,20	/* (delay slot) make the leading 1 explicit */
.Lfixed_denorm_large_shift:
	/* Shift DBL1H:DBL1L right by r12 (more than 32): the old high word
	   supplies the new low word, and everything below it is condensed
	   into the guard word r12.
	   NOTE(review): relies on the shifter using only the low bits of
	   the count -- confirm against the ISA.  */
	neg r8,r12
	asl r4,DBL1H,r8
	lsr r5,DBL1L,r12
	asl.f 0,DBL1L,r8	/* Z clear iff low-word bits fall below the guard word */
	lsr DBL1L,DBL1H,r12
	or r12,r4,r5
	tst.eq r12,1
	or.ne r12,r12,2		/* fold the lost bits / bit 0 into bit 1 */
	brlt.d r10,0,.Lsub	/* signs differ: subtract */
	mov_s DBL1H,0		/* (delay slot) */
	b_l .Ladd

	; If a denorm is produced without shifting, we have an exact result -
	; no need for rounding.
	.balign 4
/* Reached from .Lsub_done when the exponent is too small for the full
   normalizing shift, so the result is a denormal.
   In: r6 = exponent field of the larger operand (sign stripped),
       DBL1H = sign and exponent of the result,
       r4:DBL0L = unnormalized difference, r12 = guard bits.
   The usable left shift is (exponent - 1).  */
.Ldenorm_sub:
	lsr DBL1L,r6,20		/* exponent as an integer */
	xor DBL0H,r6,DBL1H	/* DBL0H = sign only, exponent field 0 */
	brne.d DBL1L,1,.Lpast_denorm_sub
	sub_s DBL1L,DBL1L,1	/* (delay slot) shift count = exponent - 1 */
	/* Exponent field 1: no shift is possible, and the difference is
	   already the denormal fraction.  The other operand then has a
	   zero exponent field; the denormal paths enter the subtraction
	   with a zero shift and an all-zero guard word, so nothing needs
	   rounding.  Return sign + fraction with exponent field 0.
	   Previously this fell through into .Lsub_done_noshift, which
	   rebuilds the result from DBL1H and so kept exponent field 1
	   (e.g. DBL_MIN - DBL_MIN/2 came out as 1.5 * DBL_MIN).  */
	j_s.d [blink]
	add_l DBL0H,DBL0H,r4	/* (delay slot) */
/* Normal result whose leading 1 is already in bit 20 of r4.
   In: DBL1H = sign and exponent, r4:DBL0L = fraction, r12 = guard bits.  */
.Lsub_done_noshift:
	/* NOTE(review): presumably round-to-nearest-even: carry = top guard
	   bit, cleared again by the cmp.eq when the lower guard bits and
	   the result LSB are all zero -- confirm.  */
	add.f 0,r12,r12
	btst.eq DBL0L,0
	cmp.eq r12,r12
	add.cs.f DBL0L,DBL0L,1
	bclr r4,r4,20		/* drop the explicit leading 1 */
	j_s.d [blink]
	adc DBL0H,DBL1H,r4	/* (delay slot) */

	.balign 4
/* Small shift with DBL1 having a zero exponent field: no implicit 1 is
   set (DBL1H already holds only fraction bits) and the shift count is
   reduced by one.  */
.Ldenorm_small_shift:
	brne.d r12,1,.Lfixed_denorm_small_shift
	sub_l r12,r12,1		/* (delay slot) */
	/* The shift count is now zero, so r12 doubles as an all-zero
	   guard word.  */
	brlt r10,0,.Lsub
/* Same-sign addition, |DBL0| has the larger exponent.
   In: DBL0H:DBL0L = original DBL0, DBL1H:DBL1L = aligned fraction of DBL1,
       r12 = guard bits, r9 = 0x7ff00000.  */
.Ladd: ; bit 20 of DBL1H is clear and bit 0 of r12 does not matter
	add.f DBL0L,DBL0L,DBL1L
	add_s DBL1H,DBL1H,DBL0H
	add.cs DBL1H,DBL1H,1	/* DBL1H = high word of the sum */
	xor_l DBL0H,DBL0H,DBL1H
	bbit0 DBL0H,20,.Lno_shiftdown	/* lowest exponent bit unchanged */
	/* The fraction carried into the exponent: shift the fraction right
	   by one bit and round.  */
	lsr.f DBL0H,DBL1H
	and r4,DBL0L,2
	bmsk DBL0H,DBL0H,18
	sbc DBL0H,DBL1H,DBL0H
	rrc.f DBL0L,DBL0L
	or.f r12,r12,r4
	cmp.eq r12,r12
	add.cs.f DBL0L,DBL0L,1
	bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
	jne.d [blink]    ; ... non-zero fraction
	add.cs DBL0H,DBL0H,1
	/* Exponent field is all ones: clear the fraction.  */
	mov_s DBL0L,0
	bmsk DBL1H,DBL0H,19
	j_s.d [blink]
	bic_s DBL0H,DBL0H,DBL1H	/* (delay slot) */
.Lno_shiftdown:
	mov_s DBL0H,DBL1H
	/* NOTE(review): presumably round-to-nearest-even on the guard word:
	   carry = top guard bit, cleared again by the cmp.eq when the
	   lower guard bits and the result LSB are all zero -- confirm.  */
	add.f 0,r12,r12
	btst.eq DBL0L,0
	cmp.eq r12,r12
	add.cs.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.cs DBL0H,DBL0H,1	/* (delay slot) */
	.balign 4
/* Same-exponent subtraction whose normalized exponent would drop below
   zero (reached from .Lsub_done_same_exp): build a denormal instead.
   In: DBL1H = sign and exponent of the larger operand,
       r4:DBL0L = unnormalized difference.
   The usable left shift is (exponent - 1).  For exponent 0 or 1 no shift
   is possible and the difference already is the denormal fraction.  */
.Ldenorm:
	bmsk DBL0H,DBL1H,30	/* exponent field without the sign */
	lsr r12,DBL0H,20	/* exponent as an integer */
	xor_s DBL0H,DBL0H,DBL1H	/* DBL0H = sign only, exponent field 0 */
	/* This subtraction must set the flags for the bgt below, as the
	   matching code in .Large_cancel does.  Without them bgt tests the
	   flags of the earlier xor.f (N set), is never taken, and every
	   denormal result with exponent >= 2 is returned unshifted.  */
	sub.f DBL1L,r12,1	/* shift count = exponent - 1 */
	bgt .Lpast_denorm	/* shift > 0: shift and merge there */
	j_s.d [blink]
	add_l DBL0H,DBL0H,r4	/* (delay slot) no shift: merge the high fraction */

	.balign 4
/* Same-exponent subtraction whose high word cancelled to zero (reached
   via beq_l right after the sbc.f that produced r4).  */
.Large_cancel:
	;DBL0L: mantissa DBL1H: sign & exponent
	norm.f DBL1L,DBL0L
	bmsk DBL0H,DBL1H,30	/* exponent field without the sign */
	add_s DBL1L,DBL1L,22
	mov.mi DBL1L,21		/* presumably: bit 31 of DBL0L is set */
	add_s r12,DBL1L,1
	asl_s r12,r12,20	/* exponent decrement = shift count + 1 */
	/* NOTE(review): presumably still the Z flag of the norm.f, i.e.
	   DBL0L == 0 and the operands were equal -- confirm.  */
	beq_s .Lret0
	brhs.d DBL0H,r12,.Lpast_denorm_large_cancel
	sub DBL0H,DBL1H,r12	/* (delay slot) */
	/* Exponent too small for the full shift: produce a denormal.  */
	bmsk DBL0H,DBL1H,30
	lsr r12,DBL0H,20	/* exponent as an integer */
	xor_s DBL0H,DBL0H,DBL1H	/* DBL0H = sign only */
	sub.f DBL1L,r12,1	/* shift count = exponent - 1 */
	jle [blink]		/* no shift possible: DBL0H:DBL0L is the result */
.Lpast_denorm_large_cancel:
	/* Shift DBL0L left by DBL1L into DBL0H:DBL0L.  The .ls conditions
	   use the flags of the rsub.f, i.e. they apply when the shift
	   count is 32 or more.  */
	rsub.f r7,DBL1L,32
	lsr r7,DBL0L,r7
	asl_s DBL0L,DBL0L,DBL1L
	mov.ls r7,DBL0L
	add_s DBL0H,DBL0H,r7
	j_s.d [blink]
	mov.ls DBL0L,0		/* (delay slot) */
/* Return with a zero high word; DBL0L is left as it is.  */
.Lret0:
	j_s.d	[blink]
	mov_l	DBL0H,0

/* r4:DBL0L:r12 : unnormalized result fraction
   DBL1H: result sign and exponent         */
/* When seeing large cancellation, only the topmost guard bit might be set.  */
/* Reached from .Lsub and .Lrsub via beq_l right after the sbc.f that
   produced r4, i.e. with r4 == 0.  */
	.balign 4
.Large_cancel_sub:
	norm.f DBL1L,DBL0L
	bpnz.d 0f		/* DBL0L positive and non-zero */
	bmsk DBL0H,DBL1H,30	/* (delay slot) exponent field without the sign */
	mov r5,22<<20		/* exponent decrement for a 21-bit shift */
	bne.d 1f		/* DBL0L non-zero */
	mov_s DBL1L,21		/* (delay slot) matching shift count */
	/* DBL0L is zero: only the guard word can still hold a bit.  */
	bset r5,r5,5+20		/* exponent decrement grows by 32 */
	add_s DBL1L,DBL1L,32	/* and so does the shift count */
	brne r12,0,1f
	/* Everything cancelled: return with a zero high word.  */
	j_s.d	[blink]
	mov_l	DBL0H,0
	.balign 4
0:	add r5,DBL1L,23
	asl r5,r5,20		/* exponent decrement = shift count + 1 */
	add_s DBL1L,DBL1L,22	/* shift count */
1:	brlo DBL0H,r5,.Ldenorm_large_cancel_sub
	sub DBL0H,DBL1H,r5
.Lpast_denorm_large_cancel_sub:
	/* Shift DBL0L:r12 left by DBL1L into DBL0H:DBL0L.  The conditional
	   instructions use the flags of the rsub.f; NOTE(review): .ge / .lt
	   / .eq presumably select shift <= 32, > 32 and == 32 -- confirm.  */
	rsub.f r7,DBL1L,32
	lsr r12,r12,r7
	lsr r7,DBL0L,r7
	asl_s DBL0L,DBL0L,DBL1L
	add.ge DBL0H,DBL0H,r7
	add_s DBL0L,DBL0L,r12
	add.lt DBL0H,DBL0H,DBL0L
	mov.eq DBL0L,r12
	j_s.d [blink]
	mov.lt DBL0L,0		/* (delay slot) */
	.balign 4
/* Exponent too small for the full shift: produce a denormal.
   In: DBL0H = exponent field without the sign.  */
.Ldenorm_large_cancel_sub:
	lsr r5,DBL0H,20		/* exponent as an integer */
	xor_s DBL0H,DBL0H,DBL1H	/* DBL0H = sign only */
	brgt.d r5,1,.Lpast_denorm_large_cancel_sub
	sub DBL1L,r5,1		/* (delay slot) shift count = exponent - 1 */
	j_l [blink] ; denorm, no shift -> no rounding needed.

/* r4: DBL0H & 0x7fffffff
   r6: DBL1H & 0x7ff00000
   r9: 0x7ff00000
   r10: sign difference
   r12: shift count (negative) */
/* DBL1 has the larger exponent (the entry blo was taken, so r4 < r6 and
   r6 is never zero on this path).  DBL0 is the operand that gets shifted;
   the result takes the exponent of DBL1.  */
	.balign 4
.Ldbl1_gt:
	brhs r6,r9,.Lret_dbl1 ; inf or NaN
	neg r8,r12		/* r8 = positive shift count */
	brhs r8,32,.Large_shift_dbl0
.Lsmall_shift_dbl0:
	/* The operand being shifted is DBL0, so the denormal test has to
	   look at the exponent field of DBL0.  The previous test of r6
	   (exponent of DBL1, non-zero here) could never succeed, so a
	   denormal DBL0 was given an implicit 1 it does not have and was
	   shifted by one bit too many.  r5 is free at this point.  */
	lsr r5,r4,20		/* r5 = exponent field of DBL0 */
	breq.d r5,0,.Ldenorm_small_shift_dbl0
	bmsk_s DBL0H,DBL0H,19	/* (delay slot) keep only the fraction bits */
	bset_s DBL0H,DBL0H,20	/* make the leading 1 explicit */
.Lfixed_denorm_small_shift_dbl0:
	/* 64-bit right shift of DBL0H:DBL0L by r8; the bits shifted out
	   are collected in r12 as guard bits.  r4 and r5 are scratch.
	   NOTE(review): asl by the negative count in r12 relies on the
	   shifter using only the low bits of the count -- confirm.  */
	asl r4,DBL0H,r12	/* high-word bits that move into the low word */
	lsr DBL0H,DBL0H,r8
	lsr r5,DBL0L,r8
	asl r12,DBL0L,r12	/* guard bits */
	brge.d r10,0,.Ladd_dbl1_gt	/* same sign: add */
	or DBL0L,r4,r5		/* (delay slot) */
	/* Signs differ: falls through into .Lrsub.  */
/* subtract, abs(DBL0) < abs(DBL1) */
/* DBL0H, DBL0L: fraction with explicit leading 1, shifted into place
   DBL1H, DBL1L: original values
   r6:  orig. DBL1H & 0x7ff00000
   r9:  0x7ff00000
   r12: guard bits */
	.balign 4
.Lrsub:
	neg.f r12,r12		/* 0 - guard bits; the borrow feeds the sbc.f below */
	bmsk r7,DBL1H,19	/* r7 = fraction bits of DBL1H */
	mov_s r5,DBL0H
	sbc.f DBL0L,DBL1L,DBL0L
	bic DBL1H,DBL1H,r7	/* DBL1H = sign and exponent of DBL1 */
	bset r7,r7,20		/* make the leading 1 of DBL1 explicit */
	sbc.f r4,r7,r5		/* r4:DBL0L:r12 = difference */
	beq_l .Large_cancel_sub	/* high word cancelled completely */
	norm DBL1L,r4
	b_l .Lsub_done ; note: r6 is already set up.

/* Result is DBL1: copy it into the return registers.  */
.Lret_dbl1:
	mov_s DBL0H,DBL1H
	j_s.d [blink]
	mov_l DBL0L,DBL1L	/* (delay slot) */
	.balign 4
/* Small shift of a DBL0 identified as denormal by .Lsmall_shift_dbl0:
   no implicit 1 is set (DBL0H already holds only fraction bits) and the
   shift count is reduced by one.  In: r8 = shift count, r12 = -r8.  */
.Ldenorm_small_shift_dbl0:
	sub.f r8,r8,1
	bne.d .Lfixed_denorm_small_shift_dbl0
	add_s r12,r12,1		/* (delay slot) keep r12 == -r8 */
	/* The shift count is now zero, so r12 doubles as an all-zero
	   guard word.  */
	brlt r10,0,.Lrsub
/* Same-sign addition, DBL1 has the larger exponent.
   In: DBL1H:DBL1L = original DBL1, DBL0H:DBL0L = aligned fraction of DBL0,
       r12 = guard bits, r9 = 0x7ff00000.  */
.Ladd_dbl1_gt: ; bit 20 of DBL0H is clear and bit 0 of r12 does not matter
	add.f DBL0L,DBL0L,DBL1L
	add_s DBL0H,DBL0H,DBL1H
	add.cs DBL0H,DBL0H,1	/* DBL0H = high word of the sum */
	xor DBL1H,DBL0H,DBL1H
	bbit0 DBL1H,20,.Lno_shiftdown_dbl1_gt	/* lowest exponent bit unchanged */
	/* The fraction carried into the exponent: shift the fraction right
	   by one bit and round.  */
	lsr.f DBL1H,DBL0H
	and r4,DBL0L,2
	bmsk DBL1H,DBL1H,18
	sbc DBL0H,DBL0H,DBL1H
	rrc.f DBL0L,DBL0L
	or.f r12,r12,r4
	cmp.eq r12,r12
	add.cs.f DBL0L,DBL0L,1
	bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
	jne.d [blink]    ; ... non-zero fraction
	add.cs DBL0H,DBL0H,1
	/* Exponent field is all ones: clear the fraction.  */
	mov_s DBL0L,0
	bmsk DBL1H,DBL0H,19
	j_s.d [blink]
	bic_s DBL0H,DBL0H,DBL1H	/* (delay slot) */
.Lno_shiftdown_dbl1_gt:
	/* NOTE(review): presumably round-to-nearest-even on the guard word:
	   carry = top guard bit, cleared again by the cmp.eq when the
	   lower guard bits and the result LSB are all zero -- confirm.  */
	add.f 0,r12,r12
	btst.eq DBL0L,0
	cmp.eq r12,r12
	add.cs.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.cs DBL0H,DBL0H,1	/* (delay slot) */

	.balign 4
/* DBL1 has the larger exponent and the exponent difference is >= 32.
   In: r8 = shift count, r12 = -r8, r4 = DBL0H without the sign,
       r6 = exponent field of DBL1 (non-zero on this path),
       r10 = sign difference.
   Mirrors .Large_shift with the roles of DBL0 and DBL1 exchanged.  */
.Large_shift_dbl0:
	brhs r8,55,.Lret_dbl1	/* difference >= 55: DBL1 is the result */
	/* The operand being shifted is DBL0, so the denormal test has to
	   look at the exponent field of DBL0; the previous test of r6 could
	   never detect a denormal here.  r5 is free at this point.  */
	lsr r5,r4,20		/* r5 = exponent field of DBL0 */
	bmsk_s DBL0H,DBL0H,19	/* keep only the fraction bits of DBL0H */
	brne r5,0,.Lno_denorm_large_shift_dbl0
	/* DBL0 is denormal: no implicit 1 is set and the shift count is
	   reduced by one (r8 and r12 are kept in step).  */
	add_s r12,r12,1
	brhi.d r8,33,.Lfixed_denorm_large_shift_dbl0
	sub r8,r8,1		/* (delay slot) */
	/* An effective shift of 31 belongs to the small-shift code; an
	   effective shift of 32 falls through without an implicit 1.  */
	breq r8,31,.Lfixed_denorm_small_shift_dbl0
.Lshift32_dbl0:
	/* Shift by exactly 32: the old low word becomes the guard word.  */
	mov_s r12,DBL0L
	mov_s DBL0L,DBL0H
	brlt.d r10,0,.Lrsub	/* signs differ: subtract */
	mov_s DBL0H,0		/* (delay slot) */
	b_s .Ladd_dbl1_gt

	.balign 4
/* Large shift of a DBL0 whose exponent field is non-zero.  */
.Lno_denorm_large_shift_dbl0:
	breq.d r8,32,.Lshift32_dbl0
	bset_l DBL0H,DBL0H,20	/* (delay slot) make the leading 1 explicit */
.Lfixed_denorm_large_shift_dbl0:
	/* Shift DBL0H:DBL0L right by r8 (more than 32): the old high word
	   supplies the new low word, and everything below it is condensed
	   into the guard word r12.
	   NOTE(review): relies on the shifter using only the low bits of
	   the count -- confirm against the ISA.  */
	asl r4,DBL0H,r12
	lsr r5,DBL0L,r8
	asl.f 0,DBL0L,r12	/* Z clear iff low-word bits fall below the guard word */
	lsr DBL0L,DBL0H,r8
	or r12,r4,r5
	tst.eq r12,1
	or.ne r12,r12,2		/* fold the lost bits / bit 0 into bit 1 */
	brlt.d r10,0,.Lrsub	/* signs differ: subtract */
	mov_s DBL0H,0		/* (delay slot) */
	b_l .Ladd_dbl1_gt
	ENDFUNC(__adddf3)
	ENDFUNC(__subdf3)
Commit	Line	Data
a945c346	1	/* Copyright (C) 2008-2024 Free Software Foundation, Inc.
d38a64b4 JR	2	Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
	3	on behalf of Synopsys Inc.
	4
	5	This file is part of GCC.
	6
	7	GCC is free software; you can redistribute it and/or modify it under
	8	the terms of the GNU General Public License as published by the Free
	9	Software Foundation; either version 3, or (at your option) any later
	10	version.
	11
	12	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	13	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	14	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	15	for more details.
	16
	17	Under Section 7 of GPL version 3, you are granted additional
	18	permissions described in the GCC Runtime Library Exception, version
	19	3.1, as published by the Free Software Foundation.
	20
	21	You should have received a copy of the GNU General Public License and
	22	a copy of the GCC Runtime Library Exception along with this program;
	23	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	24	<http://www.gnu.org/licenses/>. */
	25
	26	#include "arc-ieee-754.h"
	27	#if 0 /* DEBUG */
	28	.global __adddf3
	29	.balign 4
	30	__adddf3:
	31	push_s blink
	32	push_s r2
	33	push_s r3
	34	push_s r0
	35	bl.d __adddf3_c
	36	push_s r1
	37	ld_s r2,[sp,12]
	38	ld_s r3,[sp,8]
	39	st_s r0,[sp,12]
	40	st_s r1,[sp,8]
	41	pop_s r1
	42	bl.d __adddf3_asm
	43	pop_s r0
	44	pop_s r3
	45	pop_s r2
	46	pop_s blink
	47	cmp r0,r2
	48	cmp.eq r1,r3
	49	jeq_s [blink]
	50	bl abort
	51	.global __subdf3
	52	.balign 4
	53	__subdf3:
	54	push_s blink
	55	push_s r2
	56	push_s r3
	57	push_s r0
	58	bl.d __subdf3_c
	59	push_s r1
	60	ld_s r2,[sp,12]
	61	ld_s r3,[sp,8]
	62	st_s r0,[sp,12]
	63	st_s r1,[sp,8]
	64	pop_s r1
	65	bl.d __subdf3_asm
66	pop_s r0
67	pop_s r3
68	pop_s r2
69	pop_s blink
70	cmp r0,r2
71	cmp.eq r1,r3
72	jeq_s [blink]
73	bl abort
74	#define __adddf3 __adddf3_asm
75	#define __subdf3 __subdf3_asm
76	#endif /* DEBUG */
77	/* N.B. This is optimized for ARC700.
78	ARC600 has very different scheduling / instruction selection criteria. */
79
80	/* inputs: DBL0, DBL1 (r0-r3)
81	output: DBL0 (r0, r1)
82	clobber: r2-r10, r12, flags
83	All NaN highword bits must be 1. NaN low word is random. */
84
85	.balign 4
86	.global __adddf3
87	.global __subdf3
88	.long 0x7ff00000 ; exponent mask
89	FUNC(__adddf3)
90	FUNC(__subdf3)
91	__subdf3:
92	bxor_l DBL1H,DBL1H,31
93	__adddf3:
94	ld r9,[pcl,-8]
95	bmsk r4,DBL0H,30
96	xor r10,DBL0H,DBL1H
97	and r6,DBL1H,r9
98	sub.f r12,r4,r6
99	asr_s r12,r12,20
100	blo .Ldbl1_gt
101	brhs r4,r9,.Linf_nan
102	brhs r12,32,.Large_shift
103	brne r12,0,.Lsmall_shift
104	brge r10,0,.Ladd_same_exp ; r12 == 0
105
106	/* After subtracting, we need to normalize; when shifting to place the
107	leading 1 into position for the implicit 1 and adding that to DBL0H,
108	we increment the exponent. Thus, we have to subtract one more than
109	the shift count from the exponent beforehand. Iff the exponent drops thus
110	below zero (before adding in the fraction with the leading one), we have
	111	generated a denormal number. Denormal handling is basically reducing the
112	shift count so that we produce a zero exponent instead; however, this way
113	the shift count can become zero (if we started out with exponent 1).
114	Therefore, a simple min operation is not good enough, since we don't
115	want to handle a zero normalizing shift in the main path.
116	On the plus side, we don't need to check for denorm input, the result
	117	of subtracting these looks just the same as denormals generated during
118	subtraction. */
119	bmsk r7,DBL1H,30
120	cmp r4,r7
121	cmp.eq DBL0L,DBL1L
122	blo .L_rsub_same_exp
123	sub.f DBL0L,DBL0L,DBL1L
124	bmsk r12,DBL0H,19
125	bic DBL1H,DBL0H,r12
126	sbc.f r4,r4,r7
127	beq_l .Large_cancel
128	norm DBL1L,r4
129	b.d .Lsub_done_same_exp
130	sub r12,DBL1L,9
131
132	.balign 4
133	.Linf_nan:
134	; If both inputs are inf, but with different signs, the result is NaN.
135	asr r12,r10,31
136	or_s DBL1H,DBL1H,r12
137	j_s.d [blink]
138	or.eq DBL0H,DBL0H,DBL1H
139
140	.balign 4
141	.L_rsub_same_exp:
142	rsub.f DBL0L,DBL0L,DBL1L
143	bmsk r12,DBL1H,19
144	bic_s DBL1H,DBL1H,r12
145	sbc.f r4,r7,r4
146	beq_l .Large_cancel
147	norm DBL1L,r4
148
149	sub r12,DBL1L,9
150	.Lsub_done_same_exp:
151	asl_s r12,r12,20
152	sub_s DBL1L,DBL1L,10
153	sub DBL0H,DBL1H,r12
154	xor.f 0,DBL0H,DBL1H
155	bmi .Ldenorm
156	.Lpast_denorm:
157	neg_s r12,DBL1L
158	lsr r7,DBL0L,r12
159	asl r12,r4,DBL1L
160	asl_s DBL0L,DBL0L,DBL1L
161	add_s r12,r12,r7
162	j_s.d [blink]
163	add_l DBL0H,DBL0H,r12
164	.balign 4
165	.Ladd_same_exp:
166	/* This is a special case because we can't test for need to shift
167	down by checking if bit 20 of DBL0H changes. OTOH, here we know
168	that we always need to shift down. */
169	; The implicit 1 of DBL0 is not shifted together with the
170	; fraction, thus effectively doubled, compensating for not setting
171	; implicit1 for DBL1
172	add_s r12,DBL0L,DBL1L
173	lsr.f 0,r12,2 ; round to even
174	breq r6,0,.Ldenorm_add
175	adc.f DBL0L,DBL0L,DBL1L
176	sub r7,DBL1H,DBL0H
177	sub1 r7,r7,r9 ; boost exponent by 2/2
178	rrc DBL0L,DBL0L
179	asr.f r7,r7 ; DBL1.fraction/2 - DBL0.fraction/2 ; exp++
180	add.cs.f DBL0L,DBL0L,0x80000000
181	add_l DBL0H,DBL0H,r7 ; DBL0.implicit1 not shifted for DBL1.implicit1
182	add.cs DBL0H,DBL0H,1
183	bic.f 0,r9,DBL0H ; check for overflow -> infinity.
184	jne_l [blink]
185	and DBL0H,DBL0H,0xfff00000
186	j_s.d [blink]
187	mov_s DBL0L,0
188	.balign 4
189	.Large_shift:
190	brhs r12,55,.Lret_dbl0
191	bmsk_s DBL1H,DBL1H,19
192	brne r6,0,.Lno_denorm_large_shift
193	brhi.d r12,33,.Lfixed_denorm_large_shift
194	sub_s r12,r12,1
195	breq r12,31, .Lfixed_denorm_small_shift
196	.Lshift32:
197	mov_s r12,DBL1L
198	mov_s DBL1L,DBL1H
199	brlt.d r10,0,.Lsub
200	mov_s DBL1H,0
201	b_s .Ladd
202	.Ldenorm_add:
203	cmp_s r12,DBL1L
204	mov_s DBL0L,r12
205	j_s.d [blink]
206	adc DBL0H,r4,DBL1H
207
208	.Lret_dbl0:
209	j_s [blink]
210	.balign 4
211	.Lsmall_shift:
212	breq.d r6,0,.Ldenorm_small_shift
213	bmsk_s DBL1H,DBL1H,19
214	bset_s DBL1H,DBL1H,20
215	.Lfixed_denorm_small_shift:
216	neg r8,r12
217	asl r4,DBL1H,r8
218	lsr_l DBL1H,DBL1H,r12
219	lsr r5,DBL1L,r12
220	asl r12,DBL1L,r8
221	brge.d r10,0,.Ladd
222	or DBL1L,r4,r5
223	/* subtract, abs(DBL0) > abs(DBL1) */
224	/* DBL0H, DBL0L: original values
225	DBL1H, DBL1L: fraction with explicit leading 1, shifted into place
226	r4: orig. DBL0H & 0x7fffffff
227	r6: orig. DBL1H & 0x7ff00000
228	r9: 0x7ff00000
229	r10: orig. DBL0H ^ DBL1H
230	r12: guard bits */
231	.balign 4
232	.Lsub:
233	neg.f r12,r12
234	mov_s r7,DBL1H
235	bmsk r5,DBL0H,19
236	sbc.f DBL0L,DBL0L,DBL1L
237	bic DBL1H,DBL0H,r5
238	bset r5,r5,20
239	sbc.f r4,r5,r7
240	beq_l .Large_cancel_sub
241	norm DBL1L,r4
242	bmsk r6,DBL1H,30
243	.Lsub_done:
244	sub_s DBL1L,DBL1L,9
245	breq DBL1L,1,.Lsub_done_noshift
246	asl r5,DBL1L,20
247	sub_s DBL1L,DBL1L,1
248	brlo r6,r5,.Ldenorm_sub
249	sub DBL0H,DBL1H,r5
250	.Lpast_denorm_sub:
251	neg_s DBL1H,DBL1L
252	lsr r6,r12,DBL1H
253	asl_s r12,r12,DBL1L
254	and r8,r6,1
255	add1.f 0,r8,r12
256	add.ne.f r12,r12,r12
257	asl r8,DBL0L,DBL1L
258	lsr r12,DBL0L,DBL1H
259	adc.f DBL0L,r8,r6
260	asl r5,r4,DBL1L
261	add_s DBL0H,DBL0H,r12
262	j_s.d [blink]
263	adc DBL0H,DBL0H,r5
264
265	.balign 4
266	.Lno_denorm_large_shift:
267	breq.d r12,32,.Lshift32
268	bset_l DBL1H,DBL1H,20
269	.Lfixed_denorm_large_shift:
270	neg r8,r12
271	asl r4,DBL1H,r8
272	lsr r5,DBL1L,r12
273	asl.f 0,DBL1L,r8
274	lsr DBL1L,DBL1H,r12
275	or r12,r4,r5
276	tst.eq r12,1
277	or.ne r12,r12,2
278	brlt.d r10,0,.Lsub
279	mov_s DBL1H,0
280	b_l .Ladd
281
282	; If a denorm is produced without shifting, we have an exact result -
283	; no need for rounding.
284	.balign 4
285	.Ldenorm_sub:
286	lsr DBL1L,r6,20
287	xor DBL0H,r6,DBL1H
288	brne.d DBL1L,1,.Lpast_denorm_sub
289	sub_s DBL1L,DBL1L,1
290	.Lsub_done_noshift:
291	add.f 0,r12,r12
292	btst.eq DBL0L,0
293	cmp.eq r12,r12
294	add.cs.f DBL0L,DBL0L,1
295	bclr r4,r4,20
296	j_s.d [blink]
297	adc DBL0H,DBL1H,r4
298
299	.balign 4
300	.Ldenorm_small_shift:
301	brne.d r12,1,.Lfixed_denorm_small_shift
302	sub_l r12,r12,1
303	brlt r10,0,.Lsub
304	.Ladd: ; bit 20 of DBL1H is clear and bit 0 of r12 does not matter
305	add.f DBL0L,DBL0L,DBL1L
306	add_s DBL1H,DBL1H,DBL0H
307	add.cs DBL1H,DBL1H,1
308	xor_l DBL0H,DBL0H,DBL1H
309	bbit0 DBL0H,20,.Lno_shiftdown
310	lsr.f DBL0H,DBL1H
311	and r4,DBL0L,2
312	bmsk DBL0H,DBL0H,18
313	sbc DBL0H,DBL1H,DBL0H
314	rrc.f DBL0L,DBL0L
315	or.f r12,r12,r4
316	cmp.eq r12,r12
317	add.cs.f DBL0L,DBL0L,1
318	bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
319	jne.d [blink] ; ... non-zero fraction
320	add.cs DBL0H,DBL0H,1
321	mov_s DBL0L,0
322	bmsk DBL1H,DBL0H,19
323	j_s.d [blink]
324	bic_s DBL0H,DBL0H,DBL1H
325	.Lno_shiftdown:
326	mov_s DBL0H,DBL1H
327	add.f 0,r12,r12
328	btst.eq DBL0L,0
329	cmp.eq r12,r12
330	add.cs.f DBL0L,DBL0L,1
331	j_s.d [blink]
332	add.cs DBL0H,DBL0H,1
333	.balign 4
334	.Ldenorm:
335	bmsk DBL0H,DBL1H,30
336	lsr r12,DBL0H,20
337	xor_s DBL0H,DBL0H,DBL1H
338	sub_l DBL1L,r12,1
339	bgt .Lpast_denorm
340	j_s.d [blink]
341	add_l DBL0H,DBL0H,r4
342
343	.balign 4
344	.Large_cancel:
345	;DBL0L: mantissa DBL1H: sign & exponent
346	norm.f DBL1L,DBL0L
347	bmsk DBL0H,DBL1H,30
348	add_s DBL1L,DBL1L,22
349	mov.mi DBL1L,21
350	add_s r12,DBL1L,1
351	asl_s r12,r12,20
352	beq_s .Lret0
353	brhs.d DBL0H,r12,.Lpast_denorm_large_cancel
354	sub DBL0H,DBL1H,r12
355	bmsk DBL0H,DBL1H,30
356	lsr r12,DBL0H,20
357	xor_s DBL0H,DBL0H,DBL1H
358	sub.f DBL1L,r12,1
359	jle [blink]
360	.Lpast_denorm_large_cancel:
361	rsub.f r7,DBL1L,32
362	lsr r7,DBL0L,r7
363	asl_s DBL0L,DBL0L,DBL1L
364	mov.ls r7,DBL0L
365	add_s DBL0H,DBL0H,r7
366	j_s.d [blink]
367	mov.ls DBL0L,0
368	.Lret0:
369	j_s.d [blink]
370	mov_l DBL0H,0
371
372	/* r4:DBL0L:r12 : unnormalized result fraction
373	DBL1H: result sign and exponent */
374	/* When seeing large cancellation, only the topmost guard bit might be set. */
375	.balign 4
376	.Large_cancel_sub:
377	norm.f DBL1L,DBL0L
378	bpnz.d 0f
379	bmsk DBL0H,DBL1H,30
380	mov r5,22<<20
381	bne.d 1f
382	mov_s DBL1L,21
383	bset r5,r5,5+20
384	add_s DBL1L,DBL1L,32
385	brne r12,0,1f
386	j_s.d [blink]
387	mov_l DBL0H,0
388	.balign 4
389	0: add r5,DBL1L,23
390	asl r5,r5,20
391	add_s DBL1L,DBL1L,22
392	1: brlo DBL0H,r5,.Ldenorm_large_cancel_sub
393	sub DBL0H,DBL1H,r5
394	.Lpast_denorm_large_cancel_sub:
395	rsub.f r7,DBL1L,32
396	lsr r12,r12,r7
397	lsr r7,DBL0L,r7
398	asl_s DBL0L,DBL0L,DBL1L
399	add.ge DBL0H,DBL0H,r7
400	add_s DBL0L,DBL0L,r12
401	add.lt DBL0H,DBL0H,DBL0L
402	mov.eq DBL0L,r12
403	j_s.d [blink]
404	mov.lt DBL0L,0
405	.balign 4
406	.Ldenorm_large_cancel_sub:
407	lsr r5,DBL0H,20
408	xor_s DBL0H,DBL0H,DBL1H
409	brgt.d r5,1,.Lpast_denorm_large_cancel_sub
410	sub DBL1L,r5,1
411	j_l [blink] ; denorm, no shift -> no rounding needed.
412
413	/* r4: DBL0H & 0x7fffffff
414	r6: DBL1H & 0x7ff00000
415	r9: 0x7ff00000
416	r10: sign difference
417	r12: shift count (negative) */
418	.balign 4
419	.Ldbl1_gt:
420	brhs r6,r9,.Lret_dbl1 ; inf or NaN
421	neg r8,r12
422	brhs r8,32,.Large_shift_dbl0
423	.Lsmall_shift_dbl0:
424	breq.d r6,0,.Ldenorm_small_shift_dbl0
425	bmsk_s DBL0H,DBL0H,19
426	bset_s DBL0H,DBL0H,20
427	.Lfixed_denorm_small_shift_dbl0:
428	asl r4,DBL0H,r12
429	lsr DBL0H,DBL0H,r8
430	lsr r5,DBL0L,r8
431	asl r12,DBL0L,r12
432	brge.d r10,0,.Ladd_dbl1_gt
433	or DBL0L,r4,r5
434	/* subtract, abs(DBL0) < abs(DBL1) */
435	/* DBL0H, DBL0L: fraction with explicit leading 1, shifted into place
436	DBL1H, DBL1L: original values
437	r6: orig. DBL1H & 0x7ff00000
438	r9: 0x7ff00000
439	r12: guard bits */
440	.balign 4
441	.Lrsub:
442	neg.f r12,r12
443	bmsk r7,DBL1H,19
444	mov_s r5,DBL0H
445	sbc.f DBL0L,DBL1L,DBL0L
446	bic DBL1H,DBL1H,r7
447	bset r7,r7,20
448	sbc.f r4,r7,r5
449	beq_l .Large_cancel_sub
450	norm DBL1L,r4
451	b_l .Lsub_done ; note: r6 is already set up.
452
453	.Lret_dbl1:
454	mov_s DBL0H,DBL1H
455	j_s.d [blink]
456	mov_l DBL0L,DBL1L
457	.balign 4
458	.Ldenorm_small_shift_dbl0:
459	sub.f r8,r8,1
460	bne.d .Lfixed_denorm_small_shift_dbl0
461	add_s r12,r12,1
462	brlt r10,0,.Lrsub
463	.Ladd_dbl1_gt: ; bit 20 of DBL0H is clear and bit 0 of r12 does not matter
464	add.f DBL0L,DBL0L,DBL1L
465	add_s DBL0H,DBL0H,DBL1H
466	add.cs DBL0H,DBL0H,1
467	xor DBL1H,DBL0H,DBL1H
468	bbit0 DBL1H,20,.Lno_shiftdown_dbl1_gt
469	lsr.f DBL1H,DBL0H
470	and r4,DBL0L,2
471	bmsk DBL1H,DBL1H,18
472	sbc DBL0H,DBL0H,DBL1H
473	rrc.f DBL0L,DBL0L
474	or.f r12,r12,r4
475	cmp.eq r12,r12
476	add.cs.f DBL0L,DBL0L,1
477	bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
478	jne.d [blink] ; ... non-zero fraction
479	add.cs DBL0H,DBL0H,1
480	mov_s DBL0L,0
481	bmsk DBL1H,DBL0H,19
482	j_s.d [blink]
483	bic_s DBL0H,DBL0H,DBL1H
484	.Lno_shiftdown_dbl1_gt:
485	add.f 0,r12,r12
486	btst.eq DBL0L,0
487	cmp.eq r12,r12
488	add.cs.f DBL0L,DBL0L,1
489	j_s.d [blink]
490	add.cs DBL0H,DBL0H,1
491
492	.balign 4
493	.Large_shift_dbl0:
494	brhs r8,55,.Lret_dbl1
495	bmsk_s DBL0H,DBL0H,19
496	brne r6,0,.Lno_denorm_large_shift_dbl0
497	add_s r12,r12,1
498	brne.d r8,33,.Lfixed_denorm_large_shift_dbl0
499	sub r8,r8,1
500	bset_s DBL0H,DBL0H,20
501	.Lshift32_dbl0:
502	mov_s r12,DBL0L
503	mov_s DBL0L,DBL0H
504	brlt.d r10,0,.Lrsub
505	mov_s DBL0H,0
506	b_s .Ladd_dbl1_gt
507
508	.balign 4
509	.Lno_denorm_large_shift_dbl0:
510	breq.d r8,32,.Lshift32_dbl0
511	bset_l DBL0H,DBL0H,20
512	.Lfixed_denorm_large_shift_dbl0:
513	asl r4,DBL0H,r12
514	lsr r5,DBL0L,r8
515	asl.f 0,DBL0L,r12
516	lsr DBL0L,DBL0H,r8
517	or r12,r4,r5
518	tst.eq r12,1
519	or.ne r12,r12,2
520	brlt.d r10,0,.Lrsub
521	mov_s DBL0H,0
522	b_l .Ladd_dbl1_gt
523	ENDFUNC(__adddf3)
524	ENDFUNC(__subdf3)