1 /* libgcc functions for Blackfin.
2 Copyright (C) 2005, 2009 Free Software Foundation, Inc.
3 Contributed by Analog Devices.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option) any later version.
12 GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
30 .type ___divsi3, STT_FUNC;
61 .type ___modsi3, STT_FUNC;
/* ___udivsi3: per the comments below, a shift-based loop (33 shifts in
   total) that leaves the result in R0 and the remainder in R3.
   NOTE(review): the name suggests unsigned 32-bit division; confirm the
   operand registers against the Blackfin calling convention. */
79 .type ___udivsi3, STT_FUNC;
/* Arm hardware loop 0: the loop body runs from the next local label 0:
   to the next local label 1: (forward references), and the iteration
   count in LC0 is taken from P0. */
83 LSETUP (0f, 1f) LC0 = P0;
84 /* upper half of dividend */
87 /* The first time round in the loop we shift in garbage, but since we
88 perform 33 shifts, it doesn't matter. */
94 /* Last instruction of the loop. */
97 /* Shift in the last bit. */
99 /* R0 is the result, R3 contains the remainder. */
107 .type ___umodsi3, STT_FUNC;
117 #ifdef L_umulsi3_highpart
/* ___umulsi3_highpart: multiplies R0 by R1 as a sum of 16x16 partial
   products, every one of them taken in unsigned (FU) mode.
   NOTE(review): by its name this should yield the upper 32 bits of the
   unsigned 64-bit product -- confirm against the callers. */
119 .global ___umulsi3_highpart;
120 .type ___umulsi3_highpart, STT_FUNC;
/* A1 = low(R1) * low(R0). */
123 A1 = R1.L * R0.L (FU);
/* Dual MAC: A0 = high(R1) * high(R0), while A1 accumulates the first
   cross product, low(R1) * high(R0). */
125 A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
/* Second cross product, low(R0) * high(R1), also summed into A1. */
126 A1 += R0.L * R1.H (FU);
133 #ifdef L_smulsi3_highpart
/* ___smulsi3_highpart: multiplies R0 by R1 as a sum of 16x16 partial
   products, treating the high halves as signed and the low halves as
   unsigned.
   NOTE(review): by its name this should yield the upper 32 bits of the
   signed 64-bit product -- confirm against the callers. */
135 .global ___smulsi3_highpart;
136 .type ___smulsi3_highpart, STT_FUNC;
/* A1 = low(R1) * low(R0); both low halves are plain magnitudes, hence
   the unsigned (FU) mode. */
139 A1 = R1.L * R0.L (FU);
/* Dual MAC in signed-integer mode (IS): A0 = high(R0) * high(R1).
   The M (mixed) option applies to the A1 product, making it
   signed high(R0) times unsigned low(R1). */
141 A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
/* Other cross product, signed high(R1) times unsigned low(R0), summed
   into A1 under the same mixed-mode rule. */
142 A1 += R1.H * R0.L (IS,M);
/* ___muldi3: 64-bit multiply of R1:R0 by R3:R2, assembled from 16-bit
   multiply-accumulates according to the E1..E4 decomposition derived in
   the comment that follows.  R4 is used as a temporary: it is stored to
   [SP] near the start and reloaded from [SP] by the final instruction. */
151 .type ___muldi3, STT_FUNC;
155 = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
156 [X] = (R1.h * R3.h) * 2^96
157 [X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
158 [X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
159 [T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
160 [T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
161 [T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
164 We can discard the first three lines marked "X" since we produce
165 only a 64 bit result. So, we need ten 16-bit multiplies.
167 Individual mul-acc results:
168 [E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
169 [E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
170 [E3] = R0.l * R2.h + R2.l * R0.h
173 We also need to add high parts from lower-level results to higher ones:
174 E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
176 One interesting property is that all parts of the result that depend
177 on the sign of the multiplication are discarded. Those would be the
178 multiplications involving R1.h and R3.h, but only the top 16 bit of
179 the 32 bit result depend on the sign, and since R1.h and R3.h only
180 occur in E1, the top half of these results is cut off.
181 So, we can just use FU mode for all of the 16-bit multiplies, and
182 ignore questions of when to use mixed mode. */
185 /* [SP] technically is part of the caller's frame, but we can
186 use it as scratch space. */
/* First two E1 terms: R1.l * R2.h goes to A0 and R1.h * R2.l to A1.
   The parallel slot loads R3 from [SP + 12].
   NOTE(review): presumably the high word of the second operand, passed
   on the stack -- confirm against the Blackfin calling convention. */
187 A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */
/* Remaining E1 terms: R3.h * R0.l into A0 and R3.l * R0.h into A1.
   The parallel slot parks R4 in the scratch word at [SP]. */
188 A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */
/* Start E2 in A0 with R0.l * R3.l, then add R1.l * R2.l.  The third E2
   term, R0.h * R2.h, is folded in by the dual MAC further down. */
191 A0 = R0.l * R3.l (FU); /* E2 */
192 A0 += R2.l * R1.l (FU); /* E2 */
/* E4 = R0.l * R2.l, the lowest partial product. */
194 A1 = R2.L * R0.L (FU); /* E4 */
/* Drop the low 16 bits of E4; what remains is the carry into E3c. */
196 A1 = A1 >> 16; /* E3c */
/* A0 completes E2 with R0.h * R2.h; A1 adds the first E3 term,
   R2.l * R0.h, on top of that carry. */
197 A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */
/* Second E3 term, R0.l * R2.h. */
198 A1 += R0.L * R2.H (FU); /* E3c */
/* Shift E3c down in turn; what remains is the carry into E2c. */
200 A1 = A1 >> 16; /* E2c */
204 /* low(result) = low(E3c):low(E4) */
/* PACK places R0.l in the upper half and R3.l in the lower half of R0.
   NOTE(review): this relies on R0 and R3 holding E3c and E4 here;
   verify where those values are copied out of A1. */
205 R0 = PACK (R0.l, R3.l);
206 /* high(result) = E2c + (E1 << 16) */
/* 16-bit add, without saturation (NS), of R4.l into R1.h.  The parallel
   slot reloads R4 from the scratch word at [SP]. */
207 R1.h = R1.h + R4.l (NS) || R4 = [SP];
/* Symbol size = current location minus the start of ___muldi3. */
210 .size ___muldi3, .-___muldi3