]>
Commit | Line | Data |
---|---|---|
a1225f9a UD |
1 | # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add |
2 | # the result to a second limb vector. | |
3 | # | |
04277e02 | 4 | # Copyright (C) 2000-2019 Free Software Foundation, Inc. |
a1225f9a UD |
5 | # |
6 | # This file is part of the GNU MP Library. | |
7 | # | |
8 | # The GNU MP Library is free software; you can redistribute it and/or modify | |
9 | # it under the terms of the GNU Lesser General Public License as published | |
10 | # by the Free Software Foundation; either version 2.1 of the License, or (at | |
11 | # your option) any later version. | |
12 | # | |
13 | # The GNU MP Library is distributed in the hope that it will be useful, but | |
14 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
15 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
16 | # License for more details. | |
17 | # | |
18 | # You should have received a copy of the GNU Lesser General Public License | |
5a82c748 | 19 | # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>. |
a1225f9a UD |
20 | |
21 | # INPUT PARAMETERS | |
22 | # res_ptr $16 | |
23 | # s1_ptr $17 | |
24 | # size $18 | |
25 | # s2_limb $19 | |
26 | # | |
27 | # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and | |
28 | # exactly 3.625 cycles/limb on EV6... | |
29 | # | |
30 | # This code was written in close cooperation with ev6 pipeline expert | |
31 | # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though. | |
32 | # | |
33 | # Register usages for unrolled loop: | |
34 | # 0-3 mul's | |
35 | # 4-7 acc's | |
36 | # 8-15 mul results | |
37 | # 20,21 carry's | |
38 | # 22,23 save for stores | |
39 | # | |
40 | # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop. | |
41 | # | |
42 | # The stores can issue a cycle late so we have paired no-op's to 'catch' | |
43 | # them, so that further disturbance to the schedule is damped. | |
44 | # | |
45 | # We couldn't pair the loads, because the entangled schedule of the | |
46 | # carry's has to happen on one side {0} of the machine. Note, the total | |
47 | # use of U0, and the total use of L0 (after attending to the stores), | |
48 | # which is part of the reason why.... | |
49 | # | |
50 | # This is a great schedule for the d_cache, a poor schedule for the | |
51 | # b_cache. The lockup on U0 means that any stall can't be recovered | |
52 | # from. Consider a ldq in L1. Say that load gets stalled because it | |
53 | # collides with a fill from the b_Cache. On the next cycle, this load | |
54 | # gets priority. It first looks at L0, and goes there. The instruction | |
55 | # we intended for L0 gets to look at L1, which is NOT where we want | |
56 | # it. It either stalls 1, because it can't go in L0, or goes there, and | |
57 | # causes a further instruction to stall. | |
58 | # | |
59 | # So for b_cache, we're likely going to want to put one or more cycles | |
60 | # back into the code! And, of course, put in prefetches. For the | |
61 | # accumulator, lds, intent to modify. For the multiplier, you might | |
62 | # want ldq, evict next, if you're not wanting to use it again soon. Use | |
63 | # 256 ahead of present pointer value. At a place where we have an mt | |
64 | # followed by a bookkeeping, put the bookkeeping in upper, and the | |
65 | # prefetch into lower. | |
66 | # | |
67 | # Note, the usage of physical registers per cycle is smoothed off, as | |
68 | # much as possible. | |
69 | # | |
70 | # Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd | |
71 | # like not to have a ldq or stq precede a conditional branch in a | |
72 | # quadpack. The conditional branch moves the retire pointer one cycle | |
73 | # later. | |
74 | # | |
75 | # Optimization notes: | |
76 | # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? | |
77 | # Reserved regs: $29 $30 $31 | |
78 | # Free caller-saves regs in unrolled code: $24 $25 $28 | |
79 | # We should swap some of the callee-saves regs for some of the free | |
80 | # caller-saves regs, saving some overhead cycles. | |
81 | # Most importantly, we should write fast code for the 0-7 case. | |
82 | # The code we use there is for the 21164, and runs at 7 cycles/limb | |
83 | # on the 21264. Should not be hard, if we write specialized code for | |
84 | # 1-7 limbs (the one for 0 limbs should be straightforward). We then just | |
85 | # need a jump table indexed by the low 3 bits of the count argument. | |
86 | ||
87 | .set noreorder | |
88 | .set noat | |
89 | .text | |
90 | ||
# ---------------------------------------------------------------------------
# __mpn_addmul_1
#
# For each of the `size` limbs: multiply the limb read through s1_ptr by
# s2_limb, add the 128-bit product to the limb read through res_ptr, store the
# low 64 bits of the sum back through res_ptr, and carry the rest into the
# next limb.
#
# In:   $16 = res_ptr, $17 = s1_ptr, $18 = size, $19 = s2_limb
# Out:  $0  = carry limb left over after the last limb
#
# Structure:
#   size <  8 : one-limb-per-iteration loop ($Loop0), no stack frame.
#   size >= 8 : $Large saves $9-$15, runs the same simple loop on
#               (size mod 8) limbs ($Loop1), then processes the remaining
#               limbs in groups of 8 with the software-pipelined loop
#               (startup / $Loop / $Lend).
#
# NOTE(review): the first s1 limb is loaded before size is ever tested for
# zero, so the routine appears to assume size >= 1 -- confirm against callers.
# ---------------------------------------------------------------------------
91 | .globl __mpn_addmul_1 | |
92 | .ent __mpn_addmul_1 | |
93 | __mpn_addmul_1: | |
94 | .frame $30,0,$26,0 | |
95 | .prologue 0 | |
96 | ||
# Dispatch on size: $1 = (size < 8), unsigned.  $1 == 0 means size >= 8.
97 | cmpult $18, 8, $1 | |
98 | beq $1, $Large | |
99 | ||
# ---- Small case (size < 8) ----
# First limb: $3 = low product, $0 = high product, $5 = old *res_ptr.
# $18 becomes the number of limbs still to be fetched.
100 | ldq $2, 0($17) # $2 = s1_limb | |
101 | addq $17, 8, $17 # s1_ptr++ | |
102 | subq $18, 1, $18 # size-- | |
103 | mulq $2, $19, $3 # $3 = prod_low | |
104 | ldq $5, 0($16) # $5 = *res_ptr | |
105 | umulh $2, $19, $0 # $0 = prod_high | |
106 | beq $18, $Lend0b # jump if size was == 1 | |
# Fetch the second limb, then finish limb 0: store (*res_ptr + prod_low) and
# keep the carry out of that addition in $4.
107 | ldq $2, 0($17) # $2 = s1_limb | |
108 | addq $17, 8, $17 # s1_ptr++ | |
109 | subq $18, 1, $18 # size-- | |
110 | addq $5, $3, $3 | |
111 | cmpult $3, $5, $4 | |
112 | stq $3, 0($16) | |
113 | addq $16, 8, $16 # res_ptr++ | |
114 | beq $18, $Lend0a # jump if size was == 2 | |
115 | ||
116 | .align 3 | |
# Loop state at the top of each pass:
#   $2      = s1 limb to multiply in this pass (already loaded)
#   $0 + $4 = carry limb to add into this pass's result (summed first thing)
#   $18     = limbs remaining after the one in $2 (non-zero here)
# Each pass also loads the s1 limb for the following pass.
117 | $Loop0: mulq $2, $19, $3 # $3 = prod_low | |
118 | ldq $5, 0($16) # $5 = *res_ptr | |
119 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' | |
120 | subq $18, 1, $18 # size-- | |
121 | umulh $2, $19, $4 # $4 = cy_limb | |
122 | ldq $2, 0($17) # $2 = s1_limb | |
123 | addq $17, 8, $17 # s1_ptr++ | |
124 | addq $3, $0, $3 # $3 = cy_limb + prod_low | |
125 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) | |
126 | addq $5, $3, $3 | |
127 | cmpult $3, $5, $5 | |
128 | stq $3, 0($16) | |
129 | addq $16, 8, $16 # res_ptr++ | |
130 | addq $5, $0, $0 # combine carries | |
131 | bne $18, $Loop0 | |
# Last limb of the small case: same arithmetic as the loop body, but nothing
# further is loaded from s1 and the pointers are not advanced.  The final
# carry limb (high product + carries) is left in $0.
132 | $Lend0a: | |
133 | mulq $2, $19, $3 # $3 = prod_low | |
134 | ldq $5, 0($16) # $5 = *res_ptr | |
135 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' | |
136 | umulh $2, $19, $4 # $4 = cy_limb | |
137 | addq $3, $0, $3 # $3 = cy_limb + prod_low | |
138 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) | |
139 | addq $5, $3, $3 | |
140 | cmpult $3, $5, $5 | |
141 | stq $3, 0($16) | |
142 | addq $5, $0, $0 # combine carries | |
143 | addq $4, $0, $0 # cy_limb = prod_high + cy | |
144 | ret $31, ($26), 1 | |
# size == 1: store (*res_ptr + prod_low); $0 = prod_high + carry of that add.
145 | $Lend0b: | |
146 | addq $5, $3, $3 | |
147 | cmpult $3, $5, $5 | |
148 | stq $3, 0($16) | |
149 | addq $0, $5, $0 | |
150 | ret $31, ($26), 1 | |
151 | ||
# ---- Large case (size >= 8) ----
# Allocate a 240-byte stack frame and save $9-$15 in it; the unrolled code
# below overwrites $8-$15 with multiply results, and the header notes list
# $9-$15 as callee-saves registers.  Only offsets 8..56 of the frame are
# written here.
# NOTE(review): the .frame directive at the entry point declares a frame size
# of 0, which does not describe this 240-byte adjustment -- confirm whether
# that matters for unwind/debug information.
152 | $Large: | |
153 | lda $30, -240($30) | |
154 | stq $9, 8($30) | |
155 | stq $10, 16($30) | |
156 | stq $11, 24($30) | |
157 | stq $12, 32($30) | |
158 | stq $13, 40($30) | |
159 | stq $14, 48($30) | |
160 | stq $15, 56($30) | |
161 | ||
# Split the work: $20 = size mod 8 limbs for the simple loop below,
# $18 = size / 8 groups for the unrolled loop (at least 1, since size >= 8).
# $0 = 0 is the carry limb handed to the unrolled loop when $20 == 0.
162 | and $18, 7, $20 # count for the first loop, 0-7 | |
163 | srl $18, 3, $18 # count for unrolled loop | |
164 | bis $31, $31, $0 | |
165 | beq $20, $Lunroll | |
# Leftover limbs: same instruction sequence as the small case above, counting
# with $20 instead of $18.
166 | ldq $2, 0($17) # $2 = s1_limb | |
167 | addq $17, 8, $17 # s1_ptr++ | |
168 | subq $20, 1, $20 # size-- | |
169 | mulq $2, $19, $3 # $3 = prod_low | |
170 | ldq $5, 0($16) # $5 = *res_ptr | |
171 | umulh $2, $19, $0 # $0 = prod_high | |
172 | beq $20, $Lend1b # jump if size was == 1 | |
173 | ldq $2, 0($17) # $2 = s1_limb | |
174 | addq $17, 8, $17 # s1_ptr++ | |
175 | subq $20, 1, $20 # size-- | |
176 | addq $5, $3, $3 | |
177 | cmpult $3, $5, $4 | |
178 | stq $3, 0($16) | |
179 | addq $16, 8, $16 # res_ptr++ | |
180 | beq $20, $Lend1a # jump if size was == 2 | |
181 | ||
182 | .align 3 | |
# Same loop body and register roles as $Loop0, with $20 as the counter.
183 | $Loop1: mulq $2, $19, $3 # $3 = prod_low | |
184 | ldq $5, 0($16) # $5 = *res_ptr | |
185 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' | |
186 | subq $20, 1, $20 # size-- | |
187 | umulh $2, $19, $4 # $4 = cy_limb | |
188 | ldq $2, 0($17) # $2 = s1_limb | |
189 | addq $17, 8, $17 # s1_ptr++ | |
190 | addq $3, $0, $3 # $3 = cy_limb + prod_low | |
191 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) | |
192 | addq $5, $3, $3 | |
193 | cmpult $3, $5, $5 | |
194 | stq $3, 0($16) | |
195 | addq $16, 8, $16 # res_ptr++ | |
196 | addq $5, $0, $0 # combine carries | |
197 | bne $20, $Loop1 | |
198 | ||
# Last leftover limb.  Unlike $Lend0a, res_ptr is advanced past it, because
# the unrolled loop continues from res_ptr; the carry limb stays in $0.
199 | $Lend1a: | |
200 | mulq $2, $19, $3 # $3 = prod_low | |
201 | ldq $5, 0($16) # $5 = *res_ptr | |
202 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' | |
203 | umulh $2, $19, $4 # $4 = cy_limb | |
204 | addq $3, $0, $3 # $3 = cy_limb + prod_low | |
205 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) | |
206 | addq $5, $3, $3 | |
207 | cmpult $3, $5, $5 | |
208 | stq $3, 0($16) | |
209 | addq $16, 8, $16 # res_ptr++ | |
210 | addq $5, $0, $0 # combine carries | |
211 | addq $4, $0, $0 # cy_limb = prod_high + cy | |
212 | br $31, $Lunroll | |
# Exactly one leftover limb: finish it, advance res_ptr, and fall through
# into $Lunroll with the carry limb in $0.
213 | $Lend1b: | |
214 | addq $5, $3, $3 | |
215 | cmpult $3, $5, $5 | |
216 | stq $3, 0($16) | |
217 | addq $16, 8, $16 # res_ptr++ | |
218 | addq $0, $5, $0 | |
219 | ||
# ---- Unrolled loop entry ----
# Both pointers are biased by -16 so that the startup code's offsets
# 16/24/32/40 address limbs 0..3 of the first group.  The incoming carry limb
# moves from $0 to $12, because $0 is reused for s1 limbs below.
220 | $Lunroll: | |
221 | lda $17, -16($17) # L1 bookkeeping | |
222 | lda $16, -16($16) # L1 bookkeeping | |
223 | bis $0, $31, $12 | |
224 | ||
225 | # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ | |
226 | ||
# Startup for the first group of 8 limbs: issues all eight s1 loads and six of
# the eight res loads, starts the multiplies, and stores the first two result
# limbs.  Both pointers are advanced by 64 bytes part-way through, so later
# offsets are relative to the advanced pointers.  $18 is decremented once
# here; if no further group remains, the final `ble` skips $Loop and goes
# straight to the drain code at $Lend.
# The U0/U1/L0/L1 tags in the comments appear to name the intended EV6 issue
# slot for each instruction (see the scheduling notes in the file header).
227 | ldq $2, 16($17) # L1 | |
228 | ldq $3, 24($17) # L1 | |
229 | lda $18, -1($18) # L1 bookkeeping | |
230 | ldq $6, 16($16) # L1 | |
231 | ldq $7, 24($16) # L1 | |
232 | ldq $0, 32($17) # L1 | |
233 | mulq $19, $2, $13 # U1 | |
234 | ldq $1, 40($17) # L1 | |
235 | umulh $19, $2, $14 # U1 | |
236 | mulq $19, $3, $15 # U1 | |
237 | lda $17, 64($17) # L1 bookkeeping | |
238 | ldq $4, 32($16) # L1 | |
239 | ldq $5, 40($16) # L1 | |
240 | umulh $19, $3, $8 # U1 | |
241 | ldq $2, -16($17) # L1 | |
242 | mulq $19, $0, $9 # U1 | |
243 | ldq $3, -8($17) # L1 | |
244 | umulh $19, $0, $10 # U1 | |
245 | addq $6, $13, $6 # L0 lo + acc | |
246 | mulq $19, $1, $11 # U1 | |
247 | cmpult $6, $13, $20 # L0 lo add => carry | |
248 | lda $16, 64($16) # L1 bookkeeping | |
249 | addq $6, $12, $22 # U0 hi add => answer | |
250 | cmpult $22, $12, $21 # L0 hi add => carry | |
251 | addq $14, $20, $14 # U0 hi mul + carry | |
252 | ldq $6, -16($16) # L1 | |
253 | addq $7, $15, $23 # L0 lo + acc | |
254 | addq $14, $21, $14 # U0 hi mul + carry | |
255 | ldq $7, -8($16) # L1 | |
256 | umulh $19, $1, $12 # U1 | |
257 | cmpult $23, $15, $20 # L0 lo add => carry | |
258 | addq $23, $14, $23 # U0 hi add => answer | |
259 | ldq $0, 0($17) # L1 | |
260 | mulq $19, $2, $13 # U1 | |
261 | cmpult $23, $14, $21 # L0 hi add => carry | |
262 | addq $8, $20, $8 # U0 hi mul + carry | |
263 | ldq $1, 8($17) # L1 | |
264 | umulh $19, $2, $14 # U1 | |
265 | addq $4, $9, $4 # L0 lo + acc | |
266 | stq $22, -48($16) # L0 | |
267 | stq $23, -40($16) # L1 | |
268 | mulq $19, $3, $15 # U1 | |
269 | addq $8, $21, $8 # U0 hi mul + carry | |
270 | cmpult $4, $9, $20 # L0 lo add => carry | |
271 | addq $4, $8, $22 # U0 hi add => answer | |
272 | ble $18, $Lend # U1 bookkeeping | |
273 | ||
274 | # ____ MAIN UNROLLED LOOP ____ | |
275 | .align 4 | |
# Steady state.  Each pass performs eight s1 loads, eight res loads and eight
# result stores, advances $17 and $16 by 64 bytes each, and decrements $18;
# the loop repeats while $18 > 0.
# Register roles are those listed in the file header: $0-$3 s1 limbs,
# $4-$7 res limbs, $8-$15 products, $20/$21 carries, $22/$23 values to store.
# `bis $31, $31, $31` only writes the zero register and serves as a no-op;
# the `mt` / `st slosh` tags mark padding slots (see header notes on the
# paired no-ops that catch late stores).
276 | $Loop: | |
277 | bis $31, $31, $31 # U1 mt | |
278 | cmpult $22, $8, $21 # L0 hi add => carry | |
279 | addq $10, $20, $10 # U0 hi mul + carry | |
280 | ldq $4, 0($16) # L1 | |
281 | ||
282 | bis $31, $31, $31 # U1 mt | |
283 | addq $5, $11, $23 # L0 lo + acc | |
284 | addq $10, $21, $10 # L0 hi mul + carry | |
285 | ldq $5, 8($16) # L1 | |
286 | ||
287 | umulh $19, $3, $8 # U1 | |
288 | cmpult $23, $11, $20 # L0 lo add => carry | |
289 | addq $23, $10, $23 # U0 hi add => answer | |
290 | ldq $2, 16($17) # L1 | |
291 | ||
292 | mulq $19, $0, $9 # U1 | |
293 | cmpult $23, $10, $21 # L0 hi add => carry | |
294 | addq $12, $20, $12 # U0 hi mul + carry | |
295 | ldq $3, 24($17) # L1 | |
296 | ||
297 | umulh $19, $0, $10 # U1 | |
298 | addq $6, $13, $6 # L0 lo + acc | |
299 | stq $22, -32($16) # L0 | |
300 | stq $23, -24($16) # L1 | |
301 | ||
302 | bis $31, $31, $31 # L0 st slosh | |
303 | mulq $19, $1, $11 # U1 | |
304 | bis $31, $31, $31 # L1 st slosh | |
305 | addq $12, $21, $12 # U0 hi mul + carry | |
306 | ||
307 | cmpult $6, $13, $20 # L0 lo add => carry | |
308 | bis $31, $31, $31 # U1 mt | |
309 | lda $18, -1($18) # L1 bookkeeping | |
310 | addq $6, $12, $22 # U0 hi add => answer | |
311 | ||
312 | bis $31, $31, $31 # U1 mt | |
313 | cmpult $22, $12, $21 # L0 hi add => carry | |
314 | addq $14, $20, $14 # U0 hi mul + carry | |
315 | ldq $6, 16($16) # L1 | |
316 | ||
317 | bis $31, $31, $31 # U1 mt | |
318 | addq $7, $15, $23 # L0 lo + acc | |
319 | addq $14, $21, $14 # U0 hi mul + carry | |
320 | ldq $7, 24($16) # L1 | |
321 | ||
322 | umulh $19, $1, $12 # U1 | |
323 | cmpult $23, $15, $20 # L0 lo add => carry | |
324 | addq $23, $14, $23 # U0 hi add => answer | |
325 | ldq $0, 32($17) # L1 | |
326 | ||
327 | mulq $19, $2, $13 # U1 | |
328 | cmpult $23, $14, $21 # L0 hi add => carry | |
329 | addq $8, $20, $8 # U0 hi mul + carry | |
330 | ldq $1, 40($17) # L1 | |
331 | ||
332 | umulh $19, $2, $14 # U1 | |
333 | addq $4, $9, $4 # U0 lo + acc | |
334 | stq $22, -16($16) # L0 | |
335 | stq $23, -8($16) # L1 | |
336 | ||
337 | bis $31, $31, $31 # L0 st slosh | |
338 | mulq $19, $3, $15 # U1 | |
339 | bis $31, $31, $31 # L1 st slosh | |
340 | addq $8, $21, $8 # L0 hi mul + carry | |
341 | ||
342 | cmpult $4, $9, $20 # L0 lo add => carry | |
343 | bis $31, $31, $31 # U1 mt | |
344 | lda $17, 64($17) # L1 bookkeeping | |
345 | addq $4, $8, $22 # U0 hi add => answer | |
346 | ||
347 | bis $31, $31, $31 # U1 mt | |
348 | cmpult $22, $8, $21 # L0 hi add => carry | |
349 | addq $10, $20, $10 # U0 hi mul + carry | |
350 | ldq $4, 32($16) # L1 | |
351 | ||
352 | bis $31, $31, $31 # U1 mt | |
353 | addq $5, $11, $23 # L0 lo + acc | |
354 | addq $10, $21, $10 # L0 hi mul + carry | |
355 | ldq $5, 40($16) # L1 | |
356 | ||
357 | umulh $19, $3, $8 # U1 | |
358 | cmpult $23, $11, $20 # L0 lo add => carry | |
359 | addq $23, $10, $23 # U0 hi add => answer | |
360 | ldq $2, -16($17) # L1 | |
361 | ||
362 | mulq $19, $0, $9 # U1 | |
363 | cmpult $23, $10, $21 # L0 hi add => carry | |
364 | addq $12, $20, $12 # U0 hi mul + carry | |
365 | ldq $3, -8($17) # L1 | |
366 | ||
367 | umulh $19, $0, $10 # U1 | |
368 | addq $6, $13, $6 # L0 lo + acc | |
369 | stq $22, 0($16) # L0 | |
370 | stq $23, 8($16) # L1 | |
371 | ||
372 | bis $31, $31, $31 # L0 st slosh | |
373 | mulq $19, $1, $11 # U1 | |
374 | bis $31, $31, $31 # L1 st slosh | |
375 | addq $12, $21, $12 # U0 hi mul + carry | |
376 | ||
377 | cmpult $6, $13, $20 # L0 lo add => carry | |
378 | bis $31, $31, $31 # U1 mt | |
379 | lda $16, 64($16) # L1 bookkeeping | |
380 | addq $6, $12, $22 # U0 hi add => answer | |
381 | ||
382 | bis $31, $31, $31 # U1 mt | |
383 | cmpult $22, $12, $21 # L0 hi add => carry | |
384 | addq $14, $20, $14 # U0 hi mul + carry | |
385 | ldq $6, -16($16) # L1 | |
386 | ||
387 | bis $31, $31, $31 # U1 mt | |
388 | addq $7, $15, $23 # L0 lo + acc | |
389 | addq $14, $21, $14 # U0 hi mul + carry | |
390 | ldq $7, -8($16) # L1 | |
391 | ||
392 | umulh $19, $1, $12 # U1 | |
393 | cmpult $23, $15, $20 # L0 lo add => carry | |
394 | addq $23, $14, $23 # U0 hi add => answer | |
395 | ldq $0, 0($17) # L1 | |
396 | ||
397 | mulq $19, $2, $13 # U1 | |
398 | cmpult $23, $14, $21 # L0 hi add => carry | |
399 | addq $8, $20, $8 # U0 hi mul + carry | |
400 | ldq $1, 8($17) # L1 | |
401 | ||
402 | umulh $19, $2, $14 # U1 | |
403 | addq $4, $9, $4 # L0 lo + acc | |
404 | stq $22, -48($16) # L0 | |
405 | stq $23, -40($16) # L1 | |
406 | ||
407 | bis $31, $31, $31 # L0 st slosh | |
408 | mulq $19, $3, $15 # U1 | |
409 | bis $31, $31, $31 # L1 st slosh | |
410 | addq $8, $21, $8 # U0 hi mul + carry | |
411 | ||
412 | cmpult $4, $9, $20 # L0 lo add => carry | |
413 | addq $4, $8, $22 # U0 hi add => answer | |
414 | bis $31, $31, $31 # L1 mt | |
415 | bgt $18, $Loop # U1 bookkeeping | |
416 | ||
417 | # ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ | |
# Pipeline drain for the last group: nothing more is loaded from s1.  It
# loads the final two res limbs, completes the multiplies still outstanding
# for the s1 limbs already held in $3, $0 and $1, and stores the last six
# result limbs.  The final carry limb is formed in $0.
418 | $Lend: | |
419 | cmpult $22, $8, $21 # L0 hi add => carry | |
420 | addq $10, $20, $10 # U0 hi mul + carry | |
421 | ldq $4, 0($16) # L1 | |
422 | addq $5, $11, $23 # L0 lo + acc | |
423 | addq $10, $21, $10 # L0 hi mul + carry | |
424 | ldq $5, 8($16) # L1 | |
425 | umulh $19, $3, $8 # U1 | |
426 | cmpult $23, $11, $20 # L0 lo add => carry | |
427 | addq $23, $10, $23 # U0 hi add => answer | |
428 | mulq $19, $0, $9 # U1 | |
429 | cmpult $23, $10, $21 # L0 hi add => carry | |
430 | addq $12, $20, $12 # U0 hi mul + carry | |
431 | umulh $19, $0, $10 # U1 | |
432 | addq $6, $13, $6 # L0 lo + acc | |
433 | stq $22, -32($16) # L0 | |
434 | stq $23, -24($16) # L1 | |
435 | mulq $19, $1, $11 # U1 | |
436 | addq $12, $21, $12 # U0 hi mul + carry | |
437 | cmpult $6, $13, $20 # L0 lo add => carry | |
438 | addq $6, $12, $22 # U0 hi add => answer | |
439 | cmpult $22, $12, $21 # L0 hi add => carry | |
440 | addq $14, $20, $14 # U0 hi mul + carry | |
441 | addq $7, $15, $23 # L0 lo + acc | |
442 | addq $14, $21, $14 # U0 hi mul + carry | |
443 | umulh $19, $1, $12 # U1 | |
444 | cmpult $23, $15, $20 # L0 lo add => carry | |
445 | addq $23, $14, $23 # U0 hi add => answer | |
446 | cmpult $23, $14, $21 # L0 hi add => carry | |
447 | addq $8, $20, $8 # U0 hi mul + carry | |
448 | addq $4, $9, $4 # U0 lo + acc | |
449 | stq $22, -16($16) # L0 | |
450 | stq $23, -8($16) # L1 | |
451 | bis $31, $31, $31 # L0 st slosh | |
452 | addq $8, $21, $8 # L0 hi mul + carry | |
453 | cmpult $4, $9, $20 # L0 lo add => carry | |
454 | addq $4, $8, $22 # U0 hi add => answer | |
455 | cmpult $22, $8, $21 # L0 hi add => carry | |
456 | addq $10, $20, $10 # U0 hi mul + carry | |
457 | addq $5, $11, $23 # L0 lo + acc | |
458 | addq $10, $21, $10 # L0 hi mul + carry | |
459 | cmpult $23, $11, $20 # L0 lo add => carry | |
460 | addq $23, $10, $23 # U0 hi add => answer | |
461 | cmpult $23, $10, $21 # L0 hi add => carry | |
462 | addq $12, $20, $12 # U0 hi mul + carry | |
463 | stq $22, 0($16) # L0 | |
464 | stq $23, 8($16) # L1 | |
465 | addq $12, $21, $0 # U0 hi mul + carry | |
466 | ||
# Epilogue of the large case: reload $9-$15 from the slots written at $Large,
# release the 240-byte frame, and return with the carry limb in $0.
467 | ldq $9, 8($30) | |
468 | ldq $10, 16($30) | |
469 | ldq $11, 24($30) | |
470 | ldq $12, 32($30) | |
471 | ldq $13, 40($30) | |
472 | ldq $14, 48($30) | |
473 | ldq $15, 56($30) | |
474 | lda $30, 240($30) | |
475 | ret $31, ($26), 1 | |
476 | ||
477 | .end __mpn_addmul_1 |