#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
# Output assembly file is the last command-line argument (perlasm
# convention).  The original used an unchecked 2-arg open; a failure
# would silently leave STDOUT pointing at the old destination, so use
# 3-arg open and die on error.
$output = pop;
open STDOUT, ">", $output or die "can't open $output: $!";

# Stack-layout macro names, resolved at assembly time by the
# sparc_arch.h header included in the generated code below.
$frame="STACK_FRAME";
$bias="STACK_BIAS";
# Register allocation for the table-driven (non-VIS3) code paths.

# 64-bit working values live in the %o registers:
#   $Zhi=%o0 $Zlo=%o1 $Thi=%o2 $Tlo=%o3 $rem=%o4 $tmp=%o5
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp) = map("%o$_", (0..5));

# Small values and pointers live in the %l registers:
#   $nhi=%l0 $nlo=%l1 $xi0=%l2 $xi1=%l3
#   $rem_4bit=%l4 $remi=%l5 $Htblo=%l6 $cnt=%l7
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt) = map("%l$_", (0..7));

# Input argument block arrives in the %i registers:
#   $Xi=%i0 $Htbl=%i1 $inp=%i2 $len=%i3
($Xi,$Htbl,$inp,$len) = map("%i$_", (0..3));
# gcm_ghash_4bit: streamed 4-bit table-driven GHASH.  Arguments arrive
# in %i0..%i3 as ($Xi,$Htbl,$inp,$len) per the register map above; the
# 128-bit accumulator is read from and stored back to [$Xi].  rem_4bit
# is the 128-byte shared reduction table referenced PC-relatively via
# the call .+8 trick.  Back-quoted expressions in the .long lines are
# evaluated by the emit loop at the bottom of the file.
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
# gcm_gmult_4bit does not take inp/len arguments; scrub the variables
# so any accidental interpolation below surfaces as an empty string.
undef $inp;
undef $len;

# gcm_gmult_4bit: single multiplication in GF(2^128); reads the
# accumulator from [$Xi] and stores the product back to [$Xi], using
# the same per-key table ($Htbl) and rem_4bit reduction table as
# gcm_ghash_4bit above.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

# Fresh register allocation for the VIS3 code paths (this brace block
# scopes the new my-less package variables away from the table code).
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
# gcm_init_vis3 stores the twisted H at Htable[0..15] and the packed
# 0xE0-multiple table at Htable+16, which the gmult/ghash routines
# below reload.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
# Identification string embedded in the generated object.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.

# unvis3($mnemonic, $rs1, $rs2, $rd)
# Returns a .word directive carrying the hand-encoded VIS3 instruction
# (with the original text appended as an assembler comment), or the
# instruction unchanged when the mnemonic or operands are unrecognized.
sub unvis3 {
my ($mnemonic, $rs1, $rs2, $rd) = @_;
# Offset of each register window ('g'/'o'/'l'/'i') in the flat 0..31
# register numbering used by the instruction encoding.
my %regbase = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# Sub-opcode (opf field) for each VIS3 mnemonic we encode.
my %opcode = ( "addxc"   => 0x011,
	       "addxccc" => 0x013,
	       "xmulx"   => 0x115,
	       "xmulxhi" => 0x116 );

    my $asis = "$mnemonic\t$rs1,$rs2,$rd";

    # Unknown mnemonic: leave the instruction as written.
    my $opf = $opcode{$mnemonic} or return $asis;

    # Translate each operand to its flat register number; bail out
    # verbatim on anything that does not look like an integer register.
    my @num;
    for my $operand ($rs1, $rs2, $rd) {
	return $asis if $operand !~ /%([goli])([0-9])/;
	push @num, $regbase{$1} + $2;
    }
    my ($r1, $r2, $rdn) = @num;

    return sprintf ".word\t0x%08x !%s",
		   0x81b00000 | $rdn<<25 | $r1<<14 | $opf<<5 | $r2,
		   $asis;
}
# Post-process the accumulated assembly: evaluate back-quoted Perl
# expressions (e.g. the rem_4bit table constants) and replace VIS3
# mnemonics with hand-encoded .word directives, then emit each line.
for my $line (split("\n", $code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
	       unvis3($1,$2,$3,$4)
	      /ge;

    print $line, "\n";
}

close STDOUT;