#!/usr/bin/env perl

# Specific modes implementations for SPARC Architecture 2011. There
# is T4 dependency though, an ASI value that is not specified in the
# Architecture Manual. But as SPARC universe is rather monocultural,
# we imply that processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep eyes open when new processors emerge...
#
# As for above mentioned ASI. It's so called "block initializing
# store" which cancels "read" in "read-update-write" on cache lines.
# This is "cooperative" optimization, as it reduces overall pressure
# on memory interface. Benefits can't be observed/quantified with
# usual benchmarks, on the contrary you can notice that single-thread
# performance for parallelizable modes is ~1.5% worse for largest
# block sizes [though few percent better for not so long ones]. All
# this based on suggestions from David Miller.
# Configure ABI-dependent assembler parameters.  Scans the supplied
# compiler/driver flags (normally @ARGV) for a 64-bit request, either
# -m64 (gcc) or -xarch=v9 (Solaris cc), and sets the package globals
# used throughout the assembly templates:
#   $::abibits   - 64 when a 64-bit flag was seen
#   $::bias      - stack bias (2047 for the 64-bit V9 ABI, 0 otherwise)
#   $::frame     - minimal register-window stack frame size
#   $::size_t_cc - condition-code register matching sizeof(size_t)
sub asm_init {	# to be called with @ARGV as argument
    my @flags = @_;

    foreach my $flag (@flags) {
	$::abibits = 64 if ($flag =~ /-m64/ || $flag =~ /-xarch=v9/);
    }

    if ($::abibits == 64) {
	$::bias      = 2047;
	$::frame     = 192;
	$::size_t_cc = "%xcc";
    } else {
	$::bias      = 0;
	$::frame     = 112;
	$::size_t_cc = "%icc";
    }
}

# Unified interface: the five caller arguments live in the SPARC input
# registers %i0..%i4 (a sixth value from the 0..5 range is discarded).
my ($inp, $out, $len, $key, $ivec) = map { "%i$_" } (0 .. 5);
# Scratch values kept in local registers %l0..%l5 (extra range values
# beyond the six names are discarded).
my ($ileft, $iright, $ooff, $omask, $ivoff, $blk_init) = map { "%l$_" } (0 .. 7);
# Emit ${alg}${bits}_t4_cbc_encrypt into $::code: CBC-mode encryption
# using SPARC T4 crypto instructions.  Arguments arrive in %i0..%i4
# ($inp, $out, $len in bytes, $key, $ivec).  Two code paths are
# generated: a generic 16-bytes-per-iteration loop coping with
# arbitrary alignment, and a bulk path (.L*cbc_enc_blk) using
# block-initializing stores (ASI 0xe2) taken when the output is
# 8-byte aligned, $len >= 128 and $inp != $out.  Relies on the
# _${alg}${bits}_load_enckey / _encrypt_1x helpers defined elsewhere
# in this file (not visible in this chunk).
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
# Non-EVP entry: ivec pointer may be unaligned, load via alignaddr +
# faligndata; the "bz" skips the realignment when it is 8-byte aligned.
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
# EVP entry: ivec is at least 4-byte aligned, load it as four words.
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
# Common prologue: compute input shift amounts for unaligned input,
# then pick the generic loop or the block-store path.
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
# Epilogue of the generic loop: write the final ivec back.
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
# Epilogue of the unaligned-output path.
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
# Epilogue of the block-store path ("3b" branches back to the
# unaligned-ivec handler emitted above).
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
243 | ||
# Emit ${alg}${bits}_t4_cbc_decrypt into $::code: CBC-mode decryption
# using SPARC T4 crypto instructions.  Arguments arrive in %i0..%i4
# ($inp, $out, $len in bytes, $key, $ivec).  Three code paths are
# generated: a 1x loop (odd block count), a 2x loop, and a bulk path
# (.L*cbc_dec_blk) using block-initializing stores (ASI 0xe2) taken
# when the output is 8-byte aligned, $len >= 256 and $inp != $out.
# Relies on _${alg}${bits}_load_deckey / _decrypt_1x / _decrypt_2x
# helpers defined elsewhere in this file (not visible in this chunk).
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
# Non-EVP entry: ivec pointer may be unaligned; realign via faligndata.
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
# EVP entry: ivec is at least 4-byte aligned, load as four words.
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
# Common prologue: alignment shifts, then choose 1x/2x loops or the
# block-store path.
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
# Epilogue of the 1x loop: write the final ivec back.
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
# Epilogue of the 2x loop.
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
# FIX: the original branched to "3b" here, but this function defines no
# local "3:" label -- the reference would resolve to the "3:" emitted by
# the preceding *encrypt* routine, which writes %f0/%f2 instead of this
# function's ivec registers %f12/%f14.  Branch to this function's own
# unaligned-ivec handler, consistent with the other three epilogues.
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}
601 | ||
# Emit ${alg}${bits}_t4_ctr32_encrypt into $::code: CTR mode with a
# 32-bit counter kept in %l7 (least significant word of the ivec,
# wrapped via "srl %l7, 0, %l7" / clruw).  Arguments arrive in
# %i0..%i4 ($inp, $out, $len in 16-byte blocks -- note the prologue's
# "sllx $len, 4" converts to bytes, $key, $ivec).  The first cipher
# round of each block is issued inline (aes_eround01/23 or camellia_f)
# and the helper is entered at _encrypt_1x+8 / _encrypt_2x+16 --
# presumably byte offsets that skip the instructions already issued;
# confirm against the helper definitions (not visible in this chunk).
# A bulk path (.L*_ctr32_blk) with block-initializing stores (ASI
# 0xe2) is taken when output is 8-byte aligned, $len >= 256 bytes and
# $inp != $out.
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
# First round issued inline; instruction selection is per-algorithm.
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}
902 | ||
cd686946 AP |
903 | sub alg_xts_implement { |
904 | my ($alg,$bits,$dir) = @_; | |
905 | my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5)); | |
906 | my $rem=$ivec; | |
907 | ||
908 | $::code.=<<___; | |
909 | .globl ${alg}${bits}_t4_xts_${dir}crypt | |
910 | .align 32 | |
911 | ${alg}${bits}_t4_xts_${dir}crypt: | |
912 | save %sp, -$::frame-16, %sp | |
913 | ||
914 | mov $ivec, %o0 | |
915 | add %fp, $::bias-16, %o1 | |
916 | call ${alg}_t4_encrypt | |
917 | mov $key2, %o2 | |
918 | ||
919 | add %fp, $::bias-16, %l7 | |
920 | ldxa [%l7]0x88, %g2 | |
921 | add %fp, $::bias-8, %l7 | |
922 | ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak | |
923 | ||
924 | sethi %hi(0x76543210), %l7 | |
925 | or %l7, %lo(0x76543210), %l7 | |
926 | bmask %l7, %g0, %g0 ! byte swap mask | |
927 | ||
928 | prefetch [$inp], 20 | |
929 | prefetch [$inp + 63], 20 | |
930 | call _${alg}${bits}_load_${dir}ckey | |
931 | and $len, 15, $rem | |
932 | and $len, -16, $len | |
933 | ___ | |
934 | $code.=<<___ if ($dir eq "de"); | |
935 | mov 0, %l7 | |
936 | movrnz $rem, 16, %l7 | |
937 | sub $len, %l7, $len | |
938 | ___ | |
939 | $code.=<<___; | |
940 | ||
941 | sub $inp, $out, $blk_init ! $inp!=$out | |
942 | and $inp, 7, $ileft | |
943 | andn $inp, 7, $inp | |
944 | sll $ileft, 3, $ileft | |
945 | mov 64, $iright | |
946 | mov 0xff, $omask | |
947 | sub $iright, $ileft, $iright | |
948 | and $out, 7, $ooff | |
949 | cmp $len, 255 | |
950 | movrnz $ooff, 0, $blk_init ! if ( $out&7 || | |
951 | movleu $::size_t_cc, 0, $blk_init ! $len<256 || | |
952 | brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out) | |
953 | srl $omask, $ooff, $omask | |
954 | ||
955 | andcc $len, 16, %g0 ! is number of blocks even? | |
956 | ___ | |
957 | $code.=<<___ if ($dir eq "de"); | |
958 | brz,pn $len, .L${bits}_xts_${dir}steal | |
959 | ___ | |
960 | $code.=<<___; | |
961 | alignaddrl $out, %g0, $out | |
962 | bz %icc, .L${bits}_xts_${dir}loop2x | |
963 | srlx $len, 4, $len | |
964 | .L${bits}_xts_${dir}loop: | |
965 | ldx [$inp + 0], %o0 | |
966 | brz,pt $ileft, 4f | |
967 | ldx [$inp + 8], %o1 | |
968 | ||
969 | ldx [$inp + 16], %o2 | |
970 | sllx %o0, $ileft, %o0 | |
971 | srlx %o1, $iright, %g1 | |
972 | sllx %o1, $ileft, %o1 | |
973 | or %g1, %o0, %o0 | |
974 | srlx %o2, $iright, %o2 | |
975 | or %o2, %o1, %o1 | |
976 | 4: | |
977 | movxtod %g2, %f12 | |
978 | movxtod %g3, %f14 | |
979 | bshuffle %f12, %f12, %f12 | |
980 | bshuffle %f14, %f14, %f14 | |
981 | ||
982 | xor %g4, %o0, %o0 ! ^= rk[0] | |
983 | xor %g5, %o1, %o1 | |
984 | movxtod %o0, %f0 | |
985 | movxtod %o1, %f2 | |
986 | ||
987 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
988 | fxor %f14, %f2, %f2 | |
989 | ||
990 | prefetch [$out + 63], 22 | |
991 | prefetch [$inp + 16+63], 20 | |
992 | call _${alg}${bits}_${dir}crypt_1x | |
993 | add $inp, 16, $inp | |
994 | ||
995 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
996 | fxor %f14, %f2, %f2 | |
997 | ||
998 | srax %g3, 63, %l7 ! next tweak value | |
999 | addcc %g2, %g2, %g2 | |
1000 | and %l7, 0x87, %l7 | |
1001 | addxc %g3, %g3, %g3 | |
1002 | xor %l7, %g2, %g2 | |
1003 | ||
1004 | brnz,pn $ooff, 2f | |
1005 | sub $len, 1, $len | |
1006 | ||
1007 | std %f0, [$out + 0] | |
1008 | std %f2, [$out + 8] | |
1009 | brnz,pt $len, .L${bits}_xts_${dir}loop2x | |
1010 | add $out, 16, $out | |
1011 | ||
1012 | brnz,pn $rem, .L${bits}_xts_${dir}steal | |
1013 | nop | |
1014 | ||
1015 | ret | |
1016 | restore | |
1017 | ||
1018 | .align 16 | |
1019 | 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard | |
1020 | ! and ~3x deterioration | |
1021 | ! in inp==out case | |
1022 | faligndata %f0, %f0, %f4 ! handle unaligned output | |
1023 | faligndata %f0, %f2, %f6 | |
1024 | faligndata %f2, %f2, %f8 | |
1025 | stda %f4, [$out + $omask]0xc0 ! partial store | |
1026 | std %f6, [$out + 8] | |
1027 | add $out, 16, $out | |
1028 | orn %g0, $omask, $omask | |
1029 | stda %f8, [$out + $omask]0xc0 ! partial store | |
1030 | ||
1031 | brnz,pt $len, .L${bits}_xts_${dir}loop2x+4 | |
1032 | orn %g0, $omask, $omask | |
1033 | ||
1034 | brnz,pn $rem, .L${bits}_xts_${dir}steal | |
1035 | nop | |
1036 | ||
1037 | ret | |
1038 | restore | |
1039 | ||
1040 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
1041 | .align 32 | |
1042 | .L${bits}_xts_${dir}loop2x: | |
1043 | ldx [$inp + 0], %o0 | |
1044 | ldx [$inp + 8], %o1 | |
1045 | ldx [$inp + 16], %o2 | |
1046 | brz,pt $ileft, 4f | |
1047 | ldx [$inp + 24], %o3 | |
1048 | ||
1049 | ldx [$inp + 32], %o4 | |
1050 | sllx %o0, $ileft, %o0 | |
1051 | srlx %o1, $iright, %g1 | |
1052 | or %g1, %o0, %o0 | |
1053 | sllx %o1, $ileft, %o1 | |
1054 | srlx %o2, $iright, %g1 | |
1055 | or %g1, %o1, %o1 | |
1056 | sllx %o2, $ileft, %o2 | |
1057 | srlx %o3, $iright, %g1 | |
1058 | or %g1, %o2, %o2 | |
1059 | sllx %o3, $ileft, %o3 | |
1060 | srlx %o4, $iright, %o4 | |
1061 | or %o4, %o3, %o3 | |
1062 | 4: | |
1063 | movxtod %g2, %f12 | |
1064 | movxtod %g3, %f14 | |
1065 | bshuffle %f12, %f12, %f12 | |
1066 | bshuffle %f14, %f14, %f14 | |
1067 | ||
1068 | srax %g3, 63, %l7 ! next tweak value | |
1069 | addcc %g2, %g2, %g2 | |
1070 | and %l7, 0x87, %l7 | |
1071 | addxc %g3, %g3, %g3 | |
1072 | xor %l7, %g2, %g2 | |
1073 | ||
1074 | movxtod %g2, %f8 | |
1075 | movxtod %g3, %f10 | |
1076 | bshuffle %f8, %f8, %f8 | |
1077 | bshuffle %f10, %f10, %f10 | |
1078 | ||
1079 | xor %g4, %o0, %o0 ! ^= rk[0] | |
1080 | xor %g5, %o1, %o1 | |
1081 | xor %g4, %o2, %o2 ! ^= rk[0] | |
1082 | xor %g5, %o3, %o3 | |
1083 | movxtod %o0, %f0 | |
1084 | movxtod %o1, %f2 | |
1085 | movxtod %o2, %f4 | |
1086 | movxtod %o3, %f6 | |
1087 | ||
1088 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
1089 | fxor %f14, %f2, %f2 | |
1090 | fxor %f8, %f4, %f4 ! ^= tweak[0] | |
1091 | fxor %f10, %f6, %f6 | |
1092 | ||
1093 | prefetch [$out + 63], 22 | |
1094 | prefetch [$inp + 32+63], 20 | |
1095 | call _${alg}${bits}_${dir}crypt_2x | |
1096 | add $inp, 32, $inp | |
1097 | ||
1098 | movxtod %g2, %f8 | |
1099 | movxtod %g3, %f10 | |
1100 | ||
1101 | srax %g3, 63, %l7 ! next tweak value | |
1102 | addcc %g2, %g2, %g2 | |
1103 | and %l7, 0x87, %l7 | |
1104 | addxc %g3, %g3, %g3 | |
1105 | xor %l7, %g2, %g2 | |
1106 | ||
1107 | bshuffle %f8, %f8, %f8 | |
1108 | bshuffle %f10, %f10, %f10 | |
1109 | ||
1110 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
1111 | fxor %f14, %f2, %f2 | |
1112 | fxor %f8, %f4, %f4 | |
1113 | fxor %f10, %f6, %f6 | |
1114 | ||
1115 | brnz,pn $ooff, 2f | |
1116 | sub $len, 2, $len | |
1117 | ||
1118 | std %f0, [$out + 0] | |
1119 | std %f2, [$out + 8] | |
1120 | std %f4, [$out + 16] | |
1121 | std %f6, [$out + 24] | |
1122 | brnz,pt $len, .L${bits}_xts_${dir}loop2x | |
1123 | add $out, 32, $out | |
1124 | ||
1125 | fsrc2 %f4, %f0 | |
1126 | fsrc2 %f6, %f2 | |
1127 | brnz,pn $rem, .L${bits}_xts_${dir}steal | |
1128 | nop | |
1129 | ||
1130 | ret | |
1131 | restore | |
1132 | ||
1133 | .align 16 | |
1134 | 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard | |
1135 | ! and ~3x deterioration | |
1136 | ! in inp==out case | |
1137 | faligndata %f0, %f0, %f8 ! handle unaligned output | |
1138 | faligndata %f0, %f2, %f10 | |
1139 | faligndata %f2, %f4, %f12 | |
1140 | faligndata %f4, %f6, %f14 | |
1141 | faligndata %f6, %f6, %f0 | |
1142 | ||
1143 | stda %f8, [$out + $omask]0xc0 ! partial store | |
1144 | std %f10, [$out + 8] | |
1145 | std %f12, [$out + 16] | |
1146 | std %f14, [$out + 24] | |
1147 | add $out, 32, $out | |
1148 | orn %g0, $omask, $omask | |
1149 | stda %f0, [$out + $omask]0xc0 ! partial store | |
1150 | ||
1151 | brnz,pt $len, .L${bits}_xts_${dir}loop2x+4 | |
1152 | orn %g0, $omask, $omask | |
1153 | ||
1154 | fsrc2 %f4, %f0 | |
1155 | fsrc2 %f6, %f2 | |
1156 | brnz,pn $rem, .L${bits}_xts_${dir}steal | |
1157 | nop | |
1158 | ||
1159 | ret | |
1160 | restore | |
1161 | ||
1162 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
1163 | .align 32 | |
1164 | .L${bits}_xts_${dir}blk: | |
1165 | add $out, $len, $blk_init | |
1166 | and $blk_init, 63, $blk_init ! tail | |
1167 | sub $len, $blk_init, $len | |
1168 | add $blk_init, 15, $blk_init ! round up to 16n | |
1169 | srlx $len, 4, $len | |
1170 | srl $blk_init, 4, $blk_init | |
1171 | sub $len, 1, $len | |
1172 | add $blk_init, 1, $blk_init | |
1173 | ||
1174 | .L${bits}_xts_${dir}blk2x: | |
1175 | ldx [$inp + 0], %o0 | |
1176 | ldx [$inp + 8], %o1 | |
1177 | ldx [$inp + 16], %o2 | |
1178 | brz,pt $ileft, 5f | |
1179 | ldx [$inp + 24], %o3 | |
1180 | ||
1181 | ldx [$inp + 32], %o4 | |
1182 | sllx %o0, $ileft, %o0 | |
1183 | srlx %o1, $iright, %g1 | |
1184 | or %g1, %o0, %o0 | |
1185 | sllx %o1, $ileft, %o1 | |
1186 | srlx %o2, $iright, %g1 | |
1187 | or %g1, %o1, %o1 | |
1188 | sllx %o2, $ileft, %o2 | |
1189 | srlx %o3, $iright, %g1 | |
1190 | or %g1, %o2, %o2 | |
1191 | sllx %o3, $ileft, %o3 | |
1192 | srlx %o4, $iright, %o4 | |
1193 | or %o4, %o3, %o3 | |
1194 | 5: | |
1195 | movxtod %g2, %f12 | |
1196 | movxtod %g3, %f14 | |
1197 | bshuffle %f12, %f12, %f12 | |
1198 | bshuffle %f14, %f14, %f14 | |
1199 | ||
1200 | srax %g3, 63, %l7 ! next tweak value | |
1201 | addcc %g2, %g2, %g2 | |
1202 | and %l7, 0x87, %l7 | |
1203 | addxc %g3, %g3, %g3 | |
1204 | xor %l7, %g2, %g2 | |
1205 | ||
1206 | movxtod %g2, %f8 | |
1207 | movxtod %g3, %f10 | |
1208 | bshuffle %f8, %f8, %f8 | |
1209 | bshuffle %f10, %f10, %f10 | |
1210 | ||
1211 | xor %g4, %o0, %o0 ! ^= rk[0] | |
1212 | xor %g5, %o1, %o1 | |
1213 | xor %g4, %o2, %o2 ! ^= rk[0] | |
1214 | xor %g5, %o3, %o3 | |
1215 | movxtod %o0, %f0 | |
1216 | movxtod %o1, %f2 | |
1217 | movxtod %o2, %f4 | |
1218 | movxtod %o3, %f6 | |
1219 | ||
1220 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
1221 | fxor %f14, %f2, %f2 | |
1222 | fxor %f8, %f4, %f4 ! ^= tweak[0] | |
1223 | fxor %f10, %f6, %f6 | |
1224 | ||
1225 | prefetch [$inp + 32+63], 20 | |
1226 | call _${alg}${bits}_${dir}crypt_2x | |
1227 | add $inp, 32, $inp | |
1228 | ||
1229 | movxtod %g2, %f8 | |
1230 | movxtod %g3, %f10 | |
1231 | ||
1232 | srax %g3, 63, %l7 ! next tweak value | |
1233 | addcc %g2, %g2, %g2 | |
1234 | and %l7, 0x87, %l7 | |
1235 | addxc %g3, %g3, %g3 | |
1236 | xor %l7, %g2, %g2 | |
1237 | ||
1238 | bshuffle %f8, %f8, %f8 | |
1239 | bshuffle %f10, %f10, %f10 | |
1240 | ||
1241 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
1242 | fxor %f14, %f2, %f2 | |
1243 | fxor %f8, %f4, %f4 | |
1244 | fxor %f10, %f6, %f6 | |
1245 | ||
1246 | stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific | |
1247 | add $out, 8, $out | |
1248 | stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific | |
1249 | add $out, 8, $out | |
1250 | stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific | |
1251 | add $out, 8, $out | |
1252 | stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific | |
1253 | bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x | |
1254 | add $out, 8, $out | |
1255 | ||
1256 | add $blk_init, $len, $len | |
1257 | andcc $len, 1, %g0 ! is number of blocks even? | |
1258 | membar #StoreLoad|#StoreStore | |
1259 | bnz,pt %icc, .L${bits}_xts_${dir}loop | |
1260 | srl $len, 0, $len | |
1261 | brnz,pn $len, .L${bits}_xts_${dir}loop2x | |
1262 | nop | |
1263 | ||
1264 | fsrc2 %f4, %f0 | |
1265 | fsrc2 %f6, %f2 | |
1266 | brnz,pn $rem, .L${bits}_xts_${dir}steal | |
1267 | nop | |
1268 | ||
1269 | ret | |
1270 | restore | |
1271 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
1272 | ___ | |
1273 | $code.=<<___ if ($dir eq "en"); | |
1274 | .align 32 | |
1275 | .L${bits}_xts_${dir}steal: | |
1276 | std %f0, [%fp + $::bias-16] ! copy of output | |
1277 | std %f2, [%fp + $::bias-8] | |
1278 | ||
1279 | srl $ileft, 3, $ileft | |
1280 | add %fp, $::bias-16, %l7 | |
1281 | add $inp, $ileft, $inp ! original $inp+$len&-15 | |
1282 | add $out, $ooff, $out ! original $out+$len&-15 | |
1283 | mov 0, $ileft | |
1284 | nop ! align | |
1285 | ||
1286 | .L${bits}_xts_${dir}stealing: | |
1287 | ldub [$inp + $ileft], %o0 | |
1288 | ldub [%l7 + $ileft], %o1 | |
1289 | dec $rem | |
1290 | stb %o0, [%l7 + $ileft] | |
1291 | stb %o1, [$out + $ileft] | |
1292 | brnz $rem, .L${bits}_xts_${dir}stealing | |
1293 | inc $ileft | |
1294 | ||
1295 | mov %l7, $inp | |
1296 | sub $out, 16, $out | |
1297 | mov 0, $ileft | |
1298 | sub $out, $ooff, $out | |
1299 | ba .L${bits}_xts_${dir}loop ! one more time | |
1300 | mov 1, $len ! $rem is 0 | |
1301 | ___ | |
1302 | $code.=<<___ if ($dir eq "de"); | |
1303 | .align 32 | |
1304 | .L${bits}_xts_${dir}steal: | |
1305 | ldx [$inp + 0], %o0 | |
1306 | brz,pt $ileft, 8f | |
1307 | ldx [$inp + 8], %o1 | |
1308 | ||
1309 | ldx [$inp + 16], %o2 | |
1310 | sllx %o0, $ileft, %o0 | |
1311 | srlx %o1, $iright, %g1 | |
1312 | sllx %o1, $ileft, %o1 | |
1313 | or %g1, %o0, %o0 | |
1314 | srlx %o2, $iright, %o2 | |
1315 | or %o2, %o1, %o1 | |
1316 | 8: | |
1317 | srax %g3, 63, %l7 ! next tweak value | |
1318 | addcc %g2, %g2, %o2 | |
1319 | and %l7, 0x87, %l7 | |
1320 | addxc %g3, %g3, %o3 | |
1321 | xor %l7, %o2, %o2 | |
1322 | ||
1323 | movxtod %o2, %f12 | |
1324 | movxtod %o3, %f14 | |
1325 | bshuffle %f12, %f12, %f12 | |
1326 | bshuffle %f14, %f14, %f14 | |
1327 | ||
1328 | xor %g4, %o0, %o0 ! ^= rk[0] | |
1329 | xor %g5, %o1, %o1 | |
1330 | movxtod %o0, %f0 | |
1331 | movxtod %o1, %f2 | |
1332 | ||
1333 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
1334 | fxor %f14, %f2, %f2 | |
1335 | ||
1336 | call _${alg}${bits}_${dir}crypt_1x | |
1337 | add $inp, 16, $inp | |
1338 | ||
1339 | fxor %f12, %f0, %f0 ! ^= tweak[0] | |
1340 | fxor %f14, %f2, %f2 | |
1341 | ||
1342 | std %f0, [%fp + $::bias-16] | |
1343 | std %f2, [%fp + $::bias-8] | |
1344 | ||
1345 | srl $ileft, 3, $ileft | |
1346 | add %fp, $::bias-16, %l7 | |
1347 | add $inp, $ileft, $inp ! original $inp+$len&-15 | |
1348 | add $out, $ooff, $out ! original $out+$len&-15 | |
1349 | mov 0, $ileft | |
1350 | add $out, 16, $out | |
1351 | nop ! align | |
1352 | ||
1353 | .L${bits}_xts_${dir}stealing: | |
1354 | ldub [$inp + $ileft], %o0 | |
1355 | ldub [%l7 + $ileft], %o1 | |
1356 | dec $rem | |
1357 | stb %o0, [%l7 + $ileft] | |
1358 | stb %o1, [$out + $ileft] | |
1359 | brnz $rem, .L${bits}_xts_${dir}stealing | |
1360 | inc $ileft | |
1361 | ||
1362 | mov %l7, $inp | |
1363 | sub $out, 16, $out | |
1364 | mov 0, $ileft | |
1365 | sub $out, $ooff, $out | |
1366 | ba .L${bits}_xts_${dir}loop ! one more time | |
1367 | mov 1, $len ! $rem is 0 | |
1368 | ___ | |
1369 | $code.=<<___; | |
1370 | ret | |
1371 | restore | |
1372 | .type ${alg}${bits}_t4_xts_${dir}crypt,#function | |
1373 | .size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt | |
1374 | ___ | |
1375 | } | |
1376 | ||
54a1f448 AP |
1377 | # Purpose of these subroutines is to explicitly encode VIS instructions, |
1378 | # so that one can compile the module without having to specify VIS | |
1379 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. | |
1380 | # Idea is to reserve for option to produce "universal" binary and let | |
1381 | # programmer detect if current CPU is VIS capable at run-time. | |
# Encode a 3-operand VIS instruction as a raw .word so the module can be
# assembled without VIS support in the toolchain.  Returns the textual
# instruction unchanged when the mnemonic or operands are not encodable.
sub unvis {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # VIS opf field values (Oracle SPARC Architecture 2011)
    my %opcode = (
        faligndata => 0x048,
        bshuffle   => 0x04c,
        fnot2      => 0x066,
        fxor       => 0x06c,
        fsrc2      => 0x078,
    );

    my $asm = "$mnemonic\t$rs1,$rs2,$rd";

    # unknown mnemonic: pass the textual form through to the assembler
    my $opf = $opcode{$mnemonic}
        or return $asm;

    my @num;
    for my $reg ($rs1, $rs2, $rd) {
        return $asm unless $reg =~ /%f([0-9]{1,2})/;
        my $n = $1;
        if ($n >= 32) {
            return $asm if $n & 1;        # odd upper FP regs are not encodable
            $n = ($n | $n >> 5) & 31;     # upper double register addressing
        }
        push @num, $n;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000 | $num[2]<<25 | $num[0]<<14 | $opf<<5 | $num[1],
                   $asm;
}
cd686946 | 1411 | |
b3aee265 AP |
# Encode a 3-operand VIS3 instruction (integer registers) as a raw .word.
# Returns the textual instruction unchanged for unknown mnemonics or
# operands that do not look like %g/%o/%l/%i registers.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # register file base offsets for each window-register group
    my %regbase = (g => 0, o => 8, l => 16, i => 24);

    # VIS3 opf field values
    my %opcode = (
        addxc      => 0x011,
        addxccc    => 0x013,
        umulxhi    => 0x016,
        alignaddr  => 0x018,
        bmask      => 0x019,
        alignaddrl => 0x01a,
    );

    my $asm = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $opcode{$mnemonic}
        or return $asm;                   # not ours: leave as-is

    my @num;
    for my $reg ($rs1, $rs2, $rd) {
        return $asm unless $reg =~ /%([goli])([0-9])/;
        push @num, $regbase{$1} + $2;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000 | $num[2]<<25 | $num[0]<<14 | $opf<<5 | $num[1],
                   $asm;
}
54a1f448 AP |
1438 | |
# Encode a 4-operand AES instruction as a raw .word.  defined() is needed
# on the opcode lookup because aes_eround01 maps to 0.  Returns the textual
# form unchanged when the mnemonic or registers cannot be encoded.
sub unaes_round {            # 4-argument instructions
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;

    my %opcode = (
        aes_eround01    => 0,
        aes_eround23    => 1,
        aes_dround01    => 2,
        aes_dround23    => 3,
        aes_eround01_l  => 4,
        aes_eround23_l  => 5,
        aes_dround01_l  => 6,
        aes_dround23_l  => 7,
        aes_kexpand1    => 8,
    );

    my $asm = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $opcode{$mnemonic};
    return $asm unless defined $opf;

    # rs3 may also be an immediate (aes_kexpand1); only rewrite even %f form
    if ($rs3 =~ /%f([0-6]*[02468])/) {
        $rs3 = ($1 | $1 >> 5) & 31;
    }

    my @num;
    for my $reg ($rs1, $rs2, $rd) {
        return $asm unless $reg =~ /%f([0-9]{1,2})/;
        my $n = $1;
        if ($n >= 32) {
            return $asm if $n & 1;        # odd upper FP regs are not encodable
            $n = ($n | $n >> 5) & 31;     # upper double register addressing
        }
        push @num, $n;
    }

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $num[2]<<25 | 0x19<<19 | $num[0]<<14 | $rs3<<9 | $opf<<5 | $num[1],
                   $asm;
}
1473 | ||
# Encode a 3-operand AES key-expansion instruction as a raw .word.
# Returns the textual form unchanged when it cannot be encoded.
sub unaes_kexpand {          # 3-argument instructions
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    my %opcode = (
        aes_kexpand0 => 0x130,
        aes_kexpand2 => 0x131,
    );

    my $asm = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $opcode{$mnemonic};
    return $asm unless defined $opf;

    my @num;
    for my $reg ($rs1, $rs2, $rd) {
        return $asm unless $reg =~ /%f([0-9]{1,2})/;
        my $n = $1;
        if ($n >= 32) {
            return $asm if $n & 1;        # odd upper FP regs are not encodable
            $n = ($n | $n >> 5) & 31;     # upper double register addressing
        }
        push @num, $n;
    }

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $num[2]<<25 | 0x36<<19 | $num[0]<<14 | $opf<<5 | $num[1],
                   $asm;
}
1500 | ||
# Encode the 4-operand camellia_f instruction as a raw .word.
# camellia_f is the only mnemonic routed here, so its opf value (0xc) is
# hard-coded.  The original body wrapped everything in a vestigial
# "if (1) { ... } else { return $ref; }" whose else-branch was unreachable
# and declared an unused $opf lexical; both are removed here — behavior
# is unchanged.  Returns the textual form when operands can't be encoded.
sub uncamellia_f {           # 4-argument instructions
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # rs3 may also be an immediate; only rewrite the even %f register form
    $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1 | $1 >> 5) & 31) : $rs3;
    foreach ($rs1, $rs2, $rd) {
        return $ref if (!/%f([0-9]{1,2})/);
        $_ = $1;
        if ($1 >= 32) {
            return $ref if ($1 & 1);      # odd upper FP regs are not encodable
            # re-encode for upper double register addressing
            $_ = ($1 | $1 >> 5) & 31;
        }
    }

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $rd<<25 | 0x19<<19 | $rs1<<14 | $rs3<<9 | 0xc<<5 | $rs2,
                   $ref;
}
1526 | ||
# Encode a 3-operand Camellia instruction (camellia_fl / camellia_fli)
# as a raw .word.  Returns the textual form unchanged otherwise.
sub uncamellia3 {            # 3-argument instructions
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    my %opcode = (
        camellia_fl  => 0x13c,
        camellia_fli => 0x13d,
    );

    my $asm = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $opcode{$mnemonic};
    return $asm unless defined $opf;

    my @num;
    for my $reg ($rs1, $rs2, $rd) {
        return $asm unless $reg =~ /%f([0-9]{1,2})/;
        my $n = $1;
        if ($n >= 32) {
            return $asm if $n & 1;        # odd upper FP regs are not encodable
            $n = ($n | $n >> 5) & 31;     # upper double register addressing
        }
        push @num, $n;
    }

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $num[2]<<25 | 0x36<<19 | $num[0]<<14 | $opf<<5 | $num[1],
                   $asm;
}
1553 | ||
# Encode a 2-operand mov-between-register-files instruction as a raw
# .word.  Integer registers are biased by window group; FP registers use
# bias 0.  Returns the textual form unchanged when it cannot be encoded.
sub unmovxtox {              # 2-argument instructions
    my ($mnemonic, $rs, $rd) = @_;

    my %regbase = (g => 0, o => 8, l => 16, i => 24, f => 0);

    my %opcode = (
        movdtox  => 0x110,
        movstouw => 0x111,
        movstosw => 0x113,
        movxtod  => 0x118,
        movwtos  => 0x119,
    );

    my $asm = "$mnemonic\t$rs,$rd";

    my $opf = $opcode{$mnemonic};
    return $asm unless defined $opf;

    my @num;
    for my $reg ($rs, $rd) {
        return $asm unless $reg =~ /%([fgoli])([0-9]{1,2})/;
        my $n = $regbase{$1} + $2;
        if ($2 >= 32) {                   # only FP regs can reach 32
            return $asm if $2 & 1;        # odd upper FP regs are not encodable
            $n = ($2 | $2 >> 5) & 31;     # upper double register addressing
        }
        push @num, $n;
    }

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $num[1]<<25 | 0x36<<19 | $opf<<5 | $num[0],
                   $asm;
}
1584 | ||
# Post-process the accumulated $::code template and print the final
# assembly to stdout: expand backtick-escaped Perl expressions, then
# replace every T4/VIS mnemonic with an explicit .word encoding via the
# un* helpers above, so the module assembles on toolchains without
# VIS/crypto instruction support.
sub emit_assembler {
    foreach (split("\n",$::code)) {

	# interpolate `...` expressions embedded in the code template
	s/\`([^\`]*)\`/eval $1/ge;

	# canonicalize two-operand FP moves (e.g. "fsrc2 %fN,%fM") into the
	# three-operand form the unvis() encoder expects
	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	# a line carries at most one encodable mnemonic, hence the "or"
	# chain: the first substitution that fires short-circuits the rest.
	# NOTE(review): ordering matters — more specific patterns (aes_*,
	# camellia_*, mov*) must precede the generic [fb]* VIS catch-all.
	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo		or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo		or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo		or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo		or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo		or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo		or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo		or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}
1619 | ||
1620 | 1; |