]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
81cae8ce | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
c3473126 AP |
9 | |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # March 2010 | |
18 | # | |
19 | # The module implements "4-bit" GCM GHASH function and underlying | |
20 | # single multiplication operation in GF(2^128). "4-bit" means that it | |
21 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | |
22 | # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU | |
23 | # and are expressed in cycles per processed byte, less is better: | |
24 | # | |
25 | # gcc 3.3.x cc 5.2 this assembler | |
26 | # | |
d52d5ad1 AP |
27 | # 32-bit build 81.4 43.3 12.6 (+546%/+244%) |
28 | # 64-bit build 20.2 21.2 12.6 (+60%/+68%) | |
c3473126 | 29 | # |
b2875087 AP |
30 | # Here is data collected on UltraSPARC T1 system running Linux: |
31 | # | |
32 | # gcc 4.4.1 this assembler | |
33 | # | |
34 | # 32-bit build 566 50 (+1000%) | |
35 | # 64-bit build 56 50 (+12%) | |
36 | # | |
c3473126 AP |
37 | # I don't quite understand why difference between 32-bit and 64-bit |
38 | # compiler-generated code is so big. Compilers *were* instructed to | |
39 | # generate code for UltraSPARC and should have used 64-bit registers | |
40 | # for Z vector (see C code) even in 32-bit build... Oh well, it only | |
41 | # means more impressive improvement coefficients for this assembler | |
42 | # module;-) Loops are aggressively modulo-scheduled with respect to | |
43 | # references to input data and Z.hi updates to achieve 12 cycles | |
44 | # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 | |
b2875087 | 45 | # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. |
23328d4b AP |
46 | # |
47 | # October 2012 | |
48 | # | |
49 | # Add VIS3 lookup-table-free implementation using polynomial | |
50 | # multiplication xmulx[hi] and extended addition addxc[cc] | |
3766e7cc AP |
51 | # instructions. 4.52/7.63x improvement on T3/T4 or in absolute |
52 | # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark | |
53 | # saturates at ~15.5x single-process result on 8-core processor, | |
54 | # or ~20.5GBps per 2.85GHz socket. | |
c3473126 | 55 | |
# The last command-line argument names the file that receives the
# generated assembly.  When an argument is given, STDOUT is redirected to
# it with a three-argument open, so the name is never parsed for mode
# characters, and a failure aborts instead of silently leaving the
# output on the original STDOUT.  With no argument, STDOUT is left
# untouched, as before.
$output=pop;
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
58 | ||
eb77e888 AP |
# Symbolic frame size and stack bias; they are emitted verbatim into the
# generated code (presumably resolved by sparc_arch.h, which that code
# includes).
($frame,$bias)=("STACK_FRAME","STACK_BIAS");

# 64-bit values: %o0..%o5.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# Small values and pointers: %l0..%l7.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# Input argument block: %i0..%i3.
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));
c3473126 | 82 | |
eb77e888 AP |
# First code template: the shared rem_4bit table followed by
# gcm_ghash_4bit.  Register names ($Xi, $Htbl, $inp, $len, $Zhi, ...) are
# interpolated from the assignments made earlier in this script; the
# back-ticked expressions in the rem_4bit table are evaluated later by the
# output loop at the end of the file.
# gcm_ghash_4bit turns $len into an end pointer ($inp+$len) on entry and
# runs .Louter once per 16 input bytes until $inp reaches it, storing the
# updated Z to [$Xi] and [$Xi+8].  The "1: call .+8" pair computes the
# address of rem_4bit relative to the current position.
83 | $code.=<<___; |
84 | #include "sparc_arch.h" | |
85 | ||
86 | #ifdef __arch64__ | |
23328d4b AP |
87 | .register %g2,#scratch |
88 | .register %g3,#scratch | |
eb77e888 AP |
89 | #endif |
90 | ||
c3473126 AP |
91 | .section ".text",#alloc,#execinstr |
92 | ||
93 | .align 64 | |
94 | rem_4bit: | |
95 | .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | |
96 | .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | |
97 | .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | |
98 | .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | |
99 | .type rem_4bit,#object | |
100 | .size rem_4bit,(.-rem_4bit) | |
101 | ||
102 | .globl gcm_ghash_4bit | |
103 | .align 32 | |
104 | gcm_ghash_4bit: | |
105 | save %sp,-$frame,%sp | |
106 | ldub [$inp+15],$nlo | |
107 | ldub [$Xi+15],$xi0 | |
108 | ldub [$Xi+14],$xi1 | |
109 | add $len,$inp,$len | |
110 | add $Htbl,8,$Htblo | |
111 | ||
112 | 1: call .+8 | |
113 | add %o7,rem_4bit-1b,$rem_4bit | |
114 | ||
115 | .Louter: | |
116 | xor $xi0,$nlo,$nlo | |
117 | and $nlo,0xf0,$nhi | |
118 | and $nlo,0x0f,$nlo | |
119 | sll $nlo,4,$nlo | |
120 | ldx [$Htblo+$nlo],$Zlo | |
121 | ldx [$Htbl+$nlo],$Zhi | |
122 | ||
123 | ldub [$inp+14],$nlo | |
124 | ||
125 | ldx [$Htblo+$nhi],$Tlo | |
126 | and $Zlo,0xf,$remi | |
127 | ldx [$Htbl+$nhi],$Thi | |
128 | sll $remi,3,$remi | |
129 | ldx [$rem_4bit+$remi],$rem | |
130 | srlx $Zlo,4,$Zlo | |
131 | mov 13,$cnt | |
132 | sllx $Zhi,60,$tmp | |
133 | xor $Tlo,$Zlo,$Zlo | |
134 | srlx $Zhi,4,$Zhi | |
135 | xor $Zlo,$tmp,$Zlo | |
136 | ||
137 | xor $xi1,$nlo,$nlo | |
138 | and $Zlo,0xf,$remi | |
139 | and $nlo,0xf0,$nhi | |
140 | and $nlo,0x0f,$nlo | |
141 | ba .Lghash_inner | |
142 | sll $nlo,4,$nlo | |
143 | .align 32 | |
144 | .Lghash_inner: | |
145 | ldx [$Htblo+$nlo],$Tlo | |
146 | sll $remi,3,$remi | |
147 | xor $Thi,$Zhi,$Zhi | |
148 | ldx [$Htbl+$nlo],$Thi | |
149 | srlx $Zlo,4,$Zlo | |
150 | xor $rem,$Zhi,$Zhi | |
151 | ldx [$rem_4bit+$remi],$rem | |
152 | sllx $Zhi,60,$tmp | |
153 | xor $Tlo,$Zlo,$Zlo | |
154 | ldub [$inp+$cnt],$nlo | |
155 | srlx $Zhi,4,$Zhi | |
156 | xor $Zlo,$tmp,$Zlo | |
157 | ldub [$Xi+$cnt],$xi1 | |
158 | xor $Thi,$Zhi,$Zhi | |
159 | and $Zlo,0xf,$remi | |
160 | ||
161 | ldx [$Htblo+$nhi],$Tlo | |
162 | sll $remi,3,$remi | |
163 | xor $rem,$Zhi,$Zhi | |
164 | ldx [$Htbl+$nhi],$Thi | |
165 | srlx $Zlo,4,$Zlo | |
166 | ldx [$rem_4bit+$remi],$rem | |
167 | sllx $Zhi,60,$tmp | |
168 | xor $xi1,$nlo,$nlo | |
169 | srlx $Zhi,4,$Zhi | |
170 | and $nlo,0xf0,$nhi | |
171 | addcc $cnt,-1,$cnt | |
172 | xor $Zlo,$tmp,$Zlo | |
173 | and $nlo,0x0f,$nlo | |
174 | xor $Tlo,$Zlo,$Zlo | |
175 | sll $nlo,4,$nlo | |
176 | blu .Lghash_inner | |
177 | and $Zlo,0xf,$remi | |
178 | ||
179 | ldx [$Htblo+$nlo],$Tlo | |
180 | sll $remi,3,$remi | |
181 | xor $Thi,$Zhi,$Zhi | |
182 | ldx [$Htbl+$nlo],$Thi | |
183 | srlx $Zlo,4,$Zlo | |
184 | xor $rem,$Zhi,$Zhi | |
185 | ldx [$rem_4bit+$remi],$rem | |
186 | sllx $Zhi,60,$tmp | |
187 | xor $Tlo,$Zlo,$Zlo | |
188 | srlx $Zhi,4,$Zhi | |
189 | xor $Zlo,$tmp,$Zlo | |
190 | xor $Thi,$Zhi,$Zhi | |
191 | ||
192 | add $inp,16,$inp | |
193 | cmp $inp,$len | |
eb77e888 | 194 | be,pn SIZE_T_CC,.Ldone |
c3473126 AP |
195 | and $Zlo,0xf,$remi |
196 | ||
197 | ldx [$Htblo+$nhi],$Tlo | |
198 | sll $remi,3,$remi | |
199 | xor $rem,$Zhi,$Zhi | |
200 | ldx [$Htbl+$nhi],$Thi | |
201 | srlx $Zlo,4,$Zlo | |
202 | ldx [$rem_4bit+$remi],$rem | |
203 | sllx $Zhi,60,$tmp | |
204 | xor $Tlo,$Zlo,$Zlo | |
205 | ldub [$inp+15],$nlo | |
206 | srlx $Zhi,4,$Zhi | |
207 | xor $Zlo,$tmp,$Zlo | |
208 | xor $Thi,$Zhi,$Zhi | |
209 | stx $Zlo,[$Xi+8] | |
210 | xor $rem,$Zhi,$Zhi | |
211 | stx $Zhi,[$Xi] | |
212 | srl $Zlo,8,$xi1 | |
213 | and $Zlo,0xff,$xi0 | |
214 | ba .Louter | |
215 | and $xi1,0xff,$xi1 | |
216 | .align 32 | |
217 | .Ldone: | |
218 | ldx [$Htblo+$nhi],$Tlo | |
219 | sll $remi,3,$remi | |
220 | xor $rem,$Zhi,$Zhi | |
221 | ldx [$Htbl+$nhi],$Thi | |
222 | srlx $Zlo,4,$Zlo | |
223 | ldx [$rem_4bit+$remi],$rem | |
224 | sllx $Zhi,60,$tmp | |
225 | xor $Tlo,$Zlo,$Zlo | |
226 | srlx $Zhi,4,$Zhi | |
227 | xor $Zlo,$tmp,$Zlo | |
228 | xor $Thi,$Zhi,$Zhi | |
229 | stx $Zlo,[$Xi+8] | |
230 | xor $rem,$Zhi,$Zhi | |
231 | stx $Zhi,[$Xi] | |
232 | ||
233 | ret | |
234 | restore | |
235 | .type gcm_ghash_4bit,#function | |
236 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | |
237 | ___ |
238 | ||
c3473126 AP |
# The gcm_gmult_4bit template below reads only $Xi and $Htbl (plus the
# working registers), so $inp and $len are undefined before it is built.
239 | undef $inp; |
240 | undef $len; | |
241 | ||
# Second code template: gcm_gmult_4bit, a single multiplication of the
# 16 bytes at [$Xi] using the table at $Htbl; the result is stored back
# to [$Xi] and [$Xi+8].
242 | $code.=<<___; | |
243 | .globl gcm_gmult_4bit | |
244 | .align 32 | |
245 | gcm_gmult_4bit: | |
246 | save %sp,-$frame,%sp | |
247 | ldub [$Xi+15],$nlo | |
248 | add $Htbl,8,$Htblo | |
249 | ||
250 | 1: call .+8 | |
251 | add %o7,rem_4bit-1b,$rem_4bit | |
252 | ||
253 | and $nlo,0xf0,$nhi | |
254 | and $nlo,0x0f,$nlo | |
255 | sll $nlo,4,$nlo | |
256 | ldx [$Htblo+$nlo],$Zlo | |
257 | ldx [$Htbl+$nlo],$Zhi | |
258 | ||
259 | ldub [$Xi+14],$nlo | |
260 | ||
261 | ldx [$Htblo+$nhi],$Tlo | |
262 | and $Zlo,0xf,$remi | |
263 | ldx [$Htbl+$nhi],$Thi | |
264 | sll $remi,3,$remi | |
265 | ldx [$rem_4bit+$remi],$rem | |
266 | srlx $Zlo,4,$Zlo | |
267 | mov 13,$cnt | |
268 | sllx $Zhi,60,$tmp | |
269 | xor $Tlo,$Zlo,$Zlo | |
270 | srlx $Zhi,4,$Zhi | |
271 | xor $Zlo,$tmp,$Zlo | |
272 | ||
273 | and $Zlo,0xf,$remi | |
274 | and $nlo,0xf0,$nhi | |
275 | and $nlo,0x0f,$nlo | |
276 | ba .Lgmult_inner | |
277 | sll $nlo,4,$nlo | |
278 | .align 32 | |
279 | .Lgmult_inner: | |
280 | ldx [$Htblo+$nlo],$Tlo | |
281 | sll $remi,3,$remi | |
282 | xor $Thi,$Zhi,$Zhi | |
283 | ldx [$Htbl+$nlo],$Thi | |
284 | srlx $Zlo,4,$Zlo | |
285 | xor $rem,$Zhi,$Zhi | |
286 | ldx [$rem_4bit+$remi],$rem | |
287 | sllx $Zhi,60,$tmp | |
288 | xor $Tlo,$Zlo,$Zlo | |
289 | ldub [$Xi+$cnt],$nlo | |
290 | srlx $Zhi,4,$Zhi | |
291 | xor $Zlo,$tmp,$Zlo | |
292 | xor $Thi,$Zhi,$Zhi | |
293 | and $Zlo,0xf,$remi | |
294 | ||
295 | ldx [$Htblo+$nhi],$Tlo | |
296 | sll $remi,3,$remi | |
297 | xor $rem,$Zhi,$Zhi | |
298 | ldx [$Htbl+$nhi],$Thi | |
299 | srlx $Zlo,4,$Zlo | |
300 | ldx [$rem_4bit+$remi],$rem | |
301 | sllx $Zhi,60,$tmp | |
302 | srlx $Zhi,4,$Zhi | |
303 | and $nlo,0xf0,$nhi | |
304 | addcc $cnt,-1,$cnt | |
305 | xor $Zlo,$tmp,$Zlo | |
306 | and $nlo,0x0f,$nlo | |
307 | xor $Tlo,$Zlo,$Zlo | |
308 | sll $nlo,4,$nlo | |
309 | blu .Lgmult_inner | |
310 | and $Zlo,0xf,$remi | |
311 | ||
312 | ldx [$Htblo+$nlo],$Tlo | |
313 | sll $remi,3,$remi | |
314 | xor $Thi,$Zhi,$Zhi | |
315 | ldx [$Htbl+$nlo],$Thi | |
316 | srlx $Zlo,4,$Zlo | |
317 | xor $rem,$Zhi,$Zhi | |
318 | ldx [$rem_4bit+$remi],$rem | |
319 | sllx $Zhi,60,$tmp | |
320 | xor $Tlo,$Zlo,$Zlo | |
321 | srlx $Zhi,4,$Zhi | |
322 | xor $Zlo,$tmp,$Zlo | |
323 | xor $Thi,$Zhi,$Zhi | |
324 | and $Zlo,0xf,$remi | |
325 | ||
326 | ldx [$Htblo+$nhi],$Tlo | |
327 | sll $remi,3,$remi | |
328 | xor $rem,$Zhi,$Zhi | |
329 | ldx [$Htbl+$nhi],$Thi | |
330 | srlx $Zlo,4,$Zlo | |
331 | ldx [$rem_4bit+$remi],$rem | |
332 | sllx $Zhi,60,$tmp | |
333 | xor $Tlo,$Zlo,$Zlo | |
334 | srlx $Zhi,4,$Zhi | |
335 | xor $Zlo,$tmp,$Zlo | |
336 | xor $Thi,$Zhi,$Zhi | |
337 | stx $Zlo,[$Xi+8] | |
338 | xor $rem,$Zhi,$Zhi | |
339 | stx $Zhi,[$Xi] | |
340 | ||
341 | ret | |
342 | restore | |
343 | .type gcm_gmult_4bit,#function | |
344 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | |
23328d4b AP |
345 | ___ |
346 | \f | |
# VIS3 section: a bare-block scope holding the register map and the code
# template for gcm_init_vis3, gcm_gmult_vis3 and gcm_ghash_vis3.  The
# xmulx/xmulxhi/addxc mnemonics written here are rewritten into .word
# encodings by the output loop at the end of the file.
347 | {{{ | |
24798c5e AP |
348 | # Straightforward 128x128-bit multiplication using Karatsuba algorithm |
349 | # followed by pair of 64-bit reductions [with a shortcut in first one, | |
350 | # which allowed to break dependency between reductions and remove one | |
3766e7cc | 351 | # multiplication from critical path]. While it might be suboptimal |
24798c5e AP |
352 | # with regard to sheer number of multiplications, other methods [such |
353 | # as aggregate reduction] would require more 64-bit registers, which | |
354 | # we don't have in 32-bit application context. | |
23328d4b AP |
355 | |
# Arguments arrive in %i0..%i3.
356 | ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3)); | |
357 | ||
# Seven names take %o0-%o5 and %o7 (the list skips %o6); the remaining
# five take %g1-%g5.
3766e7cc | 358 | ($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)= |
24798c5e | 359 | (map("%o$_",(0..5,7)),map("%g$_",(1..5))); |
23328d4b | 360 | |
# Only the first two values of the list are consumed: $shl=%l0, $shr=%l1.
# The templates also use %l0 and %l7 directly as scratch registers.
3766e7cc | 361 | ($shl,$shr)=map("%l$_",(0..7)); |
24798c5e AP |
362 | |
363 | # For details regarding "twisted H" see ghash-x86.pl. | |
23328d4b | 364 | $code.=<<___; |
24798c5e | 365 | .globl gcm_init_vis3 |
23328d4b | 366 | .align 32 |
24798c5e | 367 | gcm_init_vis3: |
23328d4b AP |
368 | save %sp,-$frame,%sp |
369 | ||
24798c5e AP |
370 | ldx [%i1+0],$Hhi |
371 | ldx [%i1+8],$Hlo | |
372 | mov 0xE1,$Xhi | |
373 | mov 1,$Xlo | |
374 | sllx $Xhi,57,$Xhi | |
3766e7cc | 375 | srax $Hhi,63,$C0 ! broadcast carry |
24798c5e AP |
376 | addcc $Hlo,$Hlo,$Hlo ! H<<=1 |
377 | addxc $Hhi,$Hhi,$Hhi | |
3766e7cc AP |
378 | and $C0,$Xlo,$Xlo |
379 | and $C0,$Xhi,$Xhi | |
24798c5e AP |
380 | xor $Xlo,$Hlo,$Hlo |
381 | xor $Xhi,$Hhi,$Hhi | |
382 | stx $Hlo,[%i0+8] ! save twisted H | |
383 | stx $Hhi,[%i0+0] | |
23328d4b | 384 | |
3766e7cc AP |
385 | sethi %hi(0xA0406080),$V |
386 | sethi %hi(0x20C0E000),%l0 | |
387 | or $V,%lo(0xA0406080),$V | |
388 | or %l0,%lo(0x20C0E000),%l0 | |
389 | sllx $V,32,$V | |
053fa39a | 390 | or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
3766e7cc AP |
391 | stx $V,[%i0+16] |
392 | ||
24798c5e AP |
393 | ret |
394 | restore | |
395 | .type gcm_init_vis3,#function | |
396 | .size gcm_init_vis3,.-gcm_init_vis3 | |
23328d4b | 397 | |
24798c5e AP |
398 | .globl gcm_gmult_vis3 |
399 | .align 32 | |
400 | gcm_gmult_vis3: | |
401 | save %sp,-$frame,%sp | |
23328d4b | 402 | |
24798c5e AP |
403 | ldx [$Xip+8],$Xlo ! load Xi |
404 | ldx [$Xip+0],$Xhi | |
405 | ldx [$Htable+8],$Hlo ! load twisted H | |
406 | ldx [$Htable+0],$Hhi | |
407 | ||
3766e7cc AP |
408 | mov 0xE1,%l7 |
409 | sllx %l7,57,$xE1 ! 57 is not a typo | |
053fa39a | 410 | ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
24798c5e | 411 | |
3766e7cc | 412 | xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing |
24798c5e AP |
413 | xmulx $Xlo,$Hlo,$C0 |
414 | xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing | |
415 | xmulx $C2,$Hhl,$C1 | |
416 | xmulxhi $Xlo,$Hlo,$Xlo | |
417 | xmulxhi $C2,$Hhl,$C2 | |
418 | xmulxhi $Xhi,$Hhi,$C3 | |
419 | xmulx $Xhi,$Hhi,$Xhi | |
420 | ||
421 | sll $C0,3,$sqr | |
053fa39a | 422 | srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] |
24798c5e | 423 | xor $C0,$sqr,$sqr |
053fa39a | 424 | sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] |
24798c5e AP |
425 | |
426 | xor $C0,$C1,$C1 ! Karatsuba post-processing | |
427 | xor $Xlo,$C2,$C2 | |
3766e7cc | 428 | xor $sqr,$Xlo,$Xlo ! real destination is $C1 |
24798c5e AP |
429 | xor $C3,$C2,$C2 |
430 | xor $Xlo,$C1,$C1 | |
3766e7cc AP |
431 | xor $Xhi,$C2,$C2 |
432 | xor $Xhi,$C1,$C1 | |
24798c5e | 433 | |
053fa39a | 434 | xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 |
24798c5e AP |
435 | xor $C0,$C2,$C2 |
436 | xmulx $C1,$xE1,$C0 | |
437 | xor $C1,$C3,$C3 | |
438 | xmulxhi $C1,$xE1,$C1 | |
439 | ||
440 | xor $Xlo,$C2,$C2 | |
24798c5e AP |
441 | xor $C0,$C2,$C2 |
442 | xor $C1,$C3,$C3 | |
443 | ||
444 | stx $C2,[$Xip+8] ! save Xi | |
445 | stx $C3,[$Xip+0] | |
23328d4b AP |
446 | |
447 | ret | |
448 | restore | |
449 | .type gcm_gmult_vis3,#function | |
450 | .size gcm_gmult_vis3,.-gcm_gmult_vis3 | |
451 | ||
452 | .globl gcm_ghash_vis3 | |
453 | .align 32 | |
454 | gcm_ghash_vis3: | |
455 | save %sp,-$frame,%sp | |
f198cc43 AP |
456 | nop |
457 | srln $len,0,$len ! needed on v8+, "nop" on v9 | |
23328d4b | 458 | |
24798c5e AP |
459 | ldx [$Xip+8],$C2 ! load Xi |
460 | ldx [$Xip+0],$C3 | |
461 | ldx [$Htable+8],$Hlo ! load twisted H | |
462 | ldx [$Htable+0],$Hhi | |
463 | ||
24798c5e | 464 | mov 0xE1,%l7 |
24798c5e | 465 | sllx %l7,57,$xE1 ! 57 is not a typo |
053fa39a | 466 | ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
24798c5e | 467 | |
23328d4b AP |
468 | and $inp,7,$shl |
469 | andn $inp,7,$inp | |
23328d4b AP |
470 | sll $shl,3,$shl |
471 | prefetch [$inp+63], 20 | |
23328d4b | 472 | sub %g0,$shl,$shr |
24798c5e AP |
473 | |
474 | xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing | |
23328d4b | 475 | .Loop: |
24798c5e | 476 | ldx [$inp+8],$Xlo |
23328d4b | 477 | brz,pt $shl,1f |
24798c5e AP |
478 | ldx [$inp+0],$Xhi |
479 | ||
480 | ldx [$inp+16],$C1 ! align data | |
481 | srlx $Xlo,$shr,$C0 | |
482 | sllx $Xlo,$shl,$Xlo | |
483 | sllx $Xhi,$shl,$Xhi | |
484 | srlx $C1,$shr,$C1 | |
485 | or $C0,$Xhi,$Xhi | |
486 | or $C1,$Xlo,$Xlo | |
23328d4b AP |
487 | 1: |
488 | add $inp,16,$inp | |
489 | sub $len,16,$len | |
24798c5e AP |
490 | xor $C2,$Xlo,$Xlo |
491 | xor $C3,$Xhi,$Xhi | |
23328d4b AP |
492 | prefetch [$inp+63], 20 |
493 | ||
24798c5e AP |
494 | xmulx $Xlo,$Hlo,$C0 |
495 | xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing | |
496 | xmulx $C2,$Hhl,$C1 | |
497 | xmulxhi $Xlo,$Hlo,$Xlo | |
498 | xmulxhi $C2,$Hhl,$C2 | |
499 | xmulxhi $Xhi,$Hhi,$C3 | |
500 | xmulx $Xhi,$Hhi,$Xhi | |
501 | ||
502 | sll $C0,3,$sqr | |
053fa39a | 503 | srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] |
24798c5e | 504 | xor $C0,$sqr,$sqr |
053fa39a | 505 | sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] |
24798c5e AP |
506 | |
507 | xor $C0,$C1,$C1 ! Karatsuba post-processing | |
508 | xor $Xlo,$C2,$C2 | |
3766e7cc | 509 | xor $sqr,$Xlo,$Xlo ! real destination is $C1 |
24798c5e AP |
510 | xor $C3,$C2,$C2 |
511 | xor $Xlo,$C1,$C1 | |
3766e7cc AP |
512 | xor $Xhi,$C2,$C2 |
513 | xor $Xhi,$C1,$C1 | |
24798c5e | 514 | |
053fa39a | 515 | xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 |
24798c5e AP |
516 | xor $C0,$C2,$C2 |
517 | xmulx $C1,$xE1,$C0 | |
518 | xor $C1,$C3,$C3 | |
519 | xmulxhi $C1,$xE1,$C1 | |
520 | ||
521 | xor $Xlo,$C2,$C2 | |
24798c5e | 522 | xor $C0,$C2,$C2 |
23328d4b | 523 | brnz,pt $len,.Loop |
24798c5e | 524 | xor $C1,$C3,$C3 |
23328d4b | 525 | |
24798c5e AP |
526 | stx $C2,[$Xip+8] ! save Xi |
527 | stx $C3,[$Xip+0] | |
23328d4b AP |
528 | |
529 | ret | |
530 | restore | |
531 | .type gcm_ghash_vis3,#function | |
532 | .size gcm_ghash_vis3,.-gcm_ghash_vis3 | |
533 | ___ | |
534 | }}} | |
# Final template fragment: a NUL-terminated identification string followed
# by re-alignment to 4 bytes.  "\@" keeps Perl from interpolating an array
# inside the heredoc.
535 | $code.=<<___; | |
536 | .asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>" | |
c32fcca6 | 537 | .align 4 |
c3473126 AP |
538 | ___ |
539 | ||
23328d4b AP |
540 | \f |
541 | # Purpose of these subroutines is to explicitly encode VIS instructions, | |
542 | # so that one can compile the module without having to specify VIS | |
478b50cf | 543 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. |
23328d4b AP |
544 | # The idea is to keep open the option of producing a "universal" binary and let |
545 | # the programmer detect at run-time whether the current CPU is VIS capable. | |
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Encode one three-register VIS3 instruction as a raw ".word" so that the
# assembler does not need to know the VIS3 extension.
#
#   $mnemonic  instruction name; only addxc, addxccc, xmulx and xmulxhi
#              are encoded
#   $rs1,$rs2  source registers, written as %g0-%g7, %o0-%o7, %l0-%l7 or
#              %i0-%i7
#   $rd        destination register, same notation
#
# Returns ".word\t0x........ !<original text>" when the instruction can be
# encoded.  For an unknown mnemonic, or any operand that is not exactly a
# %[goli][0-7] register, the original "mnemonic\trs1,rs2,rd" text is
# returned unchanged and left for the assembler to handle.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;

    # Offset of each register window group within the 5-bit register number.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    # opf field values for the supported instructions.
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "xmulx"   => 0x115,
                   "xmulxhi" => 0x116 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    # Convert each operand in place to its register number.  The match is
    # anchored and limited to digits 0-7: an operand such as %g8 or %o10
    # must not be silently encoded as a different register.
    foreach ($rs1,$rs2,$rd) {
        return $ref if (!/\A%([goli])([0-7])\z/);
        $_=$bias{$1}+$2;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                   $ref;
}
570 | ||
# Post-process the accumulated template line by line and write it to
# STDOUT:
#   1. back-ticked expressions (e.g. the rem_4bit constants) are evaluated
#      as Perl and replaced by their value;
#   2. xmulx/xmulxhi/addxc/addxccc instructions with plain register
#      operands are replaced by their .word encoding via unvis3().
foreach my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1,$2,$3,$4)
     /ge;

    print $line,"\n";
}

# Buffered write errors (e.g. a full disk) only surface at close; fail
# loudly so that a truncated output file is never mistaken for success.
close STDOUT or die "error closing STDOUT: $!";