]>
Commit | Line | Data |
---|---|---|
ee0449b1 AP |
1 | #!/usr/bin/env perl |
2 | # | |
3 | # ==================================================================== | |
f889bb03 | 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
ee0449b1 AP |
5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | # | |
10 | # SHA256 block transform for x86. September 2007. | |
11 | # | |
ee9bf3eb | 12 | # Performance improvement over compiler generated code varies from |
053fa39a | 13 | # 10% to 40% [see below]. Not very impressive on some µ-archs, but |
ee9bf3eb AP |
14 | # it's 5 times smaller and optimizes the amount of writes. |
15 | # | |
16 | # May 2012. | |
17 | # | |
f889bb03 AP |
18 | # Optimization including two of Pavel Semjanov's ideas, alternative |
19 | # Maj and full unroll, resulted in ~20-25% improvement on most CPUs, | |
3a9b3852 AP |
20 | # ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost |
21 | # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not | |
22 | # on P4, where it kills performance, nor Sandy Bridge, where folded | |
23 | # loop is approximately as fast... | |
ee9bf3eb | 24 | # |
f3eac74b AP |
25 | # June 2012. |
26 | # | |
27 | # Add AMD XOP-specific code path, >30% improvement on Bulldozer over | |
28 | # May version, >60% over original. Add AVX+shrd code path, >25% | |
29 | # improvement on Sandy Bridge over May version, 60% over original. | |
30 | # | |
32213d8d AP |
31 | # May 2013. |
32 | # | |
33 | # Replace AMD XOP code path with SSSE3 to cover more processors. | |
34 | # (Biggest improvement coefficient is on upcoming Atom Silvermont, | |
35 | # not shown.) Add AVX+BMI code path. | |
36 | # | |
619b9466 AP |
37 | # March 2014. |
38 | # | |
39 | # Add support for Intel SHA Extensions. | |
40 | # | |
ee0449b1 AP |
41 | # Performance in clock cycles per processed byte (less is better): |
42 | # | |
32213d8d AP |
43 | # gcc icc x86 asm(*) SIMD x86_64 asm(**) |
44 | # Pentium 46 57 40/38 - - | |
45 | # PIII 36 33 27/24 - - | |
46 | # P4 41 38 28 - 17.3 | |
47 | # AMD K8 27 25 19/15.5 - 14.9 | |
48 | # Core2 26 23 18/15.6 14.3 13.8 | |
49 | # Westmere 27 - 19/15.7 13.4 12.3 | |
50 | # Sandy Bridge 25 - 15.9 12.4 11.6 | |
51 | # Ivy Bridge 24 - 15.0 11.4 10.3 | |
52 | # Haswell 22 - 13.9 9.46 7.80 | |
53 | # Bulldozer 36 - 27/22 17.0 13.6 | |
54 | # VIA Nano 36 - 25/22 16.8 16.5 | |
55 | # Atom 50 - 30/25 21.9 18.9 | |
b59f92e7 | 56 | # Silvermont 40 - 34/31 22.9 20.6 |
ee0449b1 | 57 | # |
32213d8d | 58 | # (*) numbers after slash are for unrolled loop, where applicable; |
f3eac74b | 59 | # (**) x86_64 assembly performance is presented for reference |
32213d8d | 60 | # purposes, results are best-available; |
ee0449b1 AP |
61 | |
62 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
63 | push(@INC,"${dir}","${dir}../../perlasm"); | |
64 | require "x86asm.pl"; | |
65 | ||
e87e380a RL |
66 | $output=pop; # last command-line argument names the output file |
67 | open STDOUT,">",$output or die "can't open $output: $!"; # everything emitted below goes to STDOUT; fail loudly instead of losing it | |
68 | ||
ee0449b1 AP |
69 | &asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386"); # this generator emits sha256_block_data_order, so name the module sha256, not sha512 |
70 | ||
32213d8d | 71 | $xmm=$avx=0; |
f3eac74b AP |
72 | for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } |
73 | ||
32213d8d AP |
74 | if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
75 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
76 | $avx = ($1>=2.19) + ($1>=2.22); | |
77 | } | |
f3eac74b | 78 | |
32213d8d AP |
79 | if ($xmm && !$avx && $ARGV[0] eq "win32n" && |
80 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
81 | $avx = ($1>=2.03) + ($1>=2.10); | |
82 | } | |
f3eac74b | 83 | |
32213d8d AP |
84 | if ($xmm && !$avx && $ARGV[0] eq "win32" && |
85 | `ml 2>&1` =~ /Version ([0-9]+)\./) { | |
86 | $avx = ($1>=10) + ($1>=11); | |
87 | } | |
f3eac74b | 88 | |
a356e488 AP |
89 | if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { |
90 | $avx = ($2>=3.0) + ($2>3.0); | |
ac171925 AP |
91 | } |
92 | ||
977f32e8 AP |
93 | $shaext=$xmm; ### set to zero if compiling for 1.0.1 |
94 | ||
3a9b3852 AP |
95 | $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of |
96 | # fully unrolled loop was measured to run about | |
97 | # 3-4x slower. If slowdown coefficient is N and | |
98 | # unrolled loop is m times faster, then you break | |
99 | # even at (N-1)/(m-1) blocks. Then it needs to be | |
100 | # adjusted for probability of code being evicted, | |
101 | # code size/cache size=1/4. Typical m is 1.15... | |
f889bb03 | 102 | |
ee0449b1 AP |
103 | $A="eax"; |
104 | $E="edx"; | |
105 | $T="ebx"; | |
ee9bf3eb AP |
106 | $Aoff=&DWP(4,"esp"); |
107 | $Boff=&DWP(8,"esp"); | |
108 | $Coff=&DWP(12,"esp"); | |
109 | $Doff=&DWP(16,"esp"); | |
110 | $Eoff=&DWP(20,"esp"); | |
111 | $Foff=&DWP(24,"esp"); | |
112 | $Goff=&DWP(28,"esp"); | |
113 | $Hoff=&DWP(32,"esp"); | |
114 | $Xoff=&DWP(36,"esp"); | |
ee0449b1 AP |
115 | $K256="ebp"; |
116 | ||
f889bb03 AP |
117 | sub BODY_16_63() { |
118 | &mov ($T,"ecx"); # "ecx" is preloaded | |
119 | &mov ("esi",&DWP(4*(9+15+16-14),"esp")); | |
120 | &ror ("ecx",18-7); | |
121 | &mov ("edi","esi"); | |
122 | &ror ("esi",19-17); | |
123 | &xor ("ecx",$T); | |
3a9b3852 | 124 | &shr ($T,3); |
f889bb03 | 125 | &ror ("ecx",7); |
3a9b3852 | 126 | &xor ("esi","edi"); |
f889bb03 AP |
127 | &xor ($T,"ecx"); # T = sigma0(X[-15]) |
128 | &ror ("esi",17); | |
129 | &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] | |
130 | &shr ("edi",10); | |
131 | &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] | |
132 | #&xor ("edi","esi") # sigma1(X[-2]) | |
133 | # &add ($T,"edi"); # T += sigma1(X[-2]) | |
134 | # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] | |
ee9bf3eb | 135 | |
f889bb03 AP |
136 | &BODY_00_15(1); |
137 | } | |
ee0449b1 | 138 | sub BODY_00_15() { |
b5e5760d | 139 | my $in_16_63=shift; |
8dc899de | 140 | |
ee0449b1 | 141 | &mov ("ecx",$E); |
f889bb03 | 142 | &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) |
ee0449b1 | 143 | &mov ("esi",$Foff); |
ee9bf3eb AP |
144 | &ror ("ecx",25-11); |
145 | &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) | |
ee9bf3eb | 146 | &mov ("edi",$Goff); |
7470276a | 147 | &xor ("ecx",$E); |
d4bb6bdd | 148 | &xor ("esi","edi"); |
f889bb03 | 149 | &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); |
ee9bf3eb | 150 | &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] |
d4bb6bdd | 151 | &ror ("ecx",11-6); |
ee9bf3eb | 152 | &and ("esi",$E); |
f889bb03 | 153 | &mov ($Eoff,$E); # modulo-scheduled |
d4bb6bdd | 154 | &xor ($E,"ecx"); |
f889bb03 | 155 | &add ($T,$Hoff); # T += h |
3a9b3852 | 156 | &xor ("esi","edi"); # Ch(e,f,g) |
f889bb03 | 157 | &ror ($E,6); # Sigma1(e) |
d4bb6bdd | 158 | &mov ("ecx",$A); |
f889bb03 | 159 | &add ($T,"esi"); # T += Ch(e,f,g) |
ee0449b1 | 160 | |
7470276a | 161 | &ror ("ecx",22-13); |
f889bb03 | 162 | &add ($T,$E); # T += Sigma1(e) |
ee9bf3eb | 163 | &mov ("edi",$Boff); |
7470276a | 164 | &xor ("ecx",$A); |
f889bb03 | 165 | &mov ($Aoff,$A); # modulo-scheduled |
d4bb6bdd AP |
166 | &lea ("esp",&DWP(-4,"esp")); |
167 | &ror ("ecx",13-2); | |
168 | &mov ("esi",&DWP(0,$K256)); | |
169 | &xor ("ecx",$A); | |
f889bb03 AP |
170 | &mov ($E,$Eoff); # e in next iteration, d in this one |
171 | &xor ($A,"edi"); # a ^= b | |
172 | &ror ("ecx",2); # Sigma0(a) | |
ee0449b1 | 173 | |
f889bb03 AP |
174 | &add ($T,"esi"); # T+= K[i] |
175 | &mov (&DWP(0,"esp"),$A); # (b^c) in next round | |
176 | &add ($E,$T); # d += T | |
ee9bf3eb | 177 | &and ($A,&DWP(4,"esp")); # a &= (b^c) |
f889bb03 AP |
178 | &add ($T,"ecx"); # T += Sigma0(a) |
179 | &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) | |
180 | &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T | |
ee0449b1 | 181 | &add ($K256,4); |
f889bb03 | 182 | &add ($A,$T); # h += T |
ee0449b1 AP |
183 | } |
184 | ||
f889bb03 AP |
185 | &external_label("OPENSSL_ia32cap_P") if (!$i386); |
186 | ||
87facba3 | 187 | &function_begin("sha256_block_data_order"); |
ee0449b1 AP |
188 | &mov ("esi",wparam(0)); # ctx |
189 | &mov ("edi",wparam(1)); # inp | |
190 | &mov ("eax",wparam(2)); # num | |
191 | &mov ("ebx","esp"); # saved sp | |
192 | ||
193 | &call (&label("pic_point")); # make it PIC! | |
194 | &set_label("pic_point"); | |
195 | &blindpop($K256); | |
196 | &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); | |
197 | ||
198 | &sub ("esp",16); | |
199 | &and ("esp",-64); | |
200 | ||
201 | &shl ("eax",6); | |
202 | &add ("eax","edi"); | |
203 | &mov (&DWP(0,"esp"),"esi"); # ctx | |
204 | &mov (&DWP(4,"esp"),"edi"); # inp | |
205 | &mov (&DWP(8,"esp"),"eax"); # inp+num*64, i.e. end of input (num was shifted left by 6 above) | |
206 | &mov (&DWP(12,"esp"),"ebx"); # saved sp | |
d49135e7 | 207 | if (!$i386 && $xmm) { |
f889bb03 AP |
208 | &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); |
209 | &mov ("ecx",&DWP(0,"edx")); | |
32213d8d | 210 | &mov ("ebx",&DWP(4,"edx")); |
f889bb03 AP |
211 | &test ("ecx",1<<20); # check for P4 |
212 | &jnz (&label("loop")); | |
619b9466 AP |
213 | &mov ("edx",&DWP(8,"edx")) if ($xmm); |
214 | &test ("ecx",1<<24); # check for FXSR | |
215 | &jz ($unroll_after?&label("no_xmm"):&label("loop")); | |
f889bb03 | 216 | &and ("ecx",1<<30); # mask "Intel CPU" bit |
32213d8d | 217 | &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits |
977f32e8 AP |
218 | &test ("edx",1<<29) if ($shaext); # check for SHA |
219 | &jnz (&label("shaext")) if ($shaext); | |
32213d8d AP |
220 | &or ("ecx","ebx"); |
221 | &and ("ecx",1<<28|1<<30); | |
f889bb03 | 222 | &cmp ("ecx",1<<28|1<<30); |
32213d8d AP |
223 | if ($xmm) { |
224 | &je (&label("AVX")) if ($avx); | |
225 | &test ("ebx",1<<9); # check for SSSE3 | |
226 | &jnz (&label("SSSE3")); | |
227 | } else { | |
228 | &je (&label("loop_shrd")); | |
229 | } | |
f889bb03 | 230 | if ($unroll_after) { |
619b9466 | 231 | &set_label("no_xmm"); |
f889bb03 AP |
232 | &sub ("eax","edi"); |
233 | &cmp ("eax",$unroll_after); | |
3a9b3852 | 234 | &jae (&label("unrolled")); |
f889bb03 AP |
235 | } } |
236 | &jmp (&label("loop")); | |
ee0449b1 | 237 | |
f889bb03 AP |
238 | sub COMPACT_LOOP() { |
239 | my $suffix=shift; | |
240 | ||
32213d8d | 241 | &set_label("loop$suffix",$suffix?32:16); |
ee0449b1 AP |
242 | # copy input block to stack reversing byte and dword order |
243 | for($i=0;$i<4;$i++) { | |
244 | &mov ("eax",&DWP($i*16+0,"edi")); | |
245 | &mov ("ebx",&DWP($i*16+4,"edi")); | |
246 | &mov ("ecx",&DWP($i*16+8,"edi")); | |
ee0449b1 | 247 | &bswap ("eax"); |
ee9bf3eb | 248 | &mov ("edx",&DWP($i*16+12,"edi")); |
ee0449b1 | 249 | &bswap ("ebx"); |
ee0449b1 | 250 | &push ("eax"); |
ee9bf3eb | 251 | &bswap ("ecx"); |
ee0449b1 | 252 | &push ("ebx"); |
ee9bf3eb | 253 | &bswap ("edx"); |
ee0449b1 AP |
254 | &push ("ecx"); |
255 | &push ("edx"); | |
256 | } | |
257 | &add ("edi",64); | |
ee9bf3eb AP |
258 | &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H |
259 | &mov (&DWP(4*(9+16)+4,"esp"),"edi"); | |
ee0449b1 AP |
260 | |
261 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
262 | &mov ($A,&DWP(0,"esi")); | |
263 | &mov ("ebx",&DWP(4,"esi")); | |
264 | &mov ("ecx",&DWP(8,"esi")); | |
265 | &mov ("edi",&DWP(12,"esi")); | |
266 | # &mov ($Aoff,$A); | |
267 | &mov ($Boff,"ebx"); | |
ee9bf3eb | 268 | &xor ("ebx","ecx"); |
ee0449b1 AP |
269 | &mov ($Coff,"ecx"); |
270 | &mov ($Doff,"edi"); | |
ee9bf3eb | 271 | &mov (&DWP(0,"esp"),"ebx"); # magic |
ee0449b1 AP |
272 | &mov ($E,&DWP(16,"esi")); |
273 | &mov ("ebx",&DWP(20,"esi")); | |
274 | &mov ("ecx",&DWP(24,"esi")); | |
275 | &mov ("edi",&DWP(28,"esi")); | |
276 | # &mov ($Eoff,$E); | |
277 | &mov ($Foff,"ebx"); | |
278 | &mov ($Goff,"ecx"); | |
279 | &mov ($Hoff,"edi"); | |
280 | ||
f889bb03 | 281 | &set_label("00_15$suffix",16); |
ee0449b1 AP |
282 | |
283 | &BODY_00_15(); | |
284 | ||
285 | &cmp ("esi",0xc19bf174); | |
f889bb03 | 286 | &jne (&label("00_15$suffix")); |
ee0449b1 | 287 | |
f889bb03 AP |
288 | &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) |
289 | &jmp (&label("16_63$suffix")); | |
290 | ||
291 | &set_label("16_63$suffix",16); | |
292 | ||
293 | &BODY_16_63(); | |
ee0449b1 AP |
294 | |
295 | &cmp ("esi",0xc67178f2); | |
f889bb03 | 296 | &jne (&label("16_63$suffix")); |
ee0449b1 | 297 | |
ee9bf3eb | 298 | &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx |
ee0449b1 AP |
299 | # &mov ($A,$Aoff); |
300 | &mov ("ebx",$Boff); | |
f889bb03 AP |
301 | # &mov ("edi",$Coff); |
302 | &mov ("ecx",$Doff); | |
ee0449b1 AP |
303 | &add ($A,&DWP(0,"esi")); |
304 | &add ("ebx",&DWP(4,"esi")); | |
f889bb03 AP |
305 | &add ("edi",&DWP(8,"esi")); |
306 | &add ("ecx",&DWP(12,"esi")); | |
ee0449b1 AP |
307 | &mov (&DWP(0,"esi"),$A); |
308 | &mov (&DWP(4,"esi"),"ebx"); | |
f889bb03 AP |
309 | &mov (&DWP(8,"esi"),"edi"); |
310 | &mov (&DWP(12,"esi"),"ecx"); | |
ee0449b1 AP |
311 | # &mov ($E,$Eoff); |
312 | &mov ("eax",$Foff); | |
313 | &mov ("ebx",$Goff); | |
314 | &mov ("ecx",$Hoff); | |
ee9bf3eb | 315 | &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp |
ee0449b1 AP |
316 | &add ($E,&DWP(16,"esi")); |
317 | &add ("eax",&DWP(20,"esi")); | |
318 | &add ("ebx",&DWP(24,"esi")); | |
319 | &add ("ecx",&DWP(28,"esi")); | |
320 | &mov (&DWP(16,"esi"),$E); | |
321 | &mov (&DWP(20,"esi"),"eax"); | |
322 | &mov (&DWP(24,"esi"),"ebx"); | |
323 | &mov (&DWP(28,"esi"),"ecx"); | |
324 | ||
ee9bf3eb | 325 | &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame |
ee0449b1 AP |
326 | &sub ($K256,4*64); # rewind K |
327 | ||
328 | &cmp ("edi",&DWP(8,"esp")); # are we done yet? | |
f889bb03 AP |
329 | &jb (&label("loop$suffix")); |
330 | } | |
331 | &COMPACT_LOOP(); | |
332 | &mov ("esp",&DWP(12,"esp")); # restore sp | |
333 | &function_end_A(); | |
32213d8d | 334 | if (!$i386 && !$xmm) { |
f889bb03 AP |
335 | # ~20% improvement on Sandy Bridge |
336 | local *ror = sub { &shrd(@_[0],@_) }; | |
337 | &COMPACT_LOOP("_shrd"); | |
ee0449b1 AP |
338 | &mov ("esp",&DWP(12,"esp")); # restore sp |
339 | &function_end_A(); | |
f889bb03 | 340 | } |
ee0449b1 AP |
341 | |
342 | &set_label("K256",64); # Yes! I keep it in the code segment! | |
f889bb03 AP |
343 | @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, |
344 | 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, | |
345 | 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, | |
346 | 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, | |
347 | 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, | |
348 | 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, | |
349 | 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, | |
350 | 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, | |
351 | 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, | |
352 | 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, | |
353 | 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, | |
354 | 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, | |
355 | 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, | |
356 | 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, | |
357 | 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, | |
358 | 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); | |
359 | &data_word(@K256); | |
32213d8d AP |
360 | &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask |
361 | &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
362 | ||
363 | ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets | |
364 | sub off { &DWP(4*(((shift)-$i)&7),"esp"); } | |
f889bb03 AP |
365 | |
366 | if (!$i386 && $unroll_after) { | |
367 | my @AH=($A,$K256); | |
368 | ||
369 | &set_label("unrolled",16); | |
370 | &lea ("esp",&DWP(-96,"esp")); | |
371 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
372 | &mov ($AH[0],&DWP(0,"esi")); | |
373 | &mov ($AH[1],&DWP(4,"esi")); | |
374 | &mov ("ecx",&DWP(8,"esi")); | |
375 | &mov ("ebx",&DWP(12,"esi")); | |
376 | #&mov (&DWP(0,"esp"),$AH[0]); | |
377 | &mov (&DWP(4,"esp"),$AH[1]); | |
378 | &xor ($AH[1],"ecx"); # magic | |
379 | &mov (&DWP(8,"esp"),"ecx"); | |
380 | &mov (&DWP(12,"esp"),"ebx"); | |
381 | &mov ($E,&DWP(16,"esi")); | |
382 | &mov ("ebx",&DWP(20,"esi")); | |
383 | &mov ("ecx",&DWP(24,"esi")); | |
384 | &mov ("esi",&DWP(28,"esi")); | |
385 | #&mov (&DWP(16,"esp"),$E); | |
386 | &mov (&DWP(20,"esp"),"ebx"); | |
387 | &mov (&DWP(24,"esp"),"ecx"); | |
388 | &mov (&DWP(28,"esp"),"esi"); | |
389 | &jmp (&label("grand_loop")); | |
390 | ||
391 | &set_label("grand_loop",16); | |
392 | # copy input block to stack reversing byte order | |
393 | for($i=0;$i<5;$i++) { | |
394 | &mov ("ebx",&DWP(12*$i+0,"edi")); | |
395 | &mov ("ecx",&DWP(12*$i+4,"edi")); | |
396 | &bswap ("ebx"); | |
397 | &mov ("esi",&DWP(12*$i+8,"edi")); | |
398 | &bswap ("ecx"); | |
399 | &mov (&DWP(32+12*$i+0,"esp"),"ebx"); | |
400 | &bswap ("esi"); | |
401 | &mov (&DWP(32+12*$i+4,"esp"),"ecx"); | |
402 | &mov (&DWP(32+12*$i+8,"esp"),"esi"); | |
403 | } | |
404 | &mov ("ebx",&DWP($i*12,"edi")); | |
405 | &add ("edi",64); | |
406 | &bswap ("ebx"); | |
407 | &mov (&DWP(96+4,"esp"),"edi"); | |
408 | &mov (&DWP(32+12*$i,"esp"),"ebx"); | |
409 | ||
3a9b3852 | 410 | my ($t1,$t2) = ("ecx","esi"); |
f889bb03 AP |
411 | |
412 | for ($i=0;$i<64;$i++) { | |
413 | ||
414 | if ($i>=16) { | |
3a9b3852 AP |
415 | &mov ($T,$t1); # $t1 is preloaded |
416 | # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); | |
417 | &ror ($t1,18-7); | |
418 | &mov ("edi",$t2); | |
419 | &ror ($t2,19-17); | |
420 | &xor ($t1,$T); | |
421 | &shr ($T,3); | |
422 | &ror ($t1,7); | |
423 | &xor ($t2,"edi"); | |
424 | &xor ($T,$t1); # T = sigma0(X[-15]) | |
425 | &ror ($t2,17); | |
f889bb03 AP |
426 | &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] |
427 | &shr ("edi",10); | |
428 | &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] | |
3a9b3852 | 429 | #&xor ("edi",$t2) # sigma1(X[-2]) |
f889bb03 AP |
430 | # &add ($T,"edi"); # T += sigma1(X[-2]) |
431 | # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] | |
432 | } | |
3a9b3852 AP |
433 | &mov ($t1,$E); |
434 | &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) | |
435 | &mov ($t2,&off($f)); | |
436 | &ror ($E,25-11); | |
f889bb03 AP |
437 | &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) |
438 | &mov ("edi",&off($g)); | |
3a9b3852 | 439 | &xor ($E,$t1); |
f889bb03 | 440 | &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] |
3a9b3852 AP |
441 | &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] |
442 | &xor ($t2,"edi"); | |
443 | &ror ($E,11-6); | |
444 | &and ($t2,$t1); | |
445 | &mov (&off($e),$t1); # save $E, modulo-scheduled | |
446 | &xor ($E,$t1); | |
f889bb03 | 447 | &add ($T,&off($h)); # T += h |
3a9b3852 | 448 | &xor ("edi",$t2); # Ch(e,f,g) |
f889bb03 | 449 | &ror ($E,6); # Sigma1(e) |
3a9b3852 AP |
450 | &mov ($t1,$AH[0]); |
451 | &add ($T,"edi"); # T += Ch(e,f,g) | |
f889bb03 | 452 | |
3a9b3852 AP |
453 | &ror ($t1,22-13); |
454 | &mov ($t2,$AH[0]); | |
f889bb03 | 455 | &mov ("edi",&off($b)); |
3a9b3852 AP |
456 | &xor ($t1,$AH[0]); |
457 | &mov (&off($a),$AH[0]); # save $A, modulo-scheduled | |
f889bb03 | 458 | &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round |
3a9b3852 AP |
459 | &ror ($t1,13-2); |
460 | &and ($AH[1],$AH[0]); # (b^c) &= (a^b) | |
461 | &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(e)+K[i], sum is left in $E | |
462 | &xor ($t1,$t2); | |
463 | &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) | |
464 | &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); | |
465 | &ror ($t1,2); # Sigma0(a) | |
466 | ||
467 | &add ($AH[1],$E); # h += T | |
468 | &add ($E,&off($d)); # d += T | |
469 | &add ($AH[1],$t1); # h += Sigma0(a) | |
470 | &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); | |
471 | ||
472 | @AH = reverse(@AH); # rotate(a,h) | |
473 | ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) | |
f889bb03 AP |
474 | } |
475 | &mov ("esi",&DWP(96,"esp")); #ctx | |
476 | #&mov ($AH[0],&DWP(0,"esp")); | |
477 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
478 | #&mov ("edi", &DWP(8,"esp")); | |
479 | &mov ("ecx",&DWP(12,"esp")); | |
480 | &add ($AH[0],&DWP(0,"esi")); | |
481 | &add ($AH[1],&DWP(4,"esi")); | |
482 | &add ("edi",&DWP(8,"esi")); | |
483 | &add ("ecx",&DWP(12,"esi")); | |
484 | &mov (&DWP(0,"esi"),$AH[0]); | |
485 | &mov (&DWP(4,"esi"),$AH[1]); | |
486 | &mov (&DWP(8,"esi"),"edi"); | |
487 | &mov (&DWP(12,"esi"),"ecx"); | |
488 | #&mov (&DWP(0,"esp"),$AH[0]); | |
489 | &mov (&DWP(4,"esp"),$AH[1]); | |
490 | &xor ($AH[1],"edi"); # magic | |
491 | &mov (&DWP(8,"esp"),"edi"); | |
492 | &mov (&DWP(12,"esp"),"ecx"); | |
493 | #&mov ($E,&DWP(16,"esp")); | |
494 | &mov ("edi",&DWP(20,"esp")); | |
495 | &mov ("ebx",&DWP(24,"esp")); | |
496 | &mov ("ecx",&DWP(28,"esp")); | |
497 | &add ($E,&DWP(16,"esi")); | |
498 | &add ("edi",&DWP(20,"esi")); | |
499 | &add ("ebx",&DWP(24,"esi")); | |
500 | &add ("ecx",&DWP(28,"esi")); | |
501 | &mov (&DWP(16,"esi"),$E); | |
502 | &mov (&DWP(20,"esi"),"edi"); | |
503 | &mov (&DWP(24,"esi"),"ebx"); | |
504 | &mov (&DWP(28,"esi"),"ecx"); | |
505 | #&mov (&DWP(16,"esp"),$E); | |
506 | &mov (&DWP(20,"esp"),"edi"); | |
507 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
508 | &mov (&DWP(24,"esp"),"ebx"); | |
509 | &mov (&DWP(28,"esp"),"ecx"); | |
510 | ||
511 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
512 | &jb (&label("grand_loop")); | |
513 | ||
514 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
515 | &function_end_A(); | |
32213d8d AP |
516 | } |
517 | if (!$i386 && $xmm) {{{ | |
977f32e8 | 518 | if ($shaext) { |
619b9466 AP |
519 | ###################################################################### |
520 | # Intel SHA Extensions implementation of SHA256 update function. | |
521 | # | |
522 | my ($ctx,$inp,$end)=("esi","edi","eax"); | |
523 | my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); | |
524 | my @MSG=map("xmm$_",(3..6)); | |
525 | ||
526 | sub sha256op38 { # hand-encode one SHA extension instruction as the bytes 0x0f,0x38,<opcodelet>,ModR/M | |
527 | my ($opcodelet,$dst,$src)=@_; # third opcode byte, destination register name, source register name | |
528 | ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) or die "sha256op38: unsupported operands $dst,$src"; # only xmm0-7,xmm0-7 can be encoded; never drop an instruction silently | |
529 | &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); # register-direct ModR/M: reg field = dst number, r/m field = src number | |
530 | } | |
531 | sub sha256rnds2 { sha256op38(0xcb,@_); } | |
532 | sub sha256msg1 { sha256op38(0xcc,@_); } | |
533 | sub sha256msg2 { sha256op38(0xcd,@_); } | |
534 | ||
535 | &set_label("shaext",32); | |
536 | &sub ("esp",32); | |
537 | ||
538 | &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA | |
539 | &lea ($K256,&DWP(0x80,$K256)); | |
540 | &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE | |
541 | &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask | |
542 | ||
543 | &pshufd ($Wi,$ABEF,0x1b); # ABCD | |
544 | &pshufd ($ABEF,$ABEF,0xb1); # CDAB | |
545 | &pshufd ($CDGH,$CDGH,0x1b); # EFGH | |
546 | &palignr ($ABEF,$CDGH,8); # ABEF | |
547 | &punpcklqdq ($CDGH,$Wi); # CDGH | |
548 | &jmp (&label("loop_shaext")); | |
549 | ||
550 | &set_label("loop_shaext",16); | |
551 | &movdqu (@MSG[0],&QWP(0,$inp)); | |
552 | &movdqu (@MSG[1],&QWP(0x10,$inp)); | |
553 | &movdqu (@MSG[2],&QWP(0x20,$inp)); | |
554 | &pshufb (@MSG[0],$TMP); | |
555 | &movdqu (@MSG[3],&QWP(0x30,$inp)); | |
556 | &movdqa (&QWP(16,"esp"),$CDGH); # offload | |
557 | ||
558 | &movdqa ($Wi,&QWP(0*16-0x80,$K256)); | |
559 | &paddd ($Wi,@MSG[0]); | |
560 | &pshufb (@MSG[1],$TMP); | |
561 | &sha256rnds2 ($CDGH,$ABEF); # 0-3 | |
562 | &pshufd ($Wi,$Wi,0x0e); | |
563 | &nop (); | |
564 | &movdqa (&QWP(0,"esp"),$ABEF); # offload | |
565 | &sha256rnds2 ($ABEF,$CDGH); | |
566 | ||
567 | &movdqa ($Wi,&QWP(1*16-0x80,$K256)); | |
568 | &paddd ($Wi,@MSG[1]); | |
569 | &pshufb (@MSG[2],$TMP); | |
570 | &sha256rnds2 ($CDGH,$ABEF); # 4-7 | |
571 | &pshufd ($Wi,$Wi,0x0e); | |
572 | &lea ($inp,&DWP(0x40,$inp)); | |
573 | &sha256msg1 (@MSG[0],@MSG[1]); | |
574 | &sha256rnds2 ($ABEF,$CDGH); | |
575 | ||
576 | &movdqa ($Wi,&QWP(2*16-0x80,$K256)); | |
577 | &paddd ($Wi,@MSG[2]); | |
578 | &pshufb (@MSG[3],$TMP); | |
579 | &sha256rnds2 ($CDGH,$ABEF); # 8-11 | |
580 | &pshufd ($Wi,$Wi,0x0e); | |
581 | &movdqa ($TMP,@MSG[3]); | |
582 | &palignr ($TMP,@MSG[2],4); | |
583 | &nop (); | |
584 | &paddd (@MSG[0],$TMP); | |
585 | &sha256msg1 (@MSG[1],@MSG[2]); | |
586 | &sha256rnds2 ($ABEF,$CDGH); | |
587 | ||
588 | &movdqa ($Wi,&QWP(3*16-0x80,$K256)); | |
589 | &paddd ($Wi,@MSG[3]); | |
590 | &sha256msg2 (@MSG[0],@MSG[3]); | |
591 | &sha256rnds2 ($CDGH,$ABEF); # 12-15 | |
592 | &pshufd ($Wi,$Wi,0x0e); | |
593 | &movdqa ($TMP,@MSG[0]); | |
594 | &palignr ($TMP,@MSG[3],4); | |
595 | &nop (); | |
596 | &paddd (@MSG[1],$TMP); | |
597 | &sha256msg1 (@MSG[2],@MSG[3]); | |
598 | &sha256rnds2 ($ABEF,$CDGH); | |
599 | ||
600 | for($i=4;$i<16-3;$i++) { | |
601 | &movdqa ($Wi,&QWP($i*16-0x80,$K256)); | |
602 | &paddd ($Wi,@MSG[0]); | |
603 | &sha256msg2 (@MSG[1],@MSG[0]); | |
604 | &sha256rnds2 ($CDGH,$ABEF); # 16-19... | |
605 | &pshufd ($Wi,$Wi,0x0e); | |
606 | &movdqa ($TMP,@MSG[1]); | |
607 | &palignr ($TMP,@MSG[0],4); | |
608 | &nop (); | |
609 | &paddd (@MSG[2],$TMP); | |
610 | &sha256msg1 (@MSG[3],@MSG[0]); | |
611 | &sha256rnds2 ($ABEF,$CDGH); | |
612 | ||
613 | push(@MSG,shift(@MSG)); | |
614 | } | |
615 | &movdqa ($Wi,&QWP(13*16-0x80,$K256)); # K[52..55]; $K256 was advanced by 0x80 on entry to this code path | |
616 | &paddd ($Wi,@MSG[0]); | |
617 | &sha256msg2 (@MSG[1],@MSG[0]); | |
618 | &sha256rnds2 ($CDGH,$ABEF); # 52-55 | |
619 | &pshufd ($Wi,$Wi,0x0e); # bring the upper two dwords of $Wi down for the second sha256rnds2 | |
620 | &movdqa ($TMP,@MSG[1]); # terminated explicitly so it is not parsed as "movdqa(...) & palignr(...)" | |
621 | &palignr ($TMP,@MSG[0],4); | |
622 | &sha256rnds2 ($ABEF,$CDGH); | |
623 | &paddd (@MSG[2],$TMP); | |
624 | ||
625 | &movdqa ($Wi,&QWP(14*16-0x80,$K256)); | |
626 | &paddd ($Wi,@MSG[1]); | |
627 | &sha256rnds2 ($CDGH,$ABEF); # 56-59 | |
628 | &pshufd ($Wi,$Wi,0x0e); | |
629 | &sha256msg2 (@MSG[2],@MSG[1]); | |
630 | &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask | |
631 | &sha256rnds2 ($ABEF,$CDGH); | |
632 | ||
633 | &movdqa ($Wi,&QWP(15*16-0x80,$K256)); | |
634 | &paddd ($Wi,@MSG[2]); | |
635 | &nop (); | |
636 | &sha256rnds2 ($CDGH,$ABEF); # 60-63 | |
637 | &pshufd ($Wi,$Wi,0x0e); | |
638 | &cmp ($end,$inp); | |
639 | &nop (); | |
640 | &sha256rnds2 ($ABEF,$CDGH); | |
641 | ||
642 | &paddd ($CDGH,&QWP(16,"esp")); | |
643 | &paddd ($ABEF,&QWP(0,"esp")); | |
644 | &jnz (&label("loop_shaext")); | |
645 | ||
646 | &pshufd ($CDGH,$CDGH,0xb1); # DCHG | |
647 | &pshufd ($TMP,$ABEF,0x1b); # FEBA | |
648 | &pshufd ($ABEF,$ABEF,0xb1); # BAFE | |
649 | &punpckhqdq ($ABEF,$CDGH); # DCBA | |
650 | &palignr ($CDGH,$TMP,8); # HGFE | |
651 | ||
652 | &mov ("esp",&DWP(32+12,"esp")); | |
653 | &movdqu (&QWP(0,$ctx),$ABEF); | |
654 | &movdqu (&QWP(16,$ctx),$CDGH); | |
655 | &function_end_A(); | |
656 | } | |
657 | ||
f3eac74b AP |
658 | my @X = map("xmm$_",(0..3)); |
659 | my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); | |
660 | my @AH = ($A,$T); | |
661 | ||
32213d8d | 662 | &set_label("SSSE3",32); |
f3eac74b | 663 | &lea ("esp",&DWP(-96,"esp")); |
f3eac74b AP |
664 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack |
665 | &mov ($AH[0],&DWP(0,"esi")); | |
666 | &mov ($AH[1],&DWP(4,"esi")); | |
667 | &mov ("ecx",&DWP(8,"esi")); | |
668 | &mov ("edi",&DWP(12,"esi")); | |
669 | #&mov (&DWP(0,"esp"),$AH[0]); | |
670 | &mov (&DWP(4,"esp"),$AH[1]); | |
671 | &xor ($AH[1],"ecx"); # magic | |
672 | &mov (&DWP(8,"esp"),"ecx"); | |
673 | &mov (&DWP(12,"esp"),"edi"); | |
674 | &mov ($E,&DWP(16,"esi")); | |
675 | &mov ("edi",&DWP(20,"esi")); | |
676 | &mov ("ecx",&DWP(24,"esi")); | |
677 | &mov ("esi",&DWP(28,"esi")); | |
678 | #&mov (&DWP(16,"esp"),$E); | |
679 | &mov (&DWP(20,"esp"),"edi"); | |
680 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
681 | &mov (&DWP(24,"esp"),"ecx"); | |
682 | &mov (&DWP(28,"esp"),"esi"); | |
32213d8d AP |
683 | &movdqa ($t3,&QWP(256,$K256)); |
684 | &jmp (&label("grand_ssse3")); | |
f3eac74b | 685 | |
32213d8d | 686 | &set_label("grand_ssse3",16); |
f3eac74b | 687 | # load input, reverse byte order, add K256[0..15], save to stack |
32213d8d AP |
688 | &movdqu (@X[0],&QWP(0,"edi")); |
689 | &movdqu (@X[1],&QWP(16,"edi")); | |
690 | &movdqu (@X[2],&QWP(32,"edi")); | |
691 | &movdqu (@X[3],&QWP(48,"edi")); | |
692 | &add ("edi",64); | |
693 | &pshufb (@X[0],$t3); | |
694 | &mov (&DWP(96+4,"esp"),"edi"); | |
695 | &pshufb (@X[1],$t3); | |
696 | &movdqa ($t0,&QWP(0,$K256)); | |
697 | &pshufb (@X[2],$t3); | |
698 | &movdqa ($t1,&QWP(16,$K256)); | |
699 | &paddd ($t0,@X[0]); | |
700 | &pshufb (@X[3],$t3); | |
701 | &movdqa ($t2,&QWP(32,$K256)); | |
702 | &paddd ($t1,@X[1]); | |
703 | &movdqa ($t3,&QWP(48,$K256)); | |
704 | &movdqa (&QWP(32+0,"esp"),$t0); | |
705 | &paddd ($t2,@X[2]); | |
706 | &movdqa (&QWP(32+16,"esp"),$t1); | |
707 | &paddd ($t3,@X[3]); | |
708 | &movdqa (&QWP(32+32,"esp"),$t2); | |
709 | &movdqa (&QWP(32+48,"esp"),$t3); | |
710 | &jmp (&label("ssse3_00_47")); | |
711 | ||
712 | &set_label("ssse3_00_47",16); | |
f3eac74b AP |
713 | &add ($K256,64); |
714 | ||
32213d8d | 715 | sub SSSE3_00_47 () { |
f3eac74b AP |
716 | my $j = shift; |
717 | my $body = shift; | |
718 | my @X = @_; | |
719 | my @insns = (&$body,&$body,&$body,&$body); # 120 instructions | |
720 | ||
f3eac74b | 721 | eval(shift(@insns)); |
32213d8d AP |
722 | &movdqa ($t0,@X[1]); |
723 | eval(shift(@insns)); # @ | |
f3eac74b | 724 | eval(shift(@insns)); |
32213d8d | 725 | &movdqa ($t3,@X[3]); |
f3eac74b AP |
726 | eval(shift(@insns)); |
727 | eval(shift(@insns)); | |
32213d8d | 728 | &palignr ($t0,@X[0],4); # X[1..4] |
f3eac74b | 729 | eval(shift(@insns)); |
32213d8d | 730 | eval(shift(@insns)); # @ |
f3eac74b | 731 | eval(shift(@insns)); |
32213d8d | 732 | &palignr ($t3,@X[2],4); # X[9..12] |
f3eac74b AP |
733 | eval(shift(@insns)); |
734 | eval(shift(@insns)); | |
735 | eval(shift(@insns)); | |
32213d8d AP |
736 | &movdqa ($t1,$t0); |
737 | eval(shift(@insns)); # @ | |
f3eac74b | 738 | eval(shift(@insns)); |
32213d8d | 739 | &movdqa ($t2,$t0); |
f3eac74b AP |
740 | eval(shift(@insns)); |
741 | eval(shift(@insns)); | |
32213d8d | 742 | &psrld ($t0,3); |
f3eac74b | 743 | eval(shift(@insns)); |
32213d8d AP |
744 | eval(shift(@insns)); # @ |
745 | &paddd (@X[0],$t3); # X[0..3] += X[9..12] | |
f3eac74b | 746 | eval(shift(@insns)); |
f3eac74b | 747 | eval(shift(@insns)); |
32213d8d | 748 | &psrld ($t2,7); |
f3eac74b | 749 | eval(shift(@insns)); |
f3eac74b | 750 | eval(shift(@insns)); |
32213d8d | 751 | eval(shift(@insns)); # @ |
f3eac74b | 752 | eval(shift(@insns)); |
32213d8d | 753 | &pshufd ($t3,@X[3],0b11111010); # X[14..15] |
f3eac74b AP |
754 | eval(shift(@insns)); |
755 | eval(shift(@insns)); | |
32213d8d | 756 | &pslld ($t1,32-18); |
f3eac74b | 757 | eval(shift(@insns)); |
32213d8d AP |
758 | eval(shift(@insns)); # @ |
759 | &pxor ($t0,$t2); | |
f3eac74b AP |
760 | eval(shift(@insns)); |
761 | eval(shift(@insns)); | |
32213d8d | 762 | &psrld ($t2,18-7); |
f3eac74b | 763 | eval(shift(@insns)); |
f3eac74b | 764 | eval(shift(@insns)); |
32213d8d AP |
765 | eval(shift(@insns)); # @ |
766 | &pxor ($t0,$t1); | |
f3eac74b AP |
767 | eval(shift(@insns)); |
768 | eval(shift(@insns)); | |
32213d8d | 769 | &pslld ($t1,18-7); |
f3eac74b | 770 | eval(shift(@insns)); |
f3eac74b | 771 | eval(shift(@insns)); |
32213d8d AP |
772 | eval(shift(@insns)); # @ |
773 | &pxor ($t0,$t2); | |
f3eac74b AP |
774 | eval(shift(@insns)); |
775 | eval(shift(@insns)); | |
32213d8d | 776 | &movdqa ($t2,$t3); |
f3eac74b | 777 | eval(shift(@insns)); |
f3eac74b | 778 | eval(shift(@insns)); |
32213d8d AP |
779 | eval(shift(@insns)); # @ |
780 | &pxor ($t0,$t1); # sigma0(X[1..4]) | |
f3eac74b AP |
781 | eval(shift(@insns)); |
782 | eval(shift(@insns)); | |
32213d8d | 783 | &psrld ($t3,10); |
f3eac74b | 784 | eval(shift(@insns)); |
f3eac74b | 785 | eval(shift(@insns)); |
32213d8d AP |
786 | eval(shift(@insns)); # @ |
787 | &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) | |
f3eac74b | 788 | eval(shift(@insns)); |
f3eac74b | 789 | eval(shift(@insns)); |
32213d8d | 790 | &psrlq ($t2,17); |
f3eac74b | 791 | eval(shift(@insns)); |
f3eac74b | 792 | eval(shift(@insns)); |
32213d8d AP |
793 | eval(shift(@insns)); # @ |
794 | &pxor ($t3,$t2); | |
f3eac74b | 795 | eval(shift(@insns)); |
f3eac74b | 796 | eval(shift(@insns)); |
32213d8d | 797 | &psrlq ($t2,19-17); |
f3eac74b AP |
798 | eval(shift(@insns)); |
799 | eval(shift(@insns)); | |
32213d8d AP |
800 | eval(shift(@insns)); # @ |
801 | &pxor ($t3,$t2); | |
f3eac74b | 802 | eval(shift(@insns)); |
f3eac74b | 803 | eval(shift(@insns)); |
32213d8d | 804 | &pshufd ($t3,$t3,0b10000000); |
f3eac74b AP |
805 | eval(shift(@insns)); |
806 | eval(shift(@insns)); | |
32213d8d | 807 | eval(shift(@insns)); # @ |
f3eac74b | 808 | eval(shift(@insns)); |
f3eac74b AP |
809 | eval(shift(@insns)); |
810 | eval(shift(@insns)); | |
811 | eval(shift(@insns)); | |
32213d8d | 812 | eval(shift(@insns)); # @ |
f3eac74b | 813 | eval(shift(@insns)); |
32213d8d | 814 | &psrldq ($t3,8); |
f3eac74b AP |
815 | eval(shift(@insns)); |
816 | eval(shift(@insns)); | |
817 | eval(shift(@insns)); | |
32213d8d AP |
818 | &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) |
819 | eval(shift(@insns)); # @ | |
f3eac74b | 820 | eval(shift(@insns)); |
32213d8d AP |
821 | eval(shift(@insns)); |
822 | eval(shift(@insns)); | |
823 | eval(shift(@insns)); | |
824 | eval(shift(@insns)); # @ | |
825 | eval(shift(@insns)); | |
826 | &pshufd ($t3,@X[0],0b01010000); # X[16..17] | |
827 | eval(shift(@insns)); | |
828 | eval(shift(@insns)); | |
829 | eval(shift(@insns)); | |
830 | &movdqa ($t2,$t3); | |
831 | eval(shift(@insns)); # @ | |
832 | &psrld ($t3,10); | |
833 | eval(shift(@insns)); | |
834 | &psrlq ($t2,17); | |
835 | eval(shift(@insns)); | |
836 | eval(shift(@insns)); | |
837 | eval(shift(@insns)); | |
838 | eval(shift(@insns)); # @ | |
839 | &pxor ($t3,$t2); | |
840 | eval(shift(@insns)); | |
841 | eval(shift(@insns)); | |
842 | &psrlq ($t2,19-17); | |
843 | eval(shift(@insns)); | |
844 | eval(shift(@insns)); | |
845 | eval(shift(@insns)); # @ | |
846 | &pxor ($t3,$t2); | |
847 | eval(shift(@insns)); | |
848 | eval(shift(@insns)); | |
849 | eval(shift(@insns)); | |
850 | &pshufd ($t3,$t3,0b00001000); | |
851 | eval(shift(@insns)); | |
852 | eval(shift(@insns)); # @ | |
853 | &movdqa ($t2,&QWP(16*$j,$K256)); | |
854 | eval(shift(@insns)); | |
855 | eval(shift(@insns)); | |
856 | &pslldq ($t3,8); | |
857 | eval(shift(@insns)); | |
858 | eval(shift(@insns)); | |
859 | eval(shift(@insns)); # @ | |
860 | eval(shift(@insns)); | |
861 | eval(shift(@insns)); | |
862 | eval(shift(@insns)); | |
863 | eval(shift(@insns)); | |
864 | eval(shift(@insns)); # @ | |
865 | &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) | |
866 | eval(shift(@insns)); | |
867 | eval(shift(@insns)); | |
868 | eval(shift(@insns)); | |
869 | eval(shift(@insns)); | |
870 | &paddd ($t2,@X[0]); | |
871 | eval(shift(@insns)); # @ | |
f3eac74b AP |
872 | |
873 | foreach (@insns) { eval; } # remaining instructions | |
874 | ||
32213d8d | 875 | &movdqa (&QWP(32+16*$j,"esp"),$t2); |
f3eac74b AP |
876 | } |
877 | ||
sub body_00_15 () {
    # Returns the perlasm snippets for one SHA-256 round: 30 strings that the
    # caller string-evals one at a time, so other instructions can be slotted
    # in between them.  Nothing is emitted here; $E, @AH, $i, &off() and the
    # $a..$h slots are looked up only when a snippet is evaluated.
    return (
        # T = Sigma1(e) + Ch(e,f,g).  The three rotates add up to
        # 14+5+6 = 25, 5+6 = 11 and 6 for the three xor-ed copies of e.
        q{&mov ("ecx",$E);},
        q{&ror ($E,25-11);},
        q{&mov ("esi",&off($f));},
        q{&xor ($E,"ecx");},
        q{&mov ("edi",&off($g));},
        q{&xor ("esi","edi");},
        q{&ror ($E,11-6);},
        q{&and ("esi","ecx");},
        q{&mov (&off($e),"ecx");},          # save $E, modulo-scheduled
        q{&xor ($E,"ecx");},
        q{&xor ("edi","esi");},             # Ch(e,f,g)
        q{&ror ($E,6);},                    # T = Sigma1(e)
        q{&mov ("ecx",$AH[0]);},
        q{&add ($E,"edi");},                # T += Ch(e,f,g)
        q{&mov ("edi",&off($b));},
        q{&mov ("esi",$AH[0]);},

        # Sigma0(a) in "ecx" (rotates total 9+11+2 = 22, 11+2 = 13 and 2),
        # interleaved with Maj(a,b,c) and the remaining additions into T.
        q{&ror ("ecx",22-13);},
        q{&mov (&off($a),$AH[0]);},         # save $A, modulo-scheduled
        q{&xor ("ecx",$AH[0]);},
        q{&xor ($AH[0],"edi");},            # a ^= b, (b^c) in next round
        q{&add ($E,&off($h));},             # T += h
        q{&ror ("ecx",13-2);},
        q{&and ($AH[1],$AH[0]);},           # (b^c) &= (a^b)
        q{&xor ("ecx","esi");},
        q{&add ($E,&DWP(32+4*($i&15),"esp"));},    # T += K[i]+X[i]
        q{&xor ($AH[1],"edi");},            # h = Maj(a,b,c) = Ch(a^b,c,b)
        q{&ror ("ecx",2);},                 # Sigma0(a)

        q{&add ($AH[1],$E);},               # h += T
        q{&add ($E,&off($d));},             # d += T
        # Final snippet: h += Sigma0(a), then swap the two @AH entries and
        # advance the round counter, all within a single eval.
        q{&add ($AH[1],"ecx");@AH = reverse(@AH); $i++;},
    );
}
916 | ||
# Schedule-updating rounds: four SSSE3_00_47 calls, each handed \&body_00_15
# and the current @X; @X is rotated by one element after every call.  $i is
# advanced by the evaluated round snippets themselves (each round ends in
# "$i++").
917 | for ($i=0,$j=0; $j<4; $j++) { | |
32213d8d | 918 | &SSSE3_00_47($j,\&body_00_15,@X); |
f3eac74b AP |
919 | push(@X,shift(@X)); # rotate(@X) |
920 | } | |
# $j is 4 once the loop above has finished, so this compares the dword at
# 64($K256) with 0x00010203 and branches back to ssse3_00_47 on mismatch.
# NOTE(review): presumably that dword is the start of the byte-swap mask
# stored after the K256 table, used as an end marker -- confirm against the
# table definition.
921 | &cmp (&DWP(16*$j,$K256),0x00010203); | |
32213d8d | 922 | &jne (&label("ssse3_00_47")); |
f3eac74b AP |
923 | |
# Sixteen more rounds with no message-schedule update.  The loop has no
# increment clause because the last snippet of every round does "$i++".
924 | for ($i=0; $i<16; ) { | |
925 | foreach(body_00_15()) { eval; } | |
926 | } | |
927 | ||
# Add the working variables to the eight context words addressed through
# "esi" (loaded from 96(esp)), and store each sum back to the context and,
# except for the two stores left commented out, to the stack copy at
# 0..28(esp).
928 | &mov ("esi",&DWP(96,"esp")); #ctx | |
929 | #&mov ($AH[0],&DWP(0,"esp")); | |
930 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
931 | #&mov ("edi", &DWP(8,"esp")); | |
932 | &mov ("ecx",&DWP(12,"esp")); | |
933 | &add ($AH[0],&DWP(0,"esi")); | |
934 | &add ($AH[1],&DWP(4,"esi")); | |
935 | &add ("edi",&DWP(8,"esi")); | |
936 | &add ("ecx",&DWP(12,"esi")); | |
937 | &mov (&DWP(0,"esi"),$AH[0]); | |
938 | &mov (&DWP(4,"esi"),$AH[1]); | |
939 | &mov (&DWP(8,"esi"),"edi"); | |
940 | &mov (&DWP(12,"esi"),"ecx"); | |
941 | #&mov (&DWP(0,"esp"),$AH[0]); | |
942 | &mov (&DWP(4,"esp"),$AH[1]); | |
943 | &xor ($AH[1],"edi"); # magic | |
944 | &mov (&DWP(8,"esp"),"edi"); | |
945 | &mov (&DWP(12,"esp"),"ecx"); | |
946 | #&mov ($E,&DWP(16,"esp")); | |
947 | &mov ("edi",&DWP(20,"esp")); | |
948 | &mov ("ecx",&DWP(24,"esp")); | |
949 | &add ($E,&DWP(16,"esi")); | |
950 | &add ("edi",&DWP(20,"esi")); | |
951 | &add ("ecx",&DWP(24,"esi")); | |
952 | &mov (&DWP(16,"esi"),$E); | |
953 | &mov (&DWP(20,"esi"),"edi"); | |
954 | &mov (&DWP(20,"esp"),"edi"); | |
955 | &mov ("edi",&DWP(28,"esp")); | |
956 | &mov (&DWP(24,"esi"),"ecx"); | |
957 | #&mov (&DWP(16,"esp"),$E); | |
958 | &add ("edi",&DWP(28,"esi")); | |
959 | &mov (&DWP(24,"esp"),"ecx"); | |
960 | &mov (&DWP(28,"esi"),"edi"); | |
961 | &mov (&DWP(28,"esp"),"edi"); | |
962 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
963 | ||
# $t3 is reloaded from 64($K256) before $K256 is moved back by 3*64 bytes;
# the loop then repeats from grand_ssse3 while "edi" (inp) is below the value
# kept at 96+8(esp).
32213d8d | 964 | &movdqa ($t3,&QWP(64,$K256)); |
f3eac74b AP |
965 | &sub ($K256,3*64); # rewind K |
966 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
32213d8d | 967 | &jb (&label("grand_ssse3")); |
f3eac74b AP |
968 | |
969 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
f3eac74b | 970 | &function_end_A(); |
32213d8d AP |
971 | if ($avx) { |
# AVX entry point.  When the generator runs with $avx>1, the emitted code
# first masks "edx" down to bits 3 and 8 and branches to AVX_BMI if both are
# set; otherwise it falls through into the plain AVX path below.
972 | &set_label("AVX",32); | |
973 | if ($avx>1) { | |
32213d8d AP |
974 | &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 |
975 | &cmp ("edx",1<<8|1<<3); | |
976 | &je (&label("AVX_BMI")); | |
977 | } | |
f3eac74b AP |
# Move "esp" down by 96 bytes, then mirror the context words read through
# "esi" into 4..28(esp).  The stores to 0(esp) and 16(esp) are left commented
# out, so those two values stay in $AH[0] and $E only.
978 | &lea ("esp",&DWP(-96,"esp")); |
979 | &vzeroall (); | |
980 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
981 | &mov ($AH[0],&DWP(0,"esi")); | |
982 | &mov ($AH[1],&DWP(4,"esi")); | |
983 | &mov ("ecx",&DWP(8,"esi")); | |
984 | &mov ("edi",&DWP(12,"esi")); | |
985 | #&mov (&DWP(0,"esp"),$AH[0]); | |
986 | &mov (&DWP(4,"esp"),$AH[1]); | |
987 | &xor ($AH[1],"ecx"); # magic | |
988 | &mov (&DWP(8,"esp"),"ecx"); | |
989 | &mov (&DWP(12,"esp"),"edi"); | |
990 | &mov ($E,&DWP(16,"esi")); | |
991 | &mov ("edi",&DWP(20,"esi")); | |
992 | &mov ("ecx",&DWP(24,"esi")); | |
993 | &mov ("esi",&DWP(28,"esi")); | |
994 | #&mov (&DWP(16,"esp"),$E); | |
995 | &mov (&DWP(20,"esp"),"edi"); | |
996 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
997 | &mov (&DWP(24,"esp"),"ecx"); | |
998 | &mov (&DWP(28,"esp"),"esi"); | |
# $t3 is the shuffle control for the vpshufb instructions in grand_avx.
# NOTE(review): offset 256 presumably points just past a 64-entry table of
# 32-bit constants (64*4 bytes), i.e. at the byte-swap mask -- confirm against
# the K256 definition.
2dce10c5 | 999 | &vmovdqa ($t3,&QWP(256,$K256)); |
f3eac74b AP |
1000 | &jmp (&label("grand_avx")); |
1001 | ||
# Per-block loop head: fetch 64 input bytes through "edi", advance "edi" and
# write it back to 96+4(esp), shuffle each vector with $t3, add the first four
# vectors at $K256 and park the sums at 32..95(esp).  The last vpaddd
# overwrites $t3.
32213d8d | 1002 | &set_label("grand_avx",32); |
f3eac74b AP |
1003 | # load input, reverse byte order, add K256[0..15], save to stack |
1004 | &vmovdqu (@X[0],&QWP(0,"edi")); | |
1005 | &vmovdqu (@X[1],&QWP(16,"edi")); | |
1006 | &vmovdqu (@X[2],&QWP(32,"edi")); | |
1007 | &vmovdqu (@X[3],&QWP(48,"edi")); | |
1008 | &add ("edi",64); | |
1009 | &vpshufb (@X[0],@X[0],$t3); | |
1010 | &mov (&DWP(96+4,"esp"),"edi"); | |
1011 | &vpshufb (@X[1],@X[1],$t3); | |
1012 | &vpshufb (@X[2],@X[2],$t3); | |
1013 | &vpaddd ($t0,@X[0],&QWP(0,$K256)); | |
1014 | &vpshufb (@X[3],@X[3],$t3); | |
1015 | &vpaddd ($t1,@X[1],&QWP(16,$K256)); | |
1016 | &vpaddd ($t2,@X[2],&QWP(32,$K256)); | |
1017 | &vpaddd ($t3,@X[3],&QWP(48,$K256)); | |
1018 | &vmovdqa (&QWP(32+0,"esp"),$t0); | |
1019 | &vmovdqa (&QWP(32+16,"esp"),$t1); | |
1020 | &vmovdqa (&QWP(32+32,"esp"),$t2); | |
1021 | &vmovdqa (&QWP(32+48,"esp"),$t3); | |
1022 | &jmp (&label("avx_00_47")); | |
1023 | ||
# Head of the round loop: $K256 moves forward by 64 bytes on every pass.
1024 | &set_label("avx_00_47",16); | |
1025 | &add ($K256,64); | |
1026 | ||
sub Xupdate_AVX () {
    # Message-schedule update for four words, returned as 31 perlasm snippet
    # strings.  The caller evals them one by one and interleaves round
    # instructions in between.  @X and $t0..$t3 are resolved at eval time in
    # the caller's scope.  Two of the snippets carry no trailing ';' and are
    # kept exactly that way.
    return (
        q{&vpalignr ($t0,@X[1],@X[0],4);},      # X[1..4]
        q{&vpalignr ($t3,@X[3],@X[2],4);},      # X[9..12]
        q{&vpsrld ($t2,$t0,7);},
        q{&vpaddd (@X[0],@X[0],$t3);},          # X[0..3] += X[9..12]
        q{&vpsrld ($t3,$t0,3);},
        q{&vpslld ($t1,$t0,14);},
        q{&vpxor ($t0,$t3,$t2);},
        q{&vpshufd ($t3,@X[3],0b11111010)},     # X[14..15]
        q{&vpsrld ($t2,$t2,18-7);},
        q{&vpxor ($t0,$t0,$t1);},
        q{&vpslld ($t1,$t1,25-14);},
        q{&vpxor ($t0,$t0,$t2);},
        q{&vpsrld ($t2,$t3,10);},
        q{&vpxor ($t0,$t0,$t1);},               # sigma0(X[1..4])
        q{&vpsrlq ($t1,$t3,17);},
        q{&vpaddd (@X[0],@X[0],$t0);},          # X[0..3] += sigma0(X[1..4])
        q{&vpxor ($t2,$t2,$t1);},
        q{&vpsrlq ($t3,$t3,19);},
        q{&vpxor ($t2,$t2,$t3);},               # sigma1(X[14..15])
        q{&vpshufd ($t3,$t2,0b10000100);},
        q{&vpsrldq ($t3,$t3,8);},
        q{&vpaddd (@X[0],@X[0],$t3);},          # X[0..1] += sigma1(X[14..15])
        q{&vpshufd ($t3,@X[0],0b01010000)},     # X[16..17]
        q{&vpsrld ($t2,$t3,10);},
        q{&vpsrlq ($t1,$t3,17);},
        q{&vpxor ($t2,$t2,$t1);},
        q{&vpsrlq ($t3,$t3,19);},
        q{&vpxor ($t2,$t2,$t3);},               # sigma1(X[16..17])
        q{&vpshufd ($t3,$t2,0b11101000);},
        q{&vpslldq ($t3,$t3,8);},
        q{&vpaddd (@X[0],@X[0],$t3);},          # X[2..3] += sigma1(X[16..17])
    );
}
1062 | ||
# From here on, any &ror(reg,n) issued by the round snippets is redirected to
# &shrd(reg,reg,n): the first argument is passed twice, followed by the rest.
local *ror = sub { &shrd($_[0],@_) };

# AVX_00_47($j, \&body, @X)
#
# Emits one message-schedule step (four new words) interleaved with four
# rounds.
#   $j    - step index within the current pass; selects the 16-byte slot at
#           16*$j($K256) and 32+16*$j(esp)
#   $body - code ref returning the snippet list for one round
#   @X    - the four schedule registers, oldest first
#
# @X must remain a lexical of this sub: the snippet strings name it and are
# evaluated here.  Note that string-eval errors are not inspected.
sub AVX_00_47 () {
    my $j = shift;
    my $body = shift;
    my @X = @_;
    my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions with body_00_15
    my $insn;

    foreach (Xupdate_AVX()) {		# 31 instructions
	eval;				# one schedule instruction ...
	eval(shift(@insns));		# ... followed by three round instructions
	eval(shift(@insns));
	eval($insn = shift(@insns));
	# Issue the next round instruction right away when both it and the
	# one just emitted are rorx.
	eval(shift(@insns)) if ($insn =~ /rorx/ && $insns[0] =~ /rorx/);
    }
    &vpaddd	($t2,$X[0],&QWP(16*$j,$K256));
    foreach (@insns) { eval; }		# remaining instructions
    &vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
}
1082 | ||
# Schedule-updating rounds for the AVX path: four AVX_00_47 calls, each
# handed \&body_00_15 and the current @X; @X is rotated by one element after
# every call.  $i is advanced by the evaluated round snippets.
1083 | for ($i=0,$j=0; $j<4; $j++) { | |
1084 | &AVX_00_47($j,\&body_00_15,@X); | |
1085 | push(@X,shift(@X)); # rotate(@X) | |
1086 | } | |
# $j is 4 here, so the dword at 64($K256) is compared with 0x00010203 and the
# code branches back to avx_00_47 on mismatch.
# NOTE(review): presumably that dword begins the byte-swap mask that follows
# the K256 table -- confirm against the table definition.
1087 | &cmp (&DWP(16*$j,$K256),0x00010203); | |
1088 | &jne (&label("avx_00_47")); | |
1089 | ||
# Sixteen more rounds with no message-schedule update.  The loop has no
# increment clause because the last snippet of every round does "$i++".
1090 | for ($i=0; $i<16; ) { | |
1091 | foreach(body_00_15()) { eval; } | |
1092 | } | |
1093 | ||
# Add the working variables to the eight context words addressed through
# "esi" (loaded from 96(esp)), and store each sum back to the context and,
# except for the two stores left commented out, to the stack copy at
# 0..28(esp).
1094 | &mov ("esi",&DWP(96,"esp")); #ctx | |
1095 | #&mov ($AH[0],&DWP(0,"esp")); | |
1096 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
1097 | #&mov ("edi", &DWP(8,"esp")); | |
1098 | &mov ("ecx",&DWP(12,"esp")); | |
1099 | &add ($AH[0],&DWP(0,"esi")); | |
1100 | &add ($AH[1],&DWP(4,"esi")); | |
1101 | &add ("edi",&DWP(8,"esi")); | |
1102 | &add ("ecx",&DWP(12,"esi")); | |
1103 | &mov (&DWP(0,"esi"),$AH[0]); | |
1104 | &mov (&DWP(4,"esi"),$AH[1]); | |
1105 | &mov (&DWP(8,"esi"),"edi"); | |
1106 | &mov (&DWP(12,"esi"),"ecx"); | |
1107 | #&mov (&DWP(0,"esp"),$AH[0]); | |
1108 | &mov (&DWP(4,"esp"),$AH[1]); | |
1109 | &xor ($AH[1],"edi"); # magic | |
1110 | &mov (&DWP(8,"esp"),"edi"); | |
1111 | &mov (&DWP(12,"esp"),"ecx"); | |
1112 | #&mov ($E,&DWP(16,"esp")); | |
1113 | &mov ("edi",&DWP(20,"esp")); | |
1114 | &mov ("ecx",&DWP(24,"esp")); | |
1115 | &add ($E,&DWP(16,"esi")); | |
1116 | &add ("edi",&DWP(20,"esi")); | |
1117 | &add ("ecx",&DWP(24,"esi")); | |
1118 | &mov (&DWP(16,"esi"),$E); | |
1119 | &mov (&DWP(20,"esi"),"edi"); | |
1120 | &mov (&DWP(20,"esp"),"edi"); | |
1121 | &mov ("edi",&DWP(28,"esp")); | |
1122 | &mov (&DWP(24,"esi"),"ecx"); | |
1123 | #&mov (&DWP(16,"esp"),$E); | |
1124 | &add ("edi",&DWP(28,"esi")); | |
1125 | &mov (&DWP(24,"esp"),"ecx"); | |
1126 | &mov (&DWP(28,"esi"),"edi"); | |
1127 | &mov (&DWP(28,"esp"),"edi"); | |
1128 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1129 | ||
# $t3 is reloaded from 64($K256) before $K256 is moved back by 3*64 bytes;
# the loop then repeats from grand_avx while "edi" (inp) is below the value
# kept at 96+8(esp).  On exit the saved stack pointer is restored and
# vzeroall is issued before the function epilogue.
1130 | &vmovdqa ($t3,&QWP(64,$K256)); | |
1131 | &sub ($K256,3*64); # rewind K | |
1132 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
1133 | &jb (&label("grand_avx")); | |
1134 | ||
1135 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
1136 | &vzeroall (); | |
1137 | &function_end_A(); | |
32213d8d AP |
1138 | if ($avx>1) { |
sub bodyx_00_15 () {
    # Variant of the per-round snippet list built on rorx/andn ("+10%"
    # according to the original note).  Returns 25 strings for the caller to
    # string-eval in order; $E, @AH, $i, &off() and the $a..$h slots are
    # resolved at eval time.  Only two snippets end in ';' and all are
    # reproduced exactly.
    return (
        # Sigma1(e) in "ecx" and Ch(e,f,g) assembled in $E.
        q{&rorx ("ecx",$E,6)},
        q{&rorx ("esi",$E,11)},
        q{&mov (&off($e),$E)},              # save $E, modulo-scheduled
        q{&rorx ("edi",$E,25)},
        q{&xor ("ecx","esi")},
        q{&andn ("esi",$E,&off($g))},
        q{&xor ("ecx","edi")},              # Sigma1(e)
        q{&and ($E,&off($f))},
        q{&mov (&off($a),$AH[0]);},         # save $A, modulo-scheduled
        q{&or ($E,"esi")},                  # T = Ch(e,f,g)

        # Sigma0(a) in "ecx".
        q{&rorx ("edi",$AH[0],2)},
        q{&rorx ("esi",$AH[0],13)},
        q{&lea ($E,&DWP(0,$E,"ecx"))},      # T += Sigma1(e)
        q{&rorx ("ecx",$AH[0],22)},
        q{&xor ("esi","edi")},
        q{&mov ("edi",&off($b))},
        q{&xor ("ecx","esi")},              # Sigma0(a)

        q{&xor ($AH[0],"edi")},             # a ^= b, (b^c) in next round
        q{&add ($E,&off($h))},              # T += h
        q{&and ($AH[1],$AH[0])},            # (b^c) &= (a^b)
        q{&add ($E,&DWP(32+4*($i&15),"esp"))},     # T += K[i]+X[i]
        q{&xor ($AH[1],"edi")},             # h = Maj(a,b,c) = Ch(a^b,c,b)

        q{&add ("ecx",$E)},                 # Sigma0(a) + T
        q{&add ($E,&off($d))},              # d += T
        # Final snippet: h = Maj + (Sigma0(a) + T), then swap the two @AH
        # entries and advance the round counter, all within a single eval.
        q{&lea ($AH[1],&DWP(0,$AH[1],"ecx"));@AH = reverse(@AH); $i++;},
    );
}
32213d8d AP |
1173 | |
# AVX+BMI entry point, reached from the feature test at the AVX label.
# "esp" is moved down by 96 bytes and the context words read through "esi"
# are mirrored into 4..28(esp).  The stores to 0(esp) and 16(esp) are left
# commented out, so those two values stay in $AH[0] and $E only.
1174 | &set_label("AVX_BMI",32); | |
1175 | &lea ("esp",&DWP(-96,"esp")); | |
1176 | &vzeroall (); | |
1177 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
1178 | &mov ($AH[0],&DWP(0,"esi")); | |
1179 | &mov ($AH[1],&DWP(4,"esi")); | |
1180 | &mov ("ecx",&DWP(8,"esi")); | |
1181 | &mov ("edi",&DWP(12,"esi")); | |
1182 | #&mov (&DWP(0,"esp"),$AH[0]); | |
1183 | &mov (&DWP(4,"esp"),$AH[1]); | |
1184 | &xor ($AH[1],"ecx"); # magic | |
1185 | &mov (&DWP(8,"esp"),"ecx"); | |
1186 | &mov (&DWP(12,"esp"),"edi"); | |
1187 | &mov ($E,&DWP(16,"esi")); | |
1188 | &mov ("edi",&DWP(20,"esi")); | |
1189 | &mov ("ecx",&DWP(24,"esi")); | |
1190 | &mov ("esi",&DWP(28,"esi")); | |
1191 | #&mov (&DWP(16,"esp"),$E); | |
1192 | &mov (&DWP(20,"esp"),"edi"); | |
1193 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1194 | &mov (&DWP(24,"esp"),"ecx"); | |
1195 | &mov (&DWP(28,"esp"),"esi"); | |
# $t3 is the shuffle control for the vpshufb instructions below.
# NOTE(review): offset 256 presumably points just past a 64-entry table of
# 32-bit constants (64*4 bytes), i.e. at the byte-swap mask -- confirm against
# the K256 definition.
1196 | &vmovdqa ($t3,&QWP(256,$K256)); | |
1197 | &jmp (&label("grand_avx_bmi")); | |
1198 | ||
# Per-block loop head: fetch 64 input bytes through "edi", advance "edi" and
# write it back to 96+4(esp), shuffle each vector with $t3, add the first four
# vectors at $K256 and park the sums at 32..95(esp).  The last vpaddd
# overwrites $t3.
1199 | &set_label("grand_avx_bmi",32); | |
1200 | # load input, reverse byte order, add K256[0..15], save to stack | |
1201 | &vmovdqu (@X[0],&QWP(0,"edi")); | |
1202 | &vmovdqu (@X[1],&QWP(16,"edi")); | |
1203 | &vmovdqu (@X[2],&QWP(32,"edi")); | |
1204 | &vmovdqu (@X[3],&QWP(48,"edi")); | |
1205 | &add ("edi",64); | |
1206 | &vpshufb (@X[0],@X[0],$t3); | |
1207 | &mov (&DWP(96+4,"esp"),"edi"); | |
1208 | &vpshufb (@X[1],@X[1],$t3); | |
1209 | &vpshufb (@X[2],@X[2],$t3); | |
1210 | &vpaddd ($t0,@X[0],&QWP(0,$K256)); | |
1211 | &vpshufb (@X[3],@X[3],$t3); | |
1212 | &vpaddd ($t1,@X[1],&QWP(16,$K256)); | |
1213 | &vpaddd ($t2,@X[2],&QWP(32,$K256)); | |
1214 | &vpaddd ($t3,@X[3],&QWP(48,$K256)); | |
1215 | &vmovdqa (&QWP(32+0,"esp"),$t0); | |
1216 | &vmovdqa (&QWP(32+16,"esp"),$t1); | |
1217 | &vmovdqa (&QWP(32+32,"esp"),$t2); | |
1218 | &vmovdqa (&QWP(32+48,"esp"),$t3); | |
1219 | &jmp (&label("avx_bmi_00_47")); | |
1220 | ||
# Head of the round loop: $K256 moves forward by 64 bytes on every pass.
1221 | &set_label("avx_bmi_00_47",16); | |
1222 | &add ($K256,64); | |
1223 | ||
# Schedule-updating rounds: same AVX_00_47 driver as the plain AVX path, but
# fed with the rorx/andn round body bodyx_00_15.  @X is rotated by one element
# after every call.
1224 | for ($i=0,$j=0; $j<4; $j++) { | |
1225 | &AVX_00_47($j,\&bodyx_00_15,@X); | |
1226 | push(@X,shift(@X)); # rotate(@X) | |
1227 | } | |
# $j is 4 here, so the dword at 64($K256) is compared with 0x00010203 and the
# code branches back to avx_bmi_00_47 on mismatch.
# NOTE(review): presumably that dword begins the byte-swap mask that follows
# the K256 table -- confirm against the table definition.
1228 | &cmp (&DWP(16*$j,$K256),0x00010203); | |
1229 | &jne (&label("avx_bmi_00_47")); | |
1230 | ||
# Sixteen more rounds with no message-schedule update.  The loop has no
# increment clause because the last snippet of every round does "$i++".
1231 | for ($i=0; $i<16; ) { | |
1232 | foreach(bodyx_00_15()) { eval; } | |
1233 | } | |
1234 | ||
# Add the working variables to the eight context words addressed through
# "esi" (loaded from 96(esp)), and store each sum back to the context and,
# except for the two stores left commented out, to the stack copy at
# 0..28(esp).
1235 | &mov ("esi",&DWP(96,"esp")); #ctx | |
1236 | #&mov ($AH[0],&DWP(0,"esp")); | |
1237 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
1238 | #&mov ("edi", &DWP(8,"esp")); | |
1239 | &mov ("ecx",&DWP(12,"esp")); | |
1240 | &add ($AH[0],&DWP(0,"esi")); | |
1241 | &add ($AH[1],&DWP(4,"esi")); | |
1242 | &add ("edi",&DWP(8,"esi")); | |
1243 | &add ("ecx",&DWP(12,"esi")); | |
1244 | &mov (&DWP(0,"esi"),$AH[0]); | |
1245 | &mov (&DWP(4,"esi"),$AH[1]); | |
1246 | &mov (&DWP(8,"esi"),"edi"); | |
1247 | &mov (&DWP(12,"esi"),"ecx"); | |
1248 | #&mov (&DWP(0,"esp"),$AH[0]); | |
1249 | &mov (&DWP(4,"esp"),$AH[1]); | |
1250 | &xor ($AH[1],"edi"); # magic | |
1251 | &mov (&DWP(8,"esp"),"edi"); | |
1252 | &mov (&DWP(12,"esp"),"ecx"); | |
1253 | #&mov ($E,&DWP(16,"esp")); | |
1254 | &mov ("edi",&DWP(20,"esp")); | |
1255 | &mov ("ecx",&DWP(24,"esp")); | |
1256 | &add ($E,&DWP(16,"esi")); | |
1257 | &add ("edi",&DWP(20,"esi")); | |
1258 | &add ("ecx",&DWP(24,"esi")); | |
1259 | &mov (&DWP(16,"esi"),$E); | |
1260 | &mov (&DWP(20,"esi"),"edi"); | |
1261 | &mov (&DWP(20,"esp"),"edi"); | |
1262 | &mov ("edi",&DWP(28,"esp")); | |
1263 | &mov (&DWP(24,"esi"),"ecx"); | |
1264 | #&mov (&DWP(16,"esp"),$E); | |
1265 | &add ("edi",&DWP(28,"esi")); | |
1266 | &mov (&DWP(24,"esp"),"ecx"); | |
1267 | &mov (&DWP(28,"esi"),"edi"); | |
1268 | &mov (&DWP(28,"esp"),"edi"); | |
1269 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1270 | ||
# $t3 is reloaded from 64($K256) before $K256 is moved back by 3*64 bytes;
# the loop then repeats from grand_avx_bmi while "edi" (inp) is below the
# value kept at 96+8(esp).  On exit the saved stack pointer is restored and
# vzeroall is issued before the function epilogue.
1271 | &vmovdqa ($t3,&QWP(64,$K256)); | |
1272 | &sub ($K256,3*64); # rewind K | |
1273 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
1274 | &jb (&label("grand_avx_bmi")); | |
1275 | ||
1276 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
1277 | &vzeroall (); | |
1278 | &function_end_A(); | |
1279 | } | |
1280 | } | |
1281 | }}} | |
&function_end_B("sha256_block_data_order");

&asm_finish();

# The generated assembly is written to STDOUT.  Buffered write errors (disk
# full, broken pipe) only surface when the handle is closed, so fail loudly
# here instead of leaving a silently truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";