]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
ee0449b1 AP |
9 | # |
10 | # ==================================================================== | |
f889bb03 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
ee0449b1 AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # SHA256 block transform for x86. September 2007. | |
18 | # | |
ee9bf3eb | 19 | # Performance improvement over compiler generated code varies from |
053fa39a | 20 | # 10% to 40% [see below]. Not very impressive on some µ-archs, but |
ee9bf3eb AP |
21 | # it's 5 times smaller and optimizes amount of writes. |
22 | # | |
23 | # May 2012. | |
24 | # | |
f889bb03 AP |
25 | # Optimization including two of Pavel Semjanov's ideas, alternative |
26 | # Maj and full unroll, resulted in ~20-25% improvement on most CPUs, | |
3a9b3852 AP |
27 | # ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost |
28 | # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not | |
29 | # on P4, where it kills performance, nor Sandy Bridge, where folded | |
30 | # loop is approximately as fast... | |
ee9bf3eb | 31 | # |
f3eac74b AP |
32 | # June 2012. |
33 | # | |
34 | # Add AMD XOP-specific code path, >30% improvement on Bulldozer over | |
35 | # May version, >60% over original. Add AVX+shrd code path, >25% | |
36 | # improvement on Sandy Bridge over May version, 60% over original. | |
37 | # | |
32213d8d AP |
38 | # May 2013. |
39 | # | |
40 | # Replace AMD XOP code path with SSSE3 to cover more processors. | |
41 | # (Biggest improvement coefficient is on upcoming Atom Silvermont, | |
42 | # not shown.) Add AVX+BMI code path. | |
43 | # | |
619b9466 AP |
44 | # March 2014. |
45 | # | |
46 | # Add support for Intel SHA Extensions. | |
47 | # | |
ee0449b1 AP |
48 | # Performance in clock cycles per processed byte (less is better): |
49 | # | |
609b0852 | 50 | # gcc icc x86 asm(*) SIMD x86_64 asm(**) |
32213d8d AP |
51 | # Pentium 46 57 40/38 - - |
52 | # PIII 36 33 27/24 - - | |
53 | # P4 41 38 28 - 17.3 | |
54 | # AMD K8 27 25 19/15.5 - 14.9 | |
55 | # Core2 26 23 18/15.6 14.3 13.8 | |
56 | # Westmere 27 - 19/15.7 13.4 12.3 | |
57 | # Sandy Bridge 25 - 15.9 12.4 11.6 | |
58 | # Ivy Bridge 24 - 15.0 11.4 10.3 | |
59 | # Haswell 22 - 13.9 9.46 7.80 | |
a30b0522 | 60 | # Skylake 20 - 14.9 9.50 7.70 |
32213d8d AP |
61 | # Bulldozer 36 - 27/22 17.0 13.6 |
62 | # VIA Nano 36 - 25/22 16.8 16.5 | |
63 | # Atom 50 - 30/25 21.9 18.9 | |
b59f92e7 | 64 | # Silvermont 40 - 34/31 22.9 20.6 |
a30b0522 | 65 | # Goldmont 29 - 20 16.3(***) |
ee0449b1 | 66 | # |
32213d8d | 67 | # (*) numbers after slash are for unrolled loop, where applicable; |
f3eac74b | 68 | # (**) x86_64 assembly performance is presented for reference |
32213d8d | 69 | # purposes, results are best-available; |
a30b0522 | 70 | # (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; |
ee0449b1 AP |
71 | |
72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
73 | push(@INC,"${dir}","${dir}../../perlasm"); | |
74 | require "x86asm.pl"; | |
75 | ||
e87e380a RL |
76 | $output=pop; # the last command-line argument names the output file |
77 | open STDOUT,">$output" or die "can't open $output: $!"; # a failed open leaves STDOUT closed, so stop instead of discarding the generated code | |
78 | ||
e195c8a2 | 79 | &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); |
ee0449b1 | 80 | |
32213d8d | 81 | $xmm=$avx=0; |
f3eac74b AP |
82 | for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } |
83 | ||
32213d8d AP |
84 | if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
85 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
86 | $avx = ($1>=2.19) + ($1>=2.22); | |
87 | } | |
f3eac74b | 88 | |
32213d8d AP |
89 | if ($xmm && !$avx && $ARGV[0] eq "win32n" && |
90 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { | |
91 | $avx = ($1>=2.03) + ($1>=2.10); | |
92 | } | |
f3eac74b | 93 | |
32213d8d AP |
94 | if ($xmm && !$avx && $ARGV[0] eq "win32" && |
95 | `ml 2>&1` =~ /Version ([0-9]+)\./) { | |
96 | $avx = ($1>=10) + ($1>=11); | |
97 | } | |
f3eac74b | 98 | |
a356e488 AP |
99 | if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/) { # major version may have several digits (clang 10 and later) |
100 | $avx = ($2>=3.0) + ($2>3.0); # 0 below 3.0, 1 for exactly 3.0, 2 for anything newer | |
ac171925 AP |
101 | } |
102 | ||
977f32e8 AP |
103 | $shaext=$xmm; ### set to zero if compiling for 1.0.1 |
104 | ||
3a9b3852 AP |
105 | $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of |
106 | # fully unrolled loop was measured to run about | |
107 | # 3-4x slower. If slowdown coefficient is N and | |
108 | # unrolled loop is m times faster, then you break | |
109 | # even at (N-1)/(m-1) blocks. Then it needs to be | |
110 | # adjusted for probability of code being evicted, | |
111 | # code size/cache size=1/4. Typical m is 1.15... | |
f889bb03 | 112 | |
ee0449b1 AP |
113 | $A="eax"; |
114 | $E="edx"; | |
115 | $T="ebx"; | |
ee9bf3eb AP |
116 | $Aoff=&DWP(4,"esp"); |
117 | $Boff=&DWP(8,"esp"); | |
118 | $Coff=&DWP(12,"esp"); | |
119 | $Doff=&DWP(16,"esp"); | |
120 | $Eoff=&DWP(20,"esp"); | |
121 | $Foff=&DWP(24,"esp"); | |
122 | $Goff=&DWP(28,"esp"); | |
123 | $Hoff=&DWP(32,"esp"); | |
124 | $Xoff=&DWP(36,"esp"); | |
ee0449b1 AP |
125 | $K256="ebp"; |
126 | ||
f889bb03 AP |
127 | sub BODY_16_63() { # message-schedule part of a round; the round itself is emitted by BODY_00_15(1) |
128 | &mov ($T,"ecx"); # "ecx" is preloaded | |
129 | &mov ("esi",&DWP(4*(9+15+16-14),"esp")); # X[-2], 14 dwords below the X[-16] slot | |
130 | &ror ("ecx",18-7); | |
131 | &mov ("edi","esi"); | |
132 | &ror ("esi",19-17); | |
133 | &xor ("ecx",$T); | |
3a9b3852 | 134 | &shr ($T,3); |
f889bb03 | 135 | &ror ("ecx",7); |
3a9b3852 | 136 | &xor ("esi","edi"); |
f889bb03 AP |
137 | &xor ($T,"ecx"); # T = sigma0(X[-15]) |
138 | &ror ("esi",17); | |
139 | &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] | |
140 | &shr ("edi",10); | |
141 | &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] | |
142 | #&xor ("edi","esi") # sigma1(X[-2]) | |
143 | # &add ($T,"edi"); # T += sigma1(X[-2]) | |
144 | # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] | |
ee9bf3eb | 145 | |
f889bb03 AP |
146 | &BODY_00_15(1); # a true argument makes BODY_00_15 emit the three steps commented out above |
147 | } | |
ee0449b1 | 148 | sub BODY_00_15() { |
b5e5760d | 149 | my $in_16_63=shift; |
8dc899de | 150 | |
ee0449b1 | 151 | &mov ("ecx",$E); |
f889bb03 | 152 | &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) |
ee0449b1 | 153 | &mov ("esi",$Foff); |
ee9bf3eb AP |
154 | &ror ("ecx",25-11); |
155 | &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) | |
ee9bf3eb | 156 | &mov ("edi",$Goff); |
7470276a | 157 | &xor ("ecx",$E); |
d4bb6bdd | 158 | &xor ("esi","edi"); |
f889bb03 | 159 | &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); |
ee9bf3eb | 160 | &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] |
d4bb6bdd | 161 | &ror ("ecx",11-6); |
ee9bf3eb | 162 | &and ("esi",$E); |
f889bb03 | 163 | &mov ($Eoff,$E); # modulo-scheduled |
d4bb6bdd | 164 | &xor ($E,"ecx"); |
f889bb03 | 165 | &add ($T,$Hoff); # T += h |
3a9b3852 | 166 | &xor ("esi","edi"); # Ch(e,f,g) |
f889bb03 | 167 | &ror ($E,6); # Sigma1(e) |
d4bb6bdd | 168 | &mov ("ecx",$A); |
f889bb03 | 169 | &add ($T,"esi"); # T += Ch(e,f,g) |
ee0449b1 | 170 | |
7470276a | 171 | &ror ("ecx",22-13); |
f889bb03 | 172 | &add ($T,$E); # T += Sigma1(e) |
ee9bf3eb | 173 | &mov ("edi",$Boff); |
7470276a | 174 | &xor ("ecx",$A); |
f889bb03 | 175 | &mov ($Aoff,$A); # modulo-scheduled |
d4bb6bdd AP |
176 | &lea ("esp",&DWP(-4,"esp")); |
177 | &ror ("ecx",13-2); | |
178 | &mov ("esi",&DWP(0,$K256)); | |
179 | &xor ("ecx",$A); | |
f889bb03 AP |
180 | &mov ($E,$Eoff); # e in next iteration, d in this one |
181 | &xor ($A,"edi"); # a ^= b | |
182 | &ror ("ecx",2); # Sigma0(a) | |
ee0449b1 | 183 | |
f889bb03 AP |
184 | &add ($T,"esi"); # T+= K[i] |
185 | &mov (&DWP(0,"esp"),$A); # (b^c) in next round | |
186 | &add ($E,$T); # d += T | |
ee9bf3eb | 187 | &and ($A,&DWP(4,"esp")); # a &= (b^c) |
f889bb03 AP |
188 | &add ($T,"ecx"); # T += Sigma0(a) |
189 | &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) | |
190 | &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T | |
ee0449b1 | 191 | &add ($K256,4); |
f889bb03 | 192 | &add ($A,$T); # h += T |
ee0449b1 AP |
193 | } |
194 | ||
f889bb03 AP |
195 | &external_label("OPENSSL_ia32cap_P") if (!$i386); |
196 | ||
87facba3 | 197 | &function_begin("sha256_block_data_order"); |
ee0449b1 AP |
198 | &mov ("esi",wparam(0)); # ctx |
199 | &mov ("edi",wparam(1)); # inp | |
200 | &mov ("eax",wparam(2)); # num | |
201 | &mov ("ebx","esp"); # saved sp | |
202 | ||
203 | &call (&label("pic_point")); # make it PIC! | |
204 | &set_label("pic_point"); | |
205 | &blindpop($K256); | |
206 | &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); | |
207 | ||
208 | &sub ("esp",16); | |
209 | &and ("esp",-64); | |
210 | ||
211 | &shl ("eax",6); # num*64 | |
212 | &add ("eax","edi"); # inp+num*64 | |
213 | &mov (&DWP(0,"esp"),"esi"); # ctx | |
214 | &mov (&DWP(4,"esp"),"edi"); # inp | |
215 | &mov (&DWP(8,"esp"),"eax"); # inp+num*64, the value the loops compare inp against | |
216 | &mov (&DWP(12,"esp"),"ebx"); # saved sp | |
d49135e7 | 217 | if (!$i386 && $xmm) { |
f889bb03 AP |
218 | &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); |
219 | &mov ("ecx",&DWP(0,"edx")); | |
32213d8d | 220 | &mov ("ebx",&DWP(4,"edx")); |
f889bb03 AP |
221 | &test ("ecx",1<<20); # check for P4 |
222 | &jnz (&label("loop")); | |
619b9466 AP |
223 | &mov ("edx",&DWP(8,"edx")) if ($xmm); |
224 | &test ("ecx",1<<24); # check for FXSR | |
225 | &jz ($unroll_after?&label("no_xmm"):&label("loop")); | |
f889bb03 | 226 | &and ("ecx",1<<30); # mask "Intel CPU" bit |
32213d8d | 227 | &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits |
977f32e8 AP |
228 | &test ("edx",1<<29) if ($shaext); # check for SHA |
229 | &jnz (&label("shaext")) if ($shaext); | |
32213d8d AP |
230 | &or ("ecx","ebx"); |
231 | &and ("ecx",1<<28|1<<30); | |
f889bb03 | 232 | &cmp ("ecx",1<<28|1<<30); |
32213d8d AP |
233 | if ($xmm) { |
234 | &je (&label("AVX")) if ($avx); | |
235 | &test ("ebx",1<<9); # check for SSSE3 | |
236 | &jnz (&label("SSSE3")); | |
237 | } else { | |
238 | &je (&label("loop_shrd")); | |
239 | } | |
f889bb03 | 240 | if ($unroll_after) { |
619b9466 | 241 | &set_label("no_xmm"); |
f889bb03 AP |
242 | &sub ("eax","edi"); |
243 | &cmp ("eax",$unroll_after); | |
3a9b3852 | 244 | &jae (&label("unrolled")); |
f889bb03 AP |
245 | } } |
246 | &jmp (&label("loop")); | |
ee0449b1 | 247 | |
f889bb03 AP |
248 | sub COMPACT_LOOP() { |
249 | my $suffix=shift; | |
250 | ||
32213d8d | 251 | &set_label("loop$suffix",$suffix?32:16); |
ee0449b1 AP |
252 | # copy input block to stack reversing byte and dword order |
253 | for($i=0;$i<4;$i++) { | |
254 | &mov ("eax",&DWP($i*16+0,"edi")); | |
255 | &mov ("ebx",&DWP($i*16+4,"edi")); | |
256 | &mov ("ecx",&DWP($i*16+8,"edi")); | |
ee0449b1 | 257 | &bswap ("eax"); |
ee9bf3eb | 258 | &mov ("edx",&DWP($i*16+12,"edi")); |
ee0449b1 | 259 | &bswap ("ebx"); |
ee0449b1 | 260 | &push ("eax"); |
ee9bf3eb | 261 | &bswap ("ecx"); |
ee0449b1 | 262 | &push ("ebx"); |
ee9bf3eb | 263 | &bswap ("edx"); |
ee0449b1 AP |
264 | &push ("ecx"); |
265 | &push ("edx"); | |
266 | } | |
267 | &add ("edi",64); | |
ee9bf3eb AP |
268 | &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H |
269 | &mov (&DWP(4*(9+16)+4,"esp"),"edi"); | |
ee0449b1 AP |
270 | |
271 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
272 | &mov ($A,&DWP(0,"esi")); | |
273 | &mov ("ebx",&DWP(4,"esi")); | |
274 | &mov ("ecx",&DWP(8,"esi")); | |
275 | &mov ("edi",&DWP(12,"esi")); | |
276 | # &mov ($Aoff,$A); | |
277 | &mov ($Boff,"ebx"); | |
ee9bf3eb | 278 | &xor ("ebx","ecx"); |
ee0449b1 AP |
279 | &mov ($Coff,"ecx"); |
280 | &mov ($Doff,"edi"); | |
ee9bf3eb | 281 | &mov (&DWP(0,"esp"),"ebx"); # magic |
609b0852 | 282 | &mov ($E,&DWP(16,"esi")); |
ee0449b1 AP |
283 | &mov ("ebx",&DWP(20,"esi")); |
284 | &mov ("ecx",&DWP(24,"esi")); | |
285 | &mov ("edi",&DWP(28,"esi")); | |
286 | # &mov ($Eoff,$E); | |
287 | &mov ($Foff,"ebx"); | |
288 | &mov ($Goff,"ecx"); | |
289 | &mov ($Hoff,"edi"); | |
290 | ||
f889bb03 | 291 | &set_label("00_15$suffix",16); |
ee0449b1 AP |
292 | |
293 | &BODY_00_15(); | |
294 | ||
295 | &cmp ("esi",0xc19bf174); | |
f889bb03 | 296 | &jne (&label("00_15$suffix")); |
ee0449b1 | 297 | |
f889bb03 AP |
298 | &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) |
299 | &jmp (&label("16_63$suffix")); | |
300 | ||
301 | &set_label("16_63$suffix",16); | |
302 | ||
303 | &BODY_16_63(); | |
ee0449b1 AP |
304 | |
305 | &cmp ("esi",0xc67178f2); | |
f889bb03 | 306 | &jne (&label("16_63$suffix")); |
ee0449b1 | 307 | |
ee9bf3eb | 308 | &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx |
ee0449b1 AP |
309 | # &mov ($A,$Aoff); |
310 | &mov ("ebx",$Boff); | |
f889bb03 AP |
311 | # &mov ("edi",$Coff); |
312 | &mov ("ecx",$Doff); | |
ee0449b1 AP |
313 | &add ($A,&DWP(0,"esi")); |
314 | &add ("ebx",&DWP(4,"esi")); | |
f889bb03 AP |
315 | &add ("edi",&DWP(8,"esi")); |
316 | &add ("ecx",&DWP(12,"esi")); | |
ee0449b1 AP |
317 | &mov (&DWP(0,"esi"),$A); |
318 | &mov (&DWP(4,"esi"),"ebx"); | |
f889bb03 AP |
319 | &mov (&DWP(8,"esi"),"edi"); |
320 | &mov (&DWP(12,"esi"),"ecx"); | |
ee0449b1 AP |
321 | # &mov ($E,$Eoff); |
322 | &mov ("eax",$Foff); | |
323 | &mov ("ebx",$Goff); | |
324 | &mov ("ecx",$Hoff); | |
ee9bf3eb | 325 | &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp |
ee0449b1 AP |
326 | &add ($E,&DWP(16,"esi")); |
327 | &add ("eax",&DWP(20,"esi")); | |
328 | &add ("ebx",&DWP(24,"esi")); | |
329 | &add ("ecx",&DWP(28,"esi")); | |
330 | &mov (&DWP(16,"esi"),$E); | |
331 | &mov (&DWP(20,"esi"),"eax"); | |
332 | &mov (&DWP(24,"esi"),"ebx"); | |
333 | &mov (&DWP(28,"esi"),"ecx"); | |
334 | ||
ee9bf3eb | 335 | &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame |
ee0449b1 AP |
336 | &sub ($K256,4*64); # rewind K |
337 | ||
338 | &cmp ("edi",&DWP(8,"esp")); # are we done yet? | |
f889bb03 AP |
339 | &jb (&label("loop$suffix")); |
340 | } | |
341 | &COMPACT_LOOP(); | |
342 | &mov ("esp",&DWP(12,"esp")); # restore sp | |
343 | &function_end_A(); | |
32213d8d | 344 | if (!$i386 && !$xmm) { |
f889bb03 AP |
345 | # ~20% improvement on Sandy Bridge |
346 | local *ror = sub { &shrd(@_[0],@_) }; | |
347 | &COMPACT_LOOP("_shrd"); | |
ee0449b1 AP |
348 | &mov ("esp",&DWP(12,"esp")); # restore sp |
349 | &function_end_A(); | |
f889bb03 | 350 | } |
ee0449b1 AP |
351 | |
352 | &set_label("K256",64); # Yes! I keep it in the code segment! | |
f889bb03 AP |
353 | @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, |
354 | 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, | |
355 | 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, | |
356 | 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, | |
357 | 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, | |
358 | 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, | |
359 | 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, | |
360 | 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, | |
361 | 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, | |
362 | 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, | |
363 | 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, | |
364 | 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, | |
365 | 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, | |
366 | 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, | |
367 | 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, | |
368 | 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); | |
369 | &data_word(@K256); | |
32213d8d AP |
370 | &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask |
371 | &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
372 | ||
373 | ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets | |
374 | sub off { &DWP(4*(((shift)-$i)&7),"esp"); } # operand at 4*((arg-$i)&7) off esp; depends on the global $i at call time | |
f889bb03 AP |
375 | |
376 | if (!$i386 && $unroll_after) { | |
377 | my @AH=($A,$K256); | |
378 | ||
379 | &set_label("unrolled",16); | |
380 | &lea ("esp",&DWP(-96,"esp")); | |
381 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
382 | &mov ($AH[0],&DWP(0,"esi")); | |
383 | &mov ($AH[1],&DWP(4,"esi")); | |
384 | &mov ("ecx",&DWP(8,"esi")); | |
385 | &mov ("ebx",&DWP(12,"esi")); | |
386 | #&mov (&DWP(0,"esp"),$AH[0]); | |
387 | &mov (&DWP(4,"esp"),$AH[1]); | |
388 | &xor ($AH[1],"ecx"); # magic | |
389 | &mov (&DWP(8,"esp"),"ecx"); | |
390 | &mov (&DWP(12,"esp"),"ebx"); | |
609b0852 | 391 | &mov ($E,&DWP(16,"esi")); |
f889bb03 AP |
392 | &mov ("ebx",&DWP(20,"esi")); |
393 | &mov ("ecx",&DWP(24,"esi")); | |
394 | &mov ("esi",&DWP(28,"esi")); | |
395 | #&mov (&DWP(16,"esp"),$E); | |
396 | &mov (&DWP(20,"esp"),"ebx"); | |
397 | &mov (&DWP(24,"esp"),"ecx"); | |
398 | &mov (&DWP(28,"esp"),"esi"); | |
399 | &jmp (&label("grand_loop")); | |
400 | ||
401 | &set_label("grand_loop",16); | |
402 | # copy input block to stack reversing byte order | |
403 | for($i=0;$i<5;$i++) { | |
404 | &mov ("ebx",&DWP(12*$i+0,"edi")); | |
405 | &mov ("ecx",&DWP(12*$i+4,"edi")); | |
406 | &bswap ("ebx"); | |
407 | &mov ("esi",&DWP(12*$i+8,"edi")); | |
408 | &bswap ("ecx"); | |
409 | &mov (&DWP(32+12*$i+0,"esp"),"ebx"); | |
410 | &bswap ("esi"); | |
411 | &mov (&DWP(32+12*$i+4,"esp"),"ecx"); | |
412 | &mov (&DWP(32+12*$i+8,"esp"),"esi"); | |
413 | } | |
414 | &mov ("ebx",&DWP($i*12,"edi")); | |
415 | &add ("edi",64); | |
416 | &bswap ("ebx"); | |
417 | &mov (&DWP(96+4,"esp"),"edi"); | |
418 | &mov (&DWP(32+12*$i,"esp"),"ebx"); | |
419 | ||
3a9b3852 | 420 | my ($t1,$t2) = ("ecx","esi"); |
f889bb03 AP |
421 | |
422 | for ($i=0;$i<64;$i++) { | |
423 | ||
424 | if ($i>=16) { | |
3a9b3852 AP |
425 | &mov ($T,$t1); # $t1 is preloaded |
426 | # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); | |
427 | &ror ($t1,18-7); | |
428 | &mov ("edi",$t2); | |
429 | &ror ($t2,19-17); | |
430 | &xor ($t1,$T); | |
431 | &shr ($T,3); | |
432 | &ror ($t1,7); | |
433 | &xor ($t2,"edi"); | |
434 | &xor ($T,$t1); # T = sigma0(X[-15]) | |
435 | &ror ($t2,17); | |
f889bb03 AP |
436 | &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] |
437 | &shr ("edi",10); | |
438 | &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] | |
3a9b3852 | 439 | #&xor ("edi",$t2) # sigma1(X[-2]) |
f889bb03 AP |
440 | # &add ($T,"edi"); # T += sigma1(X[-2]) |
441 | # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] | |
442 | } | |
3a9b3852 AP |
443 | &mov ($t1,$E); |
444 | &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) | |
445 | &mov ($t2,&off($f)); | |
446 | &ror ($E,25-11); | |
f889bb03 AP |
447 | &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) |
448 | &mov ("edi",&off($g)); | |
3a9b3852 | 449 | &xor ($E,$t1); |
f889bb03 | 450 | &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] |
3a9b3852 AP |
451 | &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] |
452 | &xor ($t2,"edi"); | |
453 | &ror ($E,11-6); | |
454 | &and ($t2,$t1); | |
455 | &mov (&off($e),$t1); # save $E, modulo-scheduled | |
456 | &xor ($E,$t1); | |
f889bb03 | 457 | &add ($T,&off($h)); # T += h |
3a9b3852 | 458 | &xor ("edi",$t2); # Ch(e,f,g) |
f889bb03 | 459 | &ror ($E,6); # Sigma1(e) |
3a9b3852 AP |
460 | &mov ($t1,$AH[0]); |
461 | &add ($T,"edi"); # T += Ch(e,f,g) | |
f889bb03 | 462 | |
3a9b3852 AP |
463 | &ror ($t1,22-13); |
464 | &mov ($t2,$AH[0]); | |
f889bb03 | 465 | &mov ("edi",&off($b)); |
3a9b3852 AP |
466 | &xor ($t1,$AH[0]); |
467 | &mov (&off($a),$AH[0]); # save $A, modulo-scheduled | |
f889bb03 | 468 | &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round |
3a9b3852 AP |
469 | &ror ($t1,13-2); |
470 | &and ($AH[1],$AH[0]); # (b^c) &= (a^b) | |
471 | &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(e)+K[i], the sum lands in $E | |
472 | &xor ($t1,$t2); | |
473 | &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) | |
474 | &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); | |
475 | &ror ($t1,2); # Sigma0(a) | |
476 | ||
477 | &add ($AH[1],$E); # h += T | |
478 | &add ($E,&off($d)); # d += T | |
479 | &add ($AH[1],$t1); # h += Sigma0(a) | |
480 | &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); | |
481 | ||
482 | @AH = reverse(@AH); # rotate(a,h) | |
483 | ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) | |
f889bb03 AP |
484 | } |
485 | &mov ("esi",&DWP(96,"esp")); #ctx | |
486 | #&mov ($AH[0],&DWP(0,"esp")); | |
487 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
488 | #&mov ("edi", &DWP(8,"esp")); | |
489 | &mov ("ecx",&DWP(12,"esp")); | |
490 | &add ($AH[0],&DWP(0,"esi")); | |
491 | &add ($AH[1],&DWP(4,"esi")); | |
492 | &add ("edi",&DWP(8,"esi")); | |
493 | &add ("ecx",&DWP(12,"esi")); | |
494 | &mov (&DWP(0,"esi"),$AH[0]); | |
495 | &mov (&DWP(4,"esi"),$AH[1]); | |
496 | &mov (&DWP(8,"esi"),"edi"); | |
497 | &mov (&DWP(12,"esi"),"ecx"); | |
498 | #&mov (&DWP(0,"esp"),$AH[0]); | |
499 | &mov (&DWP(4,"esp"),$AH[1]); | |
500 | &xor ($AH[1],"edi"); # magic | |
501 | &mov (&DWP(8,"esp"),"edi"); | |
502 | &mov (&DWP(12,"esp"),"ecx"); | |
503 | #&mov ($E,&DWP(16,"esp")); | |
504 | &mov ("edi",&DWP(20,"esp")); | |
505 | &mov ("ebx",&DWP(24,"esp")); | |
506 | &mov ("ecx",&DWP(28,"esp")); | |
507 | &add ($E,&DWP(16,"esi")); | |
508 | &add ("edi",&DWP(20,"esi")); | |
509 | &add ("ebx",&DWP(24,"esi")); | |
510 | &add ("ecx",&DWP(28,"esi")); | |
511 | &mov (&DWP(16,"esi"),$E); | |
512 | &mov (&DWP(20,"esi"),"edi"); | |
513 | &mov (&DWP(24,"esi"),"ebx"); | |
514 | &mov (&DWP(28,"esi"),"ecx"); | |
515 | #&mov (&DWP(16,"esp"),$E); | |
516 | &mov (&DWP(20,"esp"),"edi"); | |
517 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
518 | &mov (&DWP(24,"esp"),"ebx"); | |
519 | &mov (&DWP(28,"esp"),"ecx"); | |
520 | ||
521 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
522 | &jb (&label("grand_loop")); | |
523 | ||
524 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
525 | &function_end_A(); | |
32213d8d AP |
526 | } |
527 | if (!$i386 && $xmm) {{{ | |
977f32e8 | 528 | if ($shaext) { |
619b9466 AP |
529 | ###################################################################### |
530 | # Intel SHA Extensions implementation of SHA256 update function. | |
531 | # | |
532 | my ($ctx,$inp,$end)=("esi","edi","eax"); | |
533 | my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); | |
534 | my @MSG=map("xmm$_",(3..6)); | |
535 | ||
536 | sub sha256op38 { # emit a 0F 38 xx instruction as raw bytes; args: opcode byte, destination, source | |
537 | my ($opcodelet,$dst,$src)=@_; | |
538 | if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) # only xmm0-7 pairs are encoded; anything else emits nothing | |
539 | { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } # last byte: 0xc0 + destination number in bits 3-5 + source number in bits 0-2 | |
540 | } | |
541 | sub sha256rnds2 { sha256op38(0xcb,@_); } # opcode byte 0xcb | |
542 | sub sha256msg1 { sha256op38(0xcc,@_); } # opcode byte 0xcc | |
543 | sub sha256msg2 { sha256op38(0xcd,@_); } # opcode byte 0xcd | |
544 | ||
545 | &set_label("shaext",32); | |
546 | &sub ("esp",32); | |
547 | ||
548 | &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA | |
549 | &lea ($K256,&DWP(0x80,$K256)); | |
550 | &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE | |
551 | &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask | |
552 | ||
553 | &pshufd ($Wi,$ABEF,0x1b); # ABCD | |
554 | &pshufd ($ABEF,$ABEF,0xb1); # CDAB | |
555 | &pshufd ($CDGH,$CDGH,0x1b); # EFGH | |
556 | &palignr ($ABEF,$CDGH,8); # ABEF | |
557 | &punpcklqdq ($CDGH,$Wi); # CDGH | |
558 | &jmp (&label("loop_shaext")); | |
559 | ||
560 | &set_label("loop_shaext",16); | |
561 | &movdqu (@MSG[0],&QWP(0,$inp)); | |
562 | &movdqu (@MSG[1],&QWP(0x10,$inp)); | |
563 | &movdqu (@MSG[2],&QWP(0x20,$inp)); | |
564 | &pshufb (@MSG[0],$TMP); | |
565 | &movdqu (@MSG[3],&QWP(0x30,$inp)); | |
566 | &movdqa (&QWP(16,"esp"),$CDGH); # offload | |
567 | ||
568 | &movdqa ($Wi,&QWP(0*16-0x80,$K256)); | |
569 | &paddd ($Wi,@MSG[0]); | |
570 | &pshufb (@MSG[1],$TMP); | |
571 | &sha256rnds2 ($CDGH,$ABEF); # 0-3 | |
572 | &pshufd ($Wi,$Wi,0x0e); | |
573 | &nop (); | |
574 | &movdqa (&QWP(0,"esp"),$ABEF); # offload | |
575 | &sha256rnds2 ($ABEF,$CDGH); | |
576 | ||
577 | &movdqa ($Wi,&QWP(1*16-0x80,$K256)); | |
578 | &paddd ($Wi,@MSG[1]); | |
579 | &pshufb (@MSG[2],$TMP); | |
580 | &sha256rnds2 ($CDGH,$ABEF); # 4-7 | |
581 | &pshufd ($Wi,$Wi,0x0e); | |
582 | &lea ($inp,&DWP(0x40,$inp)); | |
583 | &sha256msg1 (@MSG[0],@MSG[1]); | |
584 | &sha256rnds2 ($ABEF,$CDGH); | |
585 | ||
586 | &movdqa ($Wi,&QWP(2*16-0x80,$K256)); | |
587 | &paddd ($Wi,@MSG[2]); | |
588 | &pshufb (@MSG[3],$TMP); | |
589 | &sha256rnds2 ($CDGH,$ABEF); # 8-11 | |
590 | &pshufd ($Wi,$Wi,0x0e); | |
591 | &movdqa ($TMP,@MSG[3]); | |
592 | &palignr ($TMP,@MSG[2],4); | |
593 | &nop (); | |
594 | &paddd (@MSG[0],$TMP); | |
595 | &sha256msg1 (@MSG[1],@MSG[2]); | |
596 | &sha256rnds2 ($ABEF,$CDGH); | |
597 | ||
598 | &movdqa ($Wi,&QWP(3*16-0x80,$K256)); | |
599 | &paddd ($Wi,@MSG[3]); | |
600 | &sha256msg2 (@MSG[0],@MSG[3]); | |
601 | &sha256rnds2 ($CDGH,$ABEF); # 12-15 | |
602 | &pshufd ($Wi,$Wi,0x0e); | |
603 | &movdqa ($TMP,@MSG[0]); | |
604 | &palignr ($TMP,@MSG[3],4); | |
605 | &nop (); | |
606 | &paddd (@MSG[1],$TMP); | |
607 | &sha256msg1 (@MSG[2],@MSG[3]); | |
608 | &sha256rnds2 ($ABEF,$CDGH); | |
609 | ||
610 | for($i=4;$i<16-3;$i++) { | |
611 | &movdqa ($Wi,&QWP($i*16-0x80,$K256)); | |
612 | &paddd ($Wi,@MSG[0]); | |
613 | &sha256msg2 (@MSG[1],@MSG[0]); | |
614 | &sha256rnds2 ($CDGH,$ABEF); # 16-19... | |
615 | &pshufd ($Wi,$Wi,0x0e); | |
616 | &movdqa ($TMP,@MSG[1]); | |
617 | &palignr ($TMP,@MSG[0],4); | |
618 | &nop (); | |
619 | &paddd (@MSG[2],$TMP); | |
620 | &sha256msg1 (@MSG[3],@MSG[0]); | |
621 | &sha256rnds2 ($ABEF,$CDGH); | |
622 | ||
623 | push(@MSG,shift(@MSG)); | |
624 | } | |
625 | &movdqa ($Wi,&QWP(13*16-0x80,$K256)); # 14th 16-byte group of K256 ($K256 was advanced by 0x80 on entry) | |
626 | &paddd ($Wi,@MSG[0]); | |
627 | &sha256msg2 (@MSG[1],@MSG[0]); | |
628 | &sha256rnds2 ($CDGH,$ABEF); # 52-55 | |
629 | &pshufd ($Wi,$Wi,0x0e); | |
630 | &movdqa ($TMP,@MSG[1]); # terminated explicitly so this call and the next are separate statements | |
631 | &palignr ($TMP,@MSG[0],4); | |
632 | &sha256rnds2 ($ABEF,$CDGH); | |
633 | &paddd (@MSG[2],$TMP); | |
634 | ||
635 | &movdqa ($Wi,&QWP(14*16-0x80,$K256)); | |
636 | &paddd ($Wi,@MSG[1]); | |
637 | &sha256rnds2 ($CDGH,$ABEF); # 56-59 | |
638 | &pshufd ($Wi,$Wi,0x0e); | |
639 | &sha256msg2 (@MSG[2],@MSG[1]); | |
640 | &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask | |
641 | &sha256rnds2 ($ABEF,$CDGH); | |
642 | ||
643 | &movdqa ($Wi,&QWP(15*16-0x80,$K256)); | |
644 | &paddd ($Wi,@MSG[2]); | |
645 | &nop (); | |
646 | &sha256rnds2 ($CDGH,$ABEF); # 60-63 | |
647 | &pshufd ($Wi,$Wi,0x0e); | |
648 | &cmp ($end,$inp); | |
649 | &nop (); | |
650 | &sha256rnds2 ($ABEF,$CDGH); | |
651 | ||
652 | &paddd ($CDGH,&QWP(16,"esp")); | |
653 | &paddd ($ABEF,&QWP(0,"esp")); | |
654 | &jnz (&label("loop_shaext")); | |
655 | ||
656 | &pshufd ($CDGH,$CDGH,0xb1); # DCHG | |
657 | &pshufd ($TMP,$ABEF,0x1b); # FEBA | |
658 | &pshufd ($ABEF,$ABEF,0xb1); # BAFE | |
659 | &punpckhqdq ($ABEF,$CDGH); # DCBA | |
660 | &palignr ($CDGH,$TMP,8); # HGFE | |
661 | ||
662 | &mov ("esp",&DWP(32+12,"esp")); | |
663 | &movdqu (&QWP(0,$ctx),$ABEF); | |
664 | &movdqu (&QWP(16,$ctx),$CDGH); | |
665 | &function_end_A(); | |
666 | } | |
667 | ||
f3eac74b AP |
668 | my @X = map("xmm$_",(0..3)); |
669 | my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); | |
670 | my @AH = ($A,$T); | |
671 | ||
32213d8d | 672 | &set_label("SSSE3",32); |
f3eac74b | 673 | &lea ("esp",&DWP(-96,"esp")); |
f3eac74b AP |
674 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack |
675 | &mov ($AH[0],&DWP(0,"esi")); | |
676 | &mov ($AH[1],&DWP(4,"esi")); | |
677 | &mov ("ecx",&DWP(8,"esi")); | |
678 | &mov ("edi",&DWP(12,"esi")); | |
679 | #&mov (&DWP(0,"esp"),$AH[0]); | |
680 | &mov (&DWP(4,"esp"),$AH[1]); | |
681 | &xor ($AH[1],"ecx"); # magic | |
682 | &mov (&DWP(8,"esp"),"ecx"); | |
683 | &mov (&DWP(12,"esp"),"edi"); | |
684 | &mov ($E,&DWP(16,"esi")); | |
685 | &mov ("edi",&DWP(20,"esi")); | |
686 | &mov ("ecx",&DWP(24,"esi")); | |
687 | &mov ("esi",&DWP(28,"esi")); | |
688 | #&mov (&DWP(16,"esp"),$E); | |
689 | &mov (&DWP(20,"esp"),"edi"); | |
690 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
691 | &mov (&DWP(24,"esp"),"ecx"); | |
692 | &mov (&DWP(28,"esp"),"esi"); | |
32213d8d AP |
693 | &movdqa ($t3,&QWP(256,$K256)); |
694 | &jmp (&label("grand_ssse3")); | |
f3eac74b | 695 | |
32213d8d | 696 | &set_label("grand_ssse3",16); |
f3eac74b | 697 | # load input, reverse byte order, add K256[0..15], save to stack |
32213d8d AP |
698 | &movdqu (@X[0],&QWP(0,"edi")); |
699 | &movdqu (@X[1],&QWP(16,"edi")); | |
700 | &movdqu (@X[2],&QWP(32,"edi")); | |
701 | &movdqu (@X[3],&QWP(48,"edi")); | |
702 | &add ("edi",64); | |
703 | &pshufb (@X[0],$t3); | |
704 | &mov (&DWP(96+4,"esp"),"edi"); | |
705 | &pshufb (@X[1],$t3); | |
706 | &movdqa ($t0,&QWP(0,$K256)); | |
707 | &pshufb (@X[2],$t3); | |
708 | &movdqa ($t1,&QWP(16,$K256)); | |
709 | &paddd ($t0,@X[0]); | |
710 | &pshufb (@X[3],$t3); | |
711 | &movdqa ($t2,&QWP(32,$K256)); | |
712 | &paddd ($t1,@X[1]); | |
713 | &movdqa ($t3,&QWP(48,$K256)); | |
714 | &movdqa (&QWP(32+0,"esp"),$t0); | |
715 | &paddd ($t2,@X[2]); | |
716 | &movdqa (&QWP(32+16,"esp"),$t1); | |
717 | &paddd ($t3,@X[3]); | |
718 | &movdqa (&QWP(32+32,"esp"),$t2); | |
719 | &movdqa (&QWP(32+48,"esp"),$t3); | |
720 | &jmp (&label("ssse3_00_47")); | |
721 | ||
722 | &set_label("ssse3_00_47",16); | |
f3eac74b AP |
723 | &add ($K256,64); |
724 | ||
32213d8d | 725 | sub SSSE3_00_47 () { |
f3eac74b AP |
726 | my $j = shift; |
727 | my $body = shift; | |
728 | my @X = @_; | |
729 | my @insns = (&$body,&$body,&$body,&$body); # 120 instructions | |
730 | ||
f3eac74b | 731 | eval(shift(@insns)); |
32213d8d AP |
732 | &movdqa ($t0,@X[1]); |
733 | eval(shift(@insns)); # @ | |
f3eac74b | 734 | eval(shift(@insns)); |
32213d8d | 735 | &movdqa ($t3,@X[3]); |
f3eac74b AP |
736 | eval(shift(@insns)); |
737 | eval(shift(@insns)); | |
32213d8d | 738 | &palignr ($t0,@X[0],4); # X[1..4] |
f3eac74b | 739 | eval(shift(@insns)); |
32213d8d | 740 | eval(shift(@insns)); # @ |
f3eac74b | 741 | eval(shift(@insns)); |
32213d8d | 742 | &palignr ($t3,@X[2],4); # X[9..12] |
f3eac74b AP |
743 | eval(shift(@insns)); |
744 | eval(shift(@insns)); | |
745 | eval(shift(@insns)); | |
32213d8d AP |
746 | &movdqa ($t1,$t0); |
747 | eval(shift(@insns)); # @ | |
f3eac74b | 748 | eval(shift(@insns)); |
32213d8d | 749 | &movdqa ($t2,$t0); |
f3eac74b AP |
750 | eval(shift(@insns)); |
751 | eval(shift(@insns)); | |
32213d8d | 752 | &psrld ($t0,3); |
f3eac74b | 753 | eval(shift(@insns)); |
32213d8d AP |
754 | eval(shift(@insns)); # @ |
755 | &paddd (@X[0],$t3); # X[0..3] += X[9..12] | |
f3eac74b | 756 | eval(shift(@insns)); |
f3eac74b | 757 | eval(shift(@insns)); |
32213d8d | 758 | &psrld ($t2,7); |
f3eac74b | 759 | eval(shift(@insns)); |
f3eac74b | 760 | eval(shift(@insns)); |
32213d8d | 761 | eval(shift(@insns)); # @ |
f3eac74b | 762 | eval(shift(@insns)); |
32213d8d | 763 | &pshufd ($t3,@X[3],0b11111010); # X[14..15] |
f3eac74b AP |
764 | eval(shift(@insns)); |
765 | eval(shift(@insns)); | |
32213d8d | 766 | &pslld ($t1,32-18); |
f3eac74b | 767 | eval(shift(@insns)); |
32213d8d AP |
768 | eval(shift(@insns)); # @ |
769 | &pxor ($t0,$t2); | |
f3eac74b AP |
770 | eval(shift(@insns)); |
771 | eval(shift(@insns)); | |
32213d8d | 772 | &psrld ($t2,18-7); |
f3eac74b | 773 | eval(shift(@insns)); |
f3eac74b | 774 | eval(shift(@insns)); |
32213d8d AP |
775 | eval(shift(@insns)); # @ |
776 | &pxor ($t0,$t1); | |
f3eac74b AP |
777 | eval(shift(@insns)); |
778 | eval(shift(@insns)); | |
32213d8d | 779 | &pslld ($t1,18-7); |
f3eac74b | 780 | eval(shift(@insns)); |
f3eac74b | 781 | eval(shift(@insns)); |
32213d8d AP |
782 | eval(shift(@insns)); # @ |
783 | &pxor ($t0,$t2); | |
f3eac74b AP |
784 | eval(shift(@insns)); |
785 | eval(shift(@insns)); | |
32213d8d | 786 | &movdqa ($t2,$t3); |
f3eac74b | 787 | eval(shift(@insns)); |
f3eac74b | 788 | eval(shift(@insns)); |
32213d8d AP |
789 | eval(shift(@insns)); # @ |
790 | &pxor ($t0,$t1); # sigma0(X[1..4]) | |
f3eac74b AP |
791 | eval(shift(@insns)); |
792 | eval(shift(@insns)); | |
32213d8d | 793 | &psrld ($t3,10); |
f3eac74b | 794 | eval(shift(@insns)); |
f3eac74b | 795 | eval(shift(@insns)); |
32213d8d AP |
796 | eval(shift(@insns)); # @ |
797 | &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) | |
f3eac74b | 798 | eval(shift(@insns)); |
f3eac74b | 799 | eval(shift(@insns)); |
32213d8d | 800 | &psrlq ($t2,17); |
f3eac74b | 801 | eval(shift(@insns)); |
f3eac74b | 802 | eval(shift(@insns)); |
32213d8d AP |
803 | eval(shift(@insns)); # @ |
804 | &pxor ($t3,$t2); | |
f3eac74b | 805 | eval(shift(@insns)); |
f3eac74b | 806 | eval(shift(@insns)); |
32213d8d | 807 | &psrlq ($t2,19-17); |
f3eac74b AP |
808 | eval(shift(@insns)); |
809 | eval(shift(@insns)); | |
32213d8d AP |
810 | eval(shift(@insns)); # @ |
811 | &pxor ($t3,$t2); | |
f3eac74b | 812 | eval(shift(@insns)); |
f3eac74b | 813 | eval(shift(@insns)); |
32213d8d | 814 | &pshufd ($t3,$t3,0b10000000); |
f3eac74b AP |
815 | eval(shift(@insns)); |
816 | eval(shift(@insns)); | |
32213d8d | 817 | eval(shift(@insns)); # @ |
f3eac74b | 818 | eval(shift(@insns)); |
f3eac74b AP |
819 | eval(shift(@insns)); |
820 | eval(shift(@insns)); | |
821 | eval(shift(@insns)); | |
32213d8d | 822 | eval(shift(@insns)); # @ |
f3eac74b | 823 | eval(shift(@insns)); |
32213d8d | 824 | &psrldq ($t3,8); |
f3eac74b AP |
825 | eval(shift(@insns)); |
826 | eval(shift(@insns)); | |
827 | eval(shift(@insns)); | |
32213d8d AP |
828 | &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) |
829 | eval(shift(@insns)); # @ | |
f3eac74b | 830 | eval(shift(@insns)); |
32213d8d AP |
831 | eval(shift(@insns)); |
832 | eval(shift(@insns)); | |
833 | eval(shift(@insns)); | |
834 | eval(shift(@insns)); # @ | |
835 | eval(shift(@insns)); | |
836 | &pshufd ($t3,@X[0],0b01010000); # X[16..17] | |
837 | eval(shift(@insns)); | |
838 | eval(shift(@insns)); | |
839 | eval(shift(@insns)); | |
840 | &movdqa ($t2,$t3); | |
841 | eval(shift(@insns)); # @ | |
842 | &psrld ($t3,10); | |
843 | eval(shift(@insns)); | |
844 | &psrlq ($t2,17); | |
845 | eval(shift(@insns)); | |
846 | eval(shift(@insns)); | |
847 | eval(shift(@insns)); | |
848 | eval(shift(@insns)); # @ | |
849 | &pxor ($t3,$t2); | |
850 | eval(shift(@insns)); | |
851 | eval(shift(@insns)); | |
852 | &psrlq ($t2,19-17); | |
853 | eval(shift(@insns)); | |
854 | eval(shift(@insns)); | |
855 | eval(shift(@insns)); # @ | |
856 | &pxor ($t3,$t2); | |
857 | eval(shift(@insns)); | |
858 | eval(shift(@insns)); | |
859 | eval(shift(@insns)); | |
860 | &pshufd ($t3,$t3,0b00001000); | |
861 | eval(shift(@insns)); | |
862 | eval(shift(@insns)); # @ | |
863 | &movdqa ($t2,&QWP(16*$j,$K256)); | |
864 | eval(shift(@insns)); | |
865 | eval(shift(@insns)); | |
866 | &pslldq ($t3,8); | |
867 | eval(shift(@insns)); | |
868 | eval(shift(@insns)); | |
869 | eval(shift(@insns)); # @ | |
870 | eval(shift(@insns)); | |
871 | eval(shift(@insns)); | |
872 | eval(shift(@insns)); | |
873 | eval(shift(@insns)); | |
874 | eval(shift(@insns)); # @ | |
875 | &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) | |
876 | eval(shift(@insns)); | |
877 | eval(shift(@insns)); | |
878 | eval(shift(@insns)); | |
879 | eval(shift(@insns)); | |
880 | &paddd ($t2,@X[0]); | |
881 | eval(shift(@insns)); # @ | |
f3eac74b AP |
882 | |
883 | foreach (@insns) { eval; } # remaining instructions | |
884 | ||
32213d8d | 885 | &movdqa (&QWP(32+16*$j,"esp"),$t2); |
f3eac74b AP |
886 | } |
887 | ||
# body_00_15: return one scalar SHA-256 round as a list of perlasm statement
# strings.  Nothing is emitted here; callers eval the strings, either back to
# back or interleaved with vector instructions (the sub is also passed by
# reference to SSSE3_00_47 and AVX_00_47).  The strings refer to $E, @AH, $i
# and &off() from the enclosing scope, so they must be eval'ed where those
# are visible.
# The list has 30 elements: the last '&add' is joined with '.' (not ',') to
# the '@AH = reverse(@AH); $i++;' bookkeeping, so evaluating the final element
# both emits the add and advances the generator state for the next round.
888 | sub body_00_15 () { | |
889 | ( | |
890 | '&mov ("ecx",$E);', | |
f3eac74b | 891 | '&ror ($E,25-11);', |
32213d8d | 892 | '&mov ("esi",&off($f));', |
f3eac74b | 893 | '&xor ($E,"ecx");', |
32213d8d | 894 | '&mov ("edi",&off($g));', |
f3eac74b AP |
895 | '&xor ("esi","edi");', |
896 | '&ror ($E,11-6);', | |
897 | '&and ("esi","ecx");', | |
898 | '&mov (&off($e),"ecx");', # save $E, modulo-scheduled | |
899 | '&xor ($E,"ecx");', | |
900 | '&xor ("edi","esi");', # Ch(e,f,g) | |
901 | '&ror ($E,6);', # T = Sigma1(e) | |
902 | '&mov ("ecx",$AH[0]);', | |
32213d8d AP |
903 | '&add ($E,"edi");', # T += Ch(e,f,g) |
904 | '&mov ("edi",&off($b));', | |
f3eac74b | 905 | '&mov ("esi",$AH[0]);', |
f3eac74b AP |
906 | |
907 | '&ror ("ecx",22-13);', | |
f3eac74b | 908 | '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled |
32213d8d | 909 | '&xor ("ecx",$AH[0]);', |
f3eac74b | 910 | '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round |
32213d8d | 911 | '&add ($E,&off($h));', # T += h |
f3eac74b AP |
912 | '&ror ("ecx",13-2);', |
913 | '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) | |
f3eac74b | 914 | '&xor ("ecx","esi");', |
32213d8d | 915 | '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] |
f3eac74b AP |
916 | '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) |
917 | '&ror ("ecx",2);', # Sigma0(a) | |
918 | ||
919 | '&add ($AH[1],$E);', # h += T | |
920 | '&add ($E,&off($d));', # d += T | |
921 | '&add ($AH[1],"ecx");'. # h += Sigma0(a) | |
922 | ||
923 | '@AH = reverse(@AH); $i++;' # rotate(a,h) | |
924 | ); | |
925 | } | |
926 | ||
# SSSE3 path, loop body and epilogue.
# 1. Four SSSE3_00_47 steps are generated, each given the scalar round
#    generator and the current @X; @X is rotated after every step.  $j is 4
#    when the loop ends, so the cmp below reads the dword at 64($K256) and
#    the generated code branches back to "ssse3_00_47" until it equals
#    0x00010203.
#    NOTE(review): 0x00010203 is presumably the first dword of the byte-swap
#    mask stored right after the K256 table (the same 64($K256) address is
#    reloaded into $t3 further down) -- confirm against the table definition.
# 2. Sixteen more rounds are generated from body_00_15 alone; the loop has no
#    increment clause because the eval'ed strings advance $i themselves.
# 3. The working variables are added into the context at (esi), written back
#    there and to the stack slots, K is rewound by 3*64, and the code loops
#    to "grand_ssse3" while edi is below the value kept at 96+8(esp).
#    The commented-out movs mark values that are already in registers.
927 | for ($i=0,$j=0; $j<4; $j++) { | |
32213d8d | 928 | &SSSE3_00_47($j,\&body_00_15,@X); |
f3eac74b AP |
929 | push(@X,shift(@X)); # rotate(@X) |
930 | } | |
931 | &cmp (&DWP(16*$j,$K256),0x00010203); | |
32213d8d | 932 | &jne (&label("ssse3_00_47")); |
f3eac74b AP |
933 | |
934 | for ($i=0; $i<16; ) { | |
935 | foreach(body_00_15()) { eval; } | |
936 | } | |
937 | ||
938 | &mov ("esi",&DWP(96,"esp")); #ctx | |
939 | #&mov ($AH[0],&DWP(0,"esp")); | |
940 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
941 | #&mov ("edi", &DWP(8,"esp")); | |
942 | &mov ("ecx",&DWP(12,"esp")); | |
943 | &add ($AH[0],&DWP(0,"esi")); | |
944 | &add ($AH[1],&DWP(4,"esi")); | |
945 | &add ("edi",&DWP(8,"esi")); | |
946 | &add ("ecx",&DWP(12,"esi")); | |
947 | &mov (&DWP(0,"esi"),$AH[0]); | |
948 | &mov (&DWP(4,"esi"),$AH[1]); | |
949 | &mov (&DWP(8,"esi"),"edi"); | |
950 | &mov (&DWP(12,"esi"),"ecx"); | |
951 | #&mov (&DWP(0,"esp"),$AH[0]); | |
952 | &mov (&DWP(4,"esp"),$AH[1]); | |
953 | &xor ($AH[1],"edi"); # magic | |
954 | &mov (&DWP(8,"esp"),"edi"); | |
955 | &mov (&DWP(12,"esp"),"ecx"); | |
956 | #&mov ($E,&DWP(16,"esp")); | |
957 | &mov ("edi",&DWP(20,"esp")); | |
958 | &mov ("ecx",&DWP(24,"esp")); | |
959 | &add ($E,&DWP(16,"esi")); | |
960 | &add ("edi",&DWP(20,"esi")); | |
961 | &add ("ecx",&DWP(24,"esi")); | |
962 | &mov (&DWP(16,"esi"),$E); | |
963 | &mov (&DWP(20,"esi"),"edi"); | |
964 | &mov (&DWP(20,"esp"),"edi"); | |
965 | &mov ("edi",&DWP(28,"esp")); | |
966 | &mov (&DWP(24,"esi"),"ecx"); | |
967 | #&mov (&DWP(16,"esp"),$E); | |
968 | &add ("edi",&DWP(28,"esi")); | |
969 | &mov (&DWP(24,"esp"),"ecx"); | |
970 | &mov (&DWP(28,"esi"),"edi"); | |
971 | &mov (&DWP(28,"esp"),"edi"); | |
972 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
973 | ||
32213d8d | 974 | &movdqa ($t3,&QWP(64,$K256)); |
f3eac74b AP |
975 | &sub ($K256,3*64); # rewind K |
976 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
32213d8d | 977 | &jb (&label("grand_ssse3")); |
f3eac74b AP |
978 | |
979 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
f3eac74b | 980 | &function_end_A(); |
32213d8d AP |
# AVX path (generated only when $avx is set): entry label, optional dispatch
# to the BMI variant, state setup, and the per-block input load.
981 | if ($avx) { |
982 | &set_label("AVX",32); | |
# With $avx>1 the generated code masks edx down to bits 8 and 3 and jumps to
# "AVX_BMI" only when both are set.
983 | if ($avx>1) { | |
32213d8d AP |
984 | &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 |
985 | &cmp ("edx",1<<8|1<<3); | |
986 | &je (&label("AVX_BMI")); | |
987 | } | |
f3eac74b AP |
988 | &lea ("esp",&DWP(-96,"esp")); |
989 | &vzeroall (); | |
990 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
# The commented-out stores are for values kept in registers ($AH[0], $E);
# $AH[1] is xor'ed with the next word after being stored ("magic"), matching
# the (b^c) form the round code expects.
991 | &mov ($AH[0],&DWP(0,"esi")); | |
992 | &mov ($AH[1],&DWP(4,"esi")); | |
993 | &mov ("ecx",&DWP(8,"esi")); | |
994 | &mov ("edi",&DWP(12,"esi")); | |
995 | #&mov (&DWP(0,"esp"),$AH[0]); | |
996 | &mov (&DWP(4,"esp"),$AH[1]); | |
997 | &xor ($AH[1],"ecx"); # magic | |
998 | &mov (&DWP(8,"esp"),"ecx"); | |
999 | &mov (&DWP(12,"esp"),"edi"); | |
1000 | &mov ($E,&DWP(16,"esi")); | |
1001 | &mov ("edi",&DWP(20,"esi")); | |
1002 | &mov ("ecx",&DWP(24,"esi")); | |
1003 | &mov ("esi",&DWP(28,"esi")); | |
1004 | #&mov (&DWP(16,"esp"),$E); | |
1005 | &mov (&DWP(20,"esp"),"edi"); | |
1006 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1007 | &mov (&DWP(24,"esp"),"ecx"); | |
1008 | &mov (&DWP(28,"esp"),"esi"); | |
# $t3 is loaded from 256($K256) and used below as the vpshufb control.
2dce10c5 | 1009 | &vmovdqa ($t3,&QWP(256,$K256)); |
f3eac74b AP |
1010 | &jmp (&label("grand_avx")); |
1011 | ||
32213d8d | 1012 | &set_label("grand_avx",32); |
f3eac74b AP |
1013 | # load input, reverse byte order, add K256[0..15], save to stack |
1014 | &vmovdqu (@X[0],&QWP(0,"edi")); | |
1015 | &vmovdqu (@X[1],&QWP(16,"edi")); | |
1016 | &vmovdqu (@X[2],&QWP(32,"edi")); | |
1017 | &vmovdqu (@X[3],&QWP(48,"edi")); | |
1018 | &add ("edi",64); | |
1019 | &vpshufb (@X[0],@X[0],$t3); | |
1020 | &mov (&DWP(96+4,"esp"),"edi"); | |
1021 | &vpshufb (@X[1],@X[1],$t3); | |
1022 | &vpshufb (@X[2],@X[2],$t3); | |
1023 | &vpaddd ($t0,@X[0],&QWP(0,$K256)); | |
1024 | &vpshufb (@X[3],@X[3],$t3); | |
1025 | &vpaddd ($t1,@X[1],&QWP(16,$K256)); | |
1026 | &vpaddd ($t2,@X[2],&QWP(32,$K256)); | |
1027 | &vpaddd ($t3,@X[3],&QWP(48,$K256)); | |
1028 | &vmovdqa (&QWP(32+0,"esp"),$t0); | |
1029 | &vmovdqa (&QWP(32+16,"esp"),$t1); | |
1030 | &vmovdqa (&QWP(32+32,"esp"),$t2); | |
1031 | &vmovdqa (&QWP(32+48,"esp"),$t3); | |
1032 | &jmp (&label("avx_00_47")); | |
1033 | ||
# Loop head: every pass over "avx_00_47" advances $K256 by 64 bytes.
1034 | &set_label("avx_00_47",16); | |
1035 | &add ($K256,64); | |
1036 | ||
# Xupdate_AVX: return the AVX message-schedule update for four words as a
# list of 31 perlasm statement strings (nothing is emitted until a caller
# evals them).  Following the per-line notes, the sequence computes
#   X[0..3] += X[9..12] + sigma0(X[1..4])
# then adds sigma1(X[14..15]) to the low two lanes and, using those freshly
# produced words X[16..17], sigma1(X[16..17]) to the high two lanes.
# The strings reference @X and $t0..$t3 as seen at the eval site.
1037 | sub Xupdate_AVX () { | |
1038 | ( | |
1039 | '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] | |
1040 | '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] | |
1041 | '&vpsrld ($t2,$t0,7);', | |
1042 | '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..12] | |
1043 | '&vpsrld ($t3,$t0,3);', | |
1044 | '&vpslld ($t1,$t0,14);', | |
1045 | '&vpxor ($t0,$t3,$t2);', | |
47edeb9f | 1046 | '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] |
f3eac74b AP |
1047 | '&vpsrld ($t2,$t2,18-7);', |
1048 | '&vpxor ($t0,$t0,$t1);', | |
1049 | '&vpslld ($t1,$t1,25-14);', | |
1050 | '&vpxor ($t0,$t0,$t2);', | |
47edeb9f | 1051 | '&vpsrld ($t2,$t3,10);', |
f3eac74b | 1052 | '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) |
47edeb9f | 1053 | '&vpsrlq ($t1,$t3,17);', |
f3eac74b | 1054 | '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) |
47edeb9f AP |
1055 | '&vpxor ($t2,$t2,$t1);', |
1056 | '&vpsrlq ($t3,$t3,19);', | |
1057 | '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15]) | |
1058 | '&vpshufd ($t3,$t2,0b10000100);', | |
f3eac74b AP |
1059 | '&vpsrldq ($t3,$t3,8);', |
1060 | '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) | |
47edeb9f AP |
1061 | '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] |
1062 | '&vpsrld ($t2,$t3,10);', | |
1063 | '&vpsrlq ($t1,$t3,17);', | |
1064 | '&vpxor ($t2,$t2,$t1);', | |
1065 | '&vpsrlq ($t3,$t3,19);', | |
1066 | '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17]) | |
1067 | '&vpshufd ($t3,$t2,0b11101000);', | |
f3eac74b AP |
1068 | '&vpslldq ($t3,$t3,8);', |
1069 | '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) | |
1070 | ); | |
1071 | } | |
1072 | ||
# From this statement until the enclosing block is left, &ror is redirected
# to &shrd with the first argument repeated, so the '&ror' strings produced by
# body_00_15 and eval'ed afterwards are emitted as shrd (the "AVX+shrd" code
# path mentioned in the file header).
1073 | local *ror = sub { &shrd(@_[0],@_) }; | |
# AVX_00_47($j, $body, @X): emit four scalar rounds interleaved with one
# vector message-schedule update.
#   $j    - step index 0..3; selects the 16*$j slot in K256 and on the stack
#   $body - code ref returning one round as a list of statement strings
#           (body_00_15 or bodyx_00_15)
#   @X    - the four schedule registers in their current rotation
# For every Xupdate_AVX statement, three scalar statements are evaluated; a
# fourth is pulled in when the third was a rorx and the next one is a rorx
# too, so such pairs are issued together.  After the vector statements are
# exhausted, X[0]+K256[16*$j] is formed in $t2, the remaining scalar
# statements are flushed, and $t2 is stored at 32+16*$j(esp).
# The empty prototype is not enforced because the sub is invoked with the
# &name(...) call form.
1074 | sub AVX_00_47 () { | |
1075 | my $j = shift; | |
1076 | my $body = shift; | |
1077 | my @X = @_; | |
1078 | my @insns = (&$body,&$body,&$body,&$body); # 120 instructions | |
32213d8d | 1079 | my $insn; |
f3eac74b | 1080 | |
47edeb9f | 1081 | foreach (Xupdate_AVX()) { # 31 instructions |
f3eac74b AP |
1082 | eval; |
1083 | eval(shift(@insns)); | |
1084 | eval(shift(@insns)); | |
32213d8d AP |
1085 | eval($insn = shift(@insns)); |
1086 | eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); | |
f3eac74b AP |
1087 | } |
1088 | &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); | |
1089 | foreach (@insns) { eval; } # remaining instructions | |
1090 | &vmovdqa (&QWP(32+16*$j,"esp"),$t2); | |
1091 | } | |
1092 | ||
# AVX path, loop body and epilogue.
# Four AVX_00_47 steps are generated with @X rotated after each one.  $j is 4
# afterwards, so the cmp reads the dword at 64($K256); the generated code
# loops to "avx_00_47" until that dword equals 0x00010203.
# NOTE(review): the sentinel is presumably the first dword of the vpshufb
# mask that follows the K256 table (64($K256) is reloaded into $t3 below) --
# confirm against the table definition.
1093 | for ($i=0,$j=0; $j<4; $j++) { | |
1094 | &AVX_00_47($j,\&body_00_15,@X); | |
1095 | push(@X,shift(@X)); # rotate(@X) | |
1096 | } | |
1097 | &cmp (&DWP(16*$j,$K256),0x00010203); | |
1098 | &jne (&label("avx_00_47")); | |
1099 | ||
# Sixteen more rounds from body_00_15 alone; $i is advanced by the eval'ed
# strings, hence the empty increment clause.
1100 | for ($i=0; $i<16; ) { | |
1101 | foreach(body_00_15()) { eval; } | |
1102 | } | |
1103 | ||
# Add the working variables into the context at (esi), store the sums both
# there and in the stack slots, then reload the input pointer.
1104 | &mov ("esi",&DWP(96,"esp")); #ctx | |
1105 | #&mov ($AH[0],&DWP(0,"esp")); | |
1106 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
1107 | #&mov ("edi", &DWP(8,"esp")); | |
1108 | &mov ("ecx",&DWP(12,"esp")); | |
1109 | &add ($AH[0],&DWP(0,"esi")); | |
1110 | &add ($AH[1],&DWP(4,"esi")); | |
1111 | &add ("edi",&DWP(8,"esi")); | |
1112 | &add ("ecx",&DWP(12,"esi")); | |
1113 | &mov (&DWP(0,"esi"),$AH[0]); | |
1114 | &mov (&DWP(4,"esi"),$AH[1]); | |
1115 | &mov (&DWP(8,"esi"),"edi"); | |
1116 | &mov (&DWP(12,"esi"),"ecx"); | |
1117 | #&mov (&DWP(0,"esp"),$AH[0]); | |
1118 | &mov (&DWP(4,"esp"),$AH[1]); | |
1119 | &xor ($AH[1],"edi"); # magic | |
1120 | &mov (&DWP(8,"esp"),"edi"); | |
1121 | &mov (&DWP(12,"esp"),"ecx"); | |
1122 | #&mov ($E,&DWP(16,"esp")); | |
1123 | &mov ("edi",&DWP(20,"esp")); | |
1124 | &mov ("ecx",&DWP(24,"esp")); | |
1125 | &add ($E,&DWP(16,"esi")); | |
1126 | &add ("edi",&DWP(20,"esi")); | |
1127 | &add ("ecx",&DWP(24,"esi")); | |
1128 | &mov (&DWP(16,"esi"),$E); | |
1129 | &mov (&DWP(20,"esi"),"edi"); | |
1130 | &mov (&DWP(20,"esp"),"edi"); | |
1131 | &mov ("edi",&DWP(28,"esp")); | |
1132 | &mov (&DWP(24,"esi"),"ecx"); | |
1133 | #&mov (&DWP(16,"esp"),$E); | |
1134 | &add ("edi",&DWP(28,"esi")); | |
1135 | &mov (&DWP(24,"esp"),"ecx"); | |
1136 | &mov (&DWP(28,"esi"),"edi"); | |
1137 | &mov (&DWP(28,"esp"),"edi"); | |
1138 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1139 | ||
# Reload $t3 (overwritten by the vpaddd in the load phase) before K is
# rewound, then loop while the input pointer is below the value at 96+8(esp).
1140 | &vmovdqa ($t3,&QWP(64,$K256)); | |
1141 | &sub ($K256,3*64); # rewind K | |
1142 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
1143 | &jb (&label("grand_avx")); | |
1144 | ||
1145 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
1146 | &vzeroall (); | |
1147 | &function_end_A(); | |
32213d8d AP |
1148 | if ($avx>1) { |
# bodyx_00_15: variant of the scalar round generator used by the AVX_BMI
# path.  It returns one round as a list of perlasm statement strings built on
# rorx/andn, with lea used for two of the additions; callers eval them.
# As in body_00_15, the final '&lea' is joined with '.' to the
# '@AH = reverse(@AH); $i++;' bookkeeping, so evaluating the last element
# also advances the generator state.  Statements here carry no trailing ';'
# inside the string except where shown; each is eval'ed on its own.
1149 | sub bodyx_00_15 () { # +10% | |
1150 | ( | |
1151 | '&rorx ("ecx",$E,6)', | |
1152 | '&rorx ("esi",$E,11)', | |
1153 | '&mov (&off($e),$E)', # save $E, modulo-scheduled | |
1154 | '&rorx ("edi",$E,25)', | |
1155 | '&xor ("ecx","esi")', | |
1156 | '&andn ("esi",$E,&off($g))', | |
1157 | '&xor ("ecx","edi")', # Sigma1(e) | |
1158 | '&and ($E,&off($f))', | |
1159 | '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled | |
1160 | '&or ($E,"esi")', # T = Ch(e,f,g) | |
1161 | ||
1162 | '&rorx ("edi",$AH[0],2)', | |
1163 | '&rorx ("esi",$AH[0],13)', | |
1164 | '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) | |
1165 | '&rorx ("ecx",$AH[0],22)', | |
1166 | '&xor ("esi","edi")', | |
1167 | '&mov ("edi",&off($b))', | |
1168 | '&xor ("ecx","esi")', # Sigma0(a) | |
1169 | ||
1170 | '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round | |
1171 | '&add ($E,&off($h))', # T += h | |
1172 | '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) | |
1173 | '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] | |
1174 | '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) | |
1175 | ||
1176 | '&add ("ecx",$E)', # h += T | |
1177 | '&add ($E,&off($d))', # d += T | |
1178 | '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a) | |
1179 | ||
1180 | '@AH = reverse(@AH); $i++;' # rotate(a,h) | |
1181 | ); | |
f889bb03 | 1182 | } |
32213d8d AP |
1183 | |
# AVX_BMI path (generated only when $avx>1).  The setup, input load and
# epilogue mirror the plain AVX path statement for statement; the differences
# are the label names and the use of bodyx_00_15 as the round generator.
1184 | &set_label("AVX_BMI",32); | |
1185 | &lea ("esp",&DWP(-96,"esp")); | |
1186 | &vzeroall (); | |
1187 | # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack | |
1188 | &mov ($AH[0],&DWP(0,"esi")); | |
1189 | &mov ($AH[1],&DWP(4,"esi")); | |
1190 | &mov ("ecx",&DWP(8,"esi")); | |
1191 | &mov ("edi",&DWP(12,"esi")); | |
1192 | #&mov (&DWP(0,"esp"),$AH[0]); | |
1193 | &mov (&DWP(4,"esp"),$AH[1]); | |
1194 | &xor ($AH[1],"ecx"); # magic | |
1195 | &mov (&DWP(8,"esp"),"ecx"); | |
1196 | &mov (&DWP(12,"esp"),"edi"); | |
1197 | &mov ($E,&DWP(16,"esi")); | |
1198 | &mov ("edi",&DWP(20,"esi")); | |
1199 | &mov ("ecx",&DWP(24,"esi")); | |
1200 | &mov ("esi",&DWP(28,"esi")); | |
1201 | #&mov (&DWP(16,"esp"),$E); | |
1202 | &mov (&DWP(20,"esp"),"edi"); | |
1203 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1204 | &mov (&DWP(24,"esp"),"ecx"); | |
1205 | &mov (&DWP(28,"esp"),"esi"); | |
1206 | &vmovdqa ($t3,&QWP(256,$K256)); | |
1207 | &jmp (&label("grand_avx_bmi")); | |
1208 | ||
1209 | &set_label("grand_avx_bmi",32); | |
1210 | # load input, reverse byte order, add K256[0..15], save to stack | |
1211 | &vmovdqu (@X[0],&QWP(0,"edi")); | |
1212 | &vmovdqu (@X[1],&QWP(16,"edi")); | |
1213 | &vmovdqu (@X[2],&QWP(32,"edi")); | |
1214 | &vmovdqu (@X[3],&QWP(48,"edi")); | |
1215 | &add ("edi",64); | |
1216 | &vpshufb (@X[0],@X[0],$t3); | |
1217 | &mov (&DWP(96+4,"esp"),"edi"); | |
1218 | &vpshufb (@X[1],@X[1],$t3); | |
1219 | &vpshufb (@X[2],@X[2],$t3); | |
1220 | &vpaddd ($t0,@X[0],&QWP(0,$K256)); | |
1221 | &vpshufb (@X[3],@X[3],$t3); | |
1222 | &vpaddd ($t1,@X[1],&QWP(16,$K256)); | |
1223 | &vpaddd ($t2,@X[2],&QWP(32,$K256)); | |
1224 | &vpaddd ($t3,@X[3],&QWP(48,$K256)); | |
1225 | &vmovdqa (&QWP(32+0,"esp"),$t0); | |
1226 | &vmovdqa (&QWP(32+16,"esp"),$t1); | |
1227 | &vmovdqa (&QWP(32+32,"esp"),$t2); | |
1228 | &vmovdqa (&QWP(32+48,"esp"),$t3); | |
1229 | &jmp (&label("avx_bmi_00_47")); | |
1230 | ||
1231 | &set_label("avx_bmi_00_47",16); | |
1232 | &add ($K256,64); | |
1233 | ||
# Same driver as the AVX path, but with the rorx-based round generator; the
# rorx pairing test inside AVX_00_47 only triggers for these strings.
1234 | for ($i=0,$j=0; $j<4; $j++) { | |
1235 | &AVX_00_47($j,\&bodyx_00_15,@X); | |
1236 | push(@X,shift(@X)); # rotate(@X) | |
1237 | } | |
1238 | &cmp (&DWP(16*$j,$K256),0x00010203); | |
1239 | &jne (&label("avx_bmi_00_47")); | |
1240 | ||
1241 | for ($i=0; $i<16; ) { | |
1242 | foreach(bodyx_00_15()) { eval; } | |
1243 | } | |
1244 | ||
1245 | &mov ("esi",&DWP(96,"esp")); #ctx | |
1246 | #&mov ($AH[0],&DWP(0,"esp")); | |
1247 | &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); | |
1248 | #&mov ("edi", &DWP(8,"esp")); | |
1249 | &mov ("ecx",&DWP(12,"esp")); | |
1250 | &add ($AH[0],&DWP(0,"esi")); | |
1251 | &add ($AH[1],&DWP(4,"esi")); | |
1252 | &add ("edi",&DWP(8,"esi")); | |
1253 | &add ("ecx",&DWP(12,"esi")); | |
1254 | &mov (&DWP(0,"esi"),$AH[0]); | |
1255 | &mov (&DWP(4,"esi"),$AH[1]); | |
1256 | &mov (&DWP(8,"esi"),"edi"); | |
1257 | &mov (&DWP(12,"esi"),"ecx"); | |
1258 | #&mov (&DWP(0,"esp"),$AH[0]); | |
1259 | &mov (&DWP(4,"esp"),$AH[1]); | |
1260 | &xor ($AH[1],"edi"); # magic | |
1261 | &mov (&DWP(8,"esp"),"edi"); | |
1262 | &mov (&DWP(12,"esp"),"ecx"); | |
1263 | #&mov ($E,&DWP(16,"esp")); | |
1264 | &mov ("edi",&DWP(20,"esp")); | |
1265 | &mov ("ecx",&DWP(24,"esp")); | |
1266 | &add ($E,&DWP(16,"esi")); | |
1267 | &add ("edi",&DWP(20,"esi")); | |
1268 | &add ("ecx",&DWP(24,"esi")); | |
1269 | &mov (&DWP(16,"esi"),$E); | |
1270 | &mov (&DWP(20,"esi"),"edi"); | |
1271 | &mov (&DWP(20,"esp"),"edi"); | |
1272 | &mov ("edi",&DWP(28,"esp")); | |
1273 | &mov (&DWP(24,"esi"),"ecx"); | |
1274 | #&mov (&DWP(16,"esp"),$E); | |
1275 | &add ("edi",&DWP(28,"esi")); | |
1276 | &mov (&DWP(24,"esp"),"ecx"); | |
1277 | &mov (&DWP(28,"esi"),"edi"); | |
1278 | &mov (&DWP(28,"esp"),"edi"); | |
1279 | &mov ("edi",&DWP(96+4,"esp")); # inp | |
1280 | ||
1281 | &vmovdqa ($t3,&QWP(64,$K256)); | |
1282 | &sub ($K256,3*64); # rewind K | |
1283 | &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? | |
1284 | &jb (&label("grand_avx_bmi")); | |
1285 | ||
1286 | &mov ("esp",&DWP(96+12,"esp")); # restore sp | |
1287 | &vzeroall (); | |
1288 | &function_end_A(); | |
# Closes "if ($avx>1)", then "if ($avx)"; the "}}}" line closes scopes that
# were opened earlier in the file.
1289 | } | |
1290 | } | |
1291 | }}} | |
# Close the function opened for sha256_block_data_order and let the perlasm
# framework finish the output (asm_finish).
ee0449b1 | 1292 | &function_end_B("sha256_block_data_order");
ee0449b1 AP |
1293 | |
1294 | &asm_finish(); | |
e87e380a RL |
1295 | |
# The generated assembly goes to STDOUT.  Buffered write errors (full disk,
# closed pipe) only surface when the handle is closed, so the result of
# close must be checked; otherwise a truncated file looks like success.
1296 | close STDOUT or die "error closing STDOUT: $!";