]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
500b5a18 AP |
9 | |
10 | # ==================================================================== | |
35c77b73 | 11 | # [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
f0f61f6d AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
500b5a18 AP |
15 | # ==================================================================== |
16 | ||
17 | # "[Re]written" was achieved in two major overhauls. In 2004 BODY_* | |
18 | # functions were re-implemented to address P4 performance issue [see | |
19 | # commentary below], and in 2006 the rest was rewritten in order to | |
20 | # gain freedom to liberate licensing terms. | |
58964a49 | 21 | |
c372482c AP |
22 | # January, September 2004. |
23 | # | |
30cb9ec7 AP |
24 | # It was noted that Intel IA-32 C compiler generates code which |
25 | # performs ~30% *faster* on P4 CPU than original *hand-coded* | |
26 | # SHA1 assembler implementation. To address this problem (and | |
27 | # prove that humans are still better than machines:-), the | |
28 | # original code was overhauled, which resulted in following | |
29 | # performance changes: | |
30 | # | |
31 | # compared with original compared with Intel cc | |
32 | # assembler impl. generated code | |
c29ef588 | 33 | # Pentium -16% +48% |
30cb9ec7 AP |
34 | # PIII/AMD +8% +16% |
35 | # P4 +85%(!) +45% | |
36 | # | |
37 | # As you can see Pentium came out as loser:-( Yet I reckoned that | |
38 | # improvement on P4 outweighs the loss and incorporated this | |
39 | # re-tuned code to 0.9.7 and later. | |
40 | # ---------------------------------------------------------------- | |
30cb9ec7 AP |
41 | # <appro@fy.chalmers.se> |
42 | ||
c372482c AP |
43 | # August 2009. |
44 | # | |
45 | # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as | |
46 | # '(c&d) + (b&(c^d))', which allows to accumulate partial results | |
47 | # and lighten "pressure" on scratch registers. This resulted in | |
48 | # >12% performance improvement on contemporary AMD cores (with no | |
49 | # degradation on other CPUs:-). Also, the code was revised to maximize | |
50 | # "distance" between instructions producing input to 'lea' instruction | |
51 | # and the 'lea' instruction itself, which is essential for Intel Atom | |
0c149802 AP |
52 | # core and resulted in ~15% improvement. |
53 | ||
54 | # October 2010. | |
55 | # | |
56 | # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | |
57 | # is to offload message schedule denoted by Wt in NIST specification, | |
58 | # or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, | |
59 | # and in SSE2 context was first explored by Dean Gaudet in 2004, see | |
60 | # http://arctic.org/~dean/crypto/sha1.html. Since then several things | |
61 | # have changed that made it interesting again: | |
62 | # | |
63 | # a) XMM units became faster and wider; | |
64 | # b) instruction set became more versatile; | |
65 | # c) an important observation was made by Max Locktyukhin, which made | |
66 | # it possible to reduce amount of instructions required to perform | |
67 | # the operation in question, for further details see | |
68 | # http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. | |
69 | ||
70 | # April 2011. | |
71 | # | |
72 | # Add AVX code path, probably most controversial... The thing is that | |
73 | # switch to AVX alone improves performance by as little as 4% in | |
74 | # comparison to SSSE3 code path. But below result doesn't look like | |
75 | # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as | |
053fa39a | 76 | # pair of µ-ops, and it's the additional µ-ops, two per round, that |
0c149802 | 77 | # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded |
053fa39a | 78 | # as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with |
0c149802 AP |
79 | # equivalent 'sh[rl]d' that is responsible for the impressive 5.1 |
80 | # cycles per processed byte. But 'sh[rl]d' is not something that used | |
81 | # to be fast, nor does it appear to be fast in upcoming Bulldozer | |
82 | # [according to its optimization manual]. Which is why AVX code path | |
83 | # is guarded by *both* AVX and synthetic bit denoting Intel CPUs. | |
84 | # One can argue that it's unfair to AMD, but without 'sh[rl]d' it | |
85 | # makes no sense to keep the AVX code path. If somebody feels that | |
86 | # strongly, it's probably more appropriate to discuss possibility of | |
87 | # using vector rotate XOP on AMD... | |
88 | ||
619b9466 AP |
89 | # March 2014. |
90 | # | |
91 | # Add support for Intel SHA Extensions. | |
92 | ||
0c149802 AP |
93 | ###################################################################### |
94 | # Current performance is summarized in following table. Numbers are | |
95 | # CPU clock cycles spent to process single byte (less is better). | |
96 | # | |
97 | # x86 SSSE3 AVX | |
98 | # Pentium 15.7 - | |
99 | # PIII 11.5 - | |
100 | # P4 10.6 - | |
101 | # AMD K8 7.1 - | |
35c77b73 | 102 | # Core2 7.3 6.0/+22% - |
69f45c52 | 103 | # Westmere 7.3 5.5/+33% - |
35c77b73 | 104 | # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% |
69f45c52 | 105 | # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% |
b217ca63 | 106 | # Haswell 6.5 4.3/+51% 4.1(**)/+58% |
a30b0522 | 107 | # Skylake 6.4 4.1/+55% 4.1(**)/+55% |
69f45c52 | 108 | # Bulldozer 11.6 6.0/+92% |
b217ca63 | 109 | # VIA Nano 10.6 7.5/+41% |
b59f92e7 AP |
110 | # Atom 12.5 9.3(*)/+35% |
111 | # Silvermont 14.5 9.9(*)/+46% | |
a30b0522 | 112 | # Goldmont 8.8 6.7/+30% 1.7(***)/+415% |
0c149802 AP |
113 | # |
114 | # (*) Loop is 1056 instructions long and expected result is ~8.25. | |
b59f92e7 AP |
115 | # The discrepancy is because of front-end limitations, so |
116 | # called MS-ROM penalties, and on Silvermont even rotate's | |
117 | # limited parallelism. | |
0c149802 AP |
118 | # |
119 | # (**) As per above comment, the result is for AVX *plus* sh[rl]d. | |
a30b0522 AP |
120 | # |
121 | # (***) SHAEXT result | |
c372482c | 122 | |
f0f61f6d AP |
# Derive the directory this script lives in, so that the perlasm helper
# library is found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; STDOUT is
# redirected into it.  A failed open is fatal: carrying on would send the
# generated code to the inherited STDOUT and leave the requested file
# missing or stale without any diagnostic.
$output=pop;
if (defined($output)) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

# First argument selects the assembler flavour; a trailing "386" argument
# is passed on to asm_init as a boolean flag.
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
58964a49 | 131 | |
0c149802 AP |
# Feature switches:
#   $xmm    - set when -DOPENSSL_IA32_SSE2 appears on the command line;
#   $ymm    - set when the assembler in use is found to be recent enough
#             to encode AVX (only probed when $xmm is set);
#   $shaext - follows $xmm, enables the SHA Extensions code path.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

# GNU assembler, probed through the compiler driver.
$ymm=1 if ($xmm &&
        `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
            =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
        $1>=2.19);  # first version supporting AVX

# NASM, only for the win32n flavour.
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
        $1>=2.03);  # first version supporting AVX

# Microsoft ml, only for the win32 flavour.
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
        `ml 2>&1` =~ /Version ([0-9]+)\./ &&
        $1>=10);    # first version supporting AVX

# clang/LLVM.  The major version is matched as [0-9]+ rather than a single
# [3-9] digit: with the single digit, "clang version 10.0.0" and every
# later release failed to match and AVX was left disabled.
$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
        $2>=3.0);   # first version supporting AVX

$shaext=$xmm;   ### set to zero if compiling for 1.0.1

&external_label("OPENSSL_ia32cap_P") if ($xmm);
154 | ||
155 | ||
# Register assignment for the integer-only code path: $A..$E and $T are
# handed to the BODY_* generators through @V, $tmp1 is the scratch
# register those generators use directly.
($A,$B,$C,$D,$E,$T,$tmp1)=("eax","ebx","ecx","edx","edi","esi","ebp");

@V=($A,$B,$C,$D,$E,$T);

$alt=0; # 1 denotes alternative IALU implementation, which performs
        # 8% *worse* on P4, same on Westmere and Atom, 2% better on
        # Sandy Bridge...
169 | ||
58964a49 RE |
# Emit one round of the 00..15 group.
#   $n               round number
#   $a,$b,$c,$d,$e   registers playing a,b,c,d,e in this round
#   $f               sixth register of the ring (see in-line notes)
# The round's message word is read from stack slot $n%16 via swtmp().
sub BODY_00_15
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    my $first=($n==0);
    my $last=($n==15);

    &comment("00_15 $n");

    &mov($f,$c);                    # f to hold F_00_19(b,c,d)
    if ($first) { &mov($tmp1,$a); }
    else        { &mov($a,$tmp1); }
    &rotl($tmp1,5);                 # tmp1=ROTATE(a,5)
    &xor($f,$d);
    &add($tmp1,$e);                 # tmp1+=e;
    &mov($e,&swtmp($n%16));         # e becomes volatile and is loaded
                                    # with xi, also note that e becomes
                                    # f in next round...
    &and($f,$b);
    &rotr($b,2);                    # b=ROTATE(b,30)
    &xor($f,$d);                    # f holds F_00_19(b,c,d)
    &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));  # tmp1+=K_00_19+xi

    if ($last) {
        &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
        &add($f,$tmp1);             # f+=tmp1
    } else {
        &add($tmp1,$f);             # f becomes a in next round
    }
    if ($alt && $last) { &mov($tmp1,$a); }
}
195 | ||
# Emit one round of the 16..19 group: same round function and constant as
# 00..15, but the message word is first recomputed (Xupdate) from stack
# slots ($n+2)%16, ($n+8)%16 and ($n+13)%16 and stored back to slot $n%16.
# Arguments are as for BODY_00_15.  $alt selects the alternative
# instruction schedule.
sub BODY_16_19
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    my ($xi,$xa,$xb,$xc,$xnext)=map { ($n+$_)%16 } (0,2,8,13,1);

    &comment("16_19 $n");

    if ($alt) {
        my $rot=($n==16) ? 2 : 7;

        &xor($c,$d);
        &xor($f,&swtmp($xa));       # f to hold Xupdate(xi,xa,xb,xc,xd)
        &and($tmp1,$c);             # tmp1 to hold F_00_19(b,c,d), b&=c^d
        &xor($f,&swtmp($xb));
        &xor($tmp1,$d);             # tmp1=F_00_19(b,c,d)
        &xor($f,&swtmp($xc));       # f holds xa^xb^xc^xd
        &rotl($f,1);                # f=ROTATE(f,1)
        &add($e,$tmp1);             # e+=F_00_19(b,c,d)
        &xor($c,$d);                # restore $c
        &mov($tmp1,$a);             # b in next round
        &rotr($b,$rot);             # b=ROTATE(b,30)
        &mov(&swtmp($xi),$f);       # xi=f
        &rotl($a,5);                # ROTATE(a,5)
        &lea($f,&DWP(0x5a827999,$f,$e));    # f+=F_00_19(b,c,d)+e
        &mov($e,&swtmp($xnext));    # pre-fetch f for next round
        &add($f,$a);                # f+=ROTATE(a,5)
    } else {
        &mov($tmp1,$c);             # tmp1 to hold F_00_19(b,c,d)
        &xor($f,&swtmp($xa));       # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);
        &xor($f,&swtmp($xb));
        &and($tmp1,$b);
        &xor($f,&swtmp($xc));       # f holds xa^xb^xc^xd
        &rotl($f,1);                # f=ROTATE(f,1)
        &xor($tmp1,$d);             # tmp1=F_00_19(b,c,d)
        &add($e,$tmp1);             # e+=F_00_19(b,c,d)
        &mov($tmp1,$a);
        &rotr($b,2);                # b=ROTATE(b,30)
        &mov(&swtmp($xi),$f);       # xi=f
        &rotl($tmp1,5);             # ROTATE(a,5)
        &lea($f,&DWP(0x5a827999,$f,$e));    # f+=F_00_19(b,c,d)+e
        &mov($e,&swtmp($xnext));    # pre-fetch f for next round
        &add($f,$tmp1);             # f+=ROTATE(a,5)
    }
}
238 | ||
# Emit one round that uses the b^c^d round function.  The additive
# constant is 0x6ed9eba1 for $n<40 and 0xca62c1d6 otherwise.  The message
# word is stored back to its stack slot only while $n<77, and the next
# word is pre-fetched only while $n<79.  Arguments are as for BODY_00_15.
sub BODY_20_39
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
    my ($xi,$xa,$xb,$xc,$xnext)=map { ($n+$_)%16 } (0,2,8,13,1);
    my $store=($n<77);
    my $prefetch=($n<79);

    &comment("20_39 $n");

    if ($alt) {
        &xor($tmp1,$c);             # tmp1 to hold F_20_39(b,c,d), b^=c
        &xor($f,&swtmp($xa));       # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);             # tmp1 holds F_20_39(b,c,d)
        &xor($f,&swtmp($xb));
        &add($e,$tmp1);             # e+=F_20_39(b,c,d)
        &xor($f,&swtmp($xc));       # f holds xa^xb^xc^xd
        &rotl($f,1);                # f=ROTATE(f,1)
        &mov($tmp1,$a);             # b in next round
        &rotr($b,7);                # b=ROTATE(b,30)
        if ($store) { &mov(&swtmp($xi),$f); }   # xi=f
        &rotl($a,5);                # ROTATE(a,5)
        if ($n==39) {               # warm up for BODY_40_59
            &xor($b,$c);
            &and($tmp1,$b);
        }
        &lea($f,&DWP($K,$f,$e));    # f+=e+K_XX_YY
        if ($prefetch) { &mov($e,&swtmp($xnext)); } # pre-fetch f for next round
        &add($f,$a);                # f+=ROTATE(a,5)
        if ($n==79) { &rotr($a,5); }
    } else {
        &mov($tmp1,$b);             # tmp1 to hold F_20_39(b,c,d)
        &xor($f,&swtmp($xa));       # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$c);
        &xor($f,&swtmp($xb));
        &xor($tmp1,$d);             # tmp1 holds F_20_39(b,c,d)
        &xor($f,&swtmp($xc));       # f holds xa^xb^xc^xd
        &rotl($f,1);                # f=ROTATE(f,1)
        &add($e,$tmp1);             # e+=F_20_39(b,c,d)
        &rotr($b,2);                # b=ROTATE(b,30)
        &mov($tmp1,$a);
        &rotl($tmp1,5);             # ROTATE(a,5)
        if ($store) { &mov(&swtmp($xi),$f); }   # xi=f
        &lea($f,&DWP($K,$f,$e));    # f+=e+K_XX_YY
        if ($prefetch) { &mov($e,&swtmp($xnext)); } # pre-fetch f for next round
        &add($f,$tmp1);             # f+=ROTATE(a,5)
    }
}
282 | ||
# Emit one round of the 40..59 group.  The round function is accumulated
# as two partial sums, b&(c^d) and c&d, added separately; the additive
# constant is 0x8f1bbcdc.  Arguments are as for BODY_00_15.
sub BODY_40_59
{
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    my ($xi,$xa,$xb,$xc,$xnext)=map { ($n+$_)%16 } (0,2,8,13,1);

    &comment("40_59 $n");

    if ($alt) {
        &add($e,$tmp1);             # e+=b&(c^d)
        &xor($f,&swtmp($xa));       # f to hold Xupdate(xi,xa,xb,xc,xd)
        &mov($tmp1,$d);
        &xor($f,&swtmp($xb));
        &xor($c,$d);                # restore $c
        &xor($f,&swtmp($xc));       # f holds xa^xb^xc^xd
        &rotl($f,1);                # f=ROTATE(f,1)
        &and($tmp1,$c);
        &rotr($b,7);                # b=ROTATE(b,30)
        &add($e,$tmp1);             # e+=c&d
        &mov($tmp1,$a);             # b in next round
        &mov(&swtmp($xi),$f);       # xi=f
        &rotl($a,5);                # ROTATE(a,5)
        if ($n<59) {
            &xor($b,$c);
            &and($tmp1,$b);         # tmp1 to hold F_40_59(b,c,d)
        }
        &lea($f,&DWP(0x8f1bbcdc,$f,$e));    # f+=K_40_59+e+(b&(c^d))
        &mov($e,&swtmp($xnext));    # pre-fetch f for next round
        &add($f,$a);                # f+=ROTATE(a,5)
    } else {
        &mov($tmp1,$c);             # tmp1 to hold F_40_59(b,c,d)
        &xor($f,&swtmp($xa));       # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);
        &xor($f,&swtmp($xb));
        &and($tmp1,$b);
        &xor($f,&swtmp($xc));       # f holds xa^xb^xc^xd
        &rotl($f,1);                # f=ROTATE(f,1)
        &add($tmp1,$e);             # b&(c^d)+=e
        &rotr($b,2);                # b=ROTATE(b,30)
        &mov($e,$a);                # e becomes volatile
        &rotl($e,5);                # ROTATE(a,5)
        &mov(&swtmp($xi),$f);       # xi=f
        &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1)); # f+=K_40_59+e+(b&(c^d))
        &mov($tmp1,$c);
        &add($f,$e);                # f+=ROTATE(a,5)
        &and($tmp1,$d);
        &mov($e,&swtmp($xnext));    # pre-fetch f for next round
        &add($f,$tmp1);             # f+=c&d
    }
}
329 | ||
# Emit sha1_block_data_order: an optional run-time dispatch to the SIMD
# code paths (when $xmm is set), followed by the integer-only
# implementation that processes the input one 64-byte chunk per loop
# iteration.
&function_begin("sha1_block_data_order");
if ($xmm) {
    my $SSSE3=1<<9;
    my $FXSR=1<<24;
    my $SHA=1<<29;
    my $AVX=1<<28;
    my $INTEL=1<<30;

    if ($shaext) { &static_label("shaext_shortcut"); }
    &static_label("ssse3_shortcut");
    if ($ymm)    { &static_label("avx_shortcut"); }
    &static_label("K_XX_XX");

    &call   (&label("pic_point"));  # make it PIC!
    &set_label("pic_point");
    &blindpop($tmp1);
    &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
    &lea    ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));

    &mov    ($A,&DWP(0,$T));
    &mov    ($D,&DWP(4,$T));
    &test   ($D,$SSSE3);            # check SSSE3 bit
    &jz     (&label("x86"));
    &mov    ($C,&DWP(8,$T));
    &test   ($A,$FXSR);             # check FXSR bit
    &jz     (&label("x86"));
    if ($shaext) {
        &test   ($C,$SHA);          # check SHA bit
        &jnz    (&label("shaext_shortcut"));
    }
    if ($ymm) {
        &and    ($D,$AVX);          # mask AVX bit
        &and    ($A,$INTEL);        # mask "Intel CPU" bit
        &or     ($A,$D);
        &cmp    ($A,$AVX|$INTEL);
        &je     (&label("avx_shortcut"));
    }
    &jmp    (&label("ssse3_shortcut"));
    &set_label("x86",16);
}
&mov($tmp1,&wparam(0));     # SHA_CTX *c
&mov($T,&wparam(1));        # const void *input
&mov($A,&wparam(2));        # size_t num
&stack_push(16+3);          # allocate X[16]
&shl($A,6);
&add($A,$T);
&mov(&wparam(2),$A);        # pointer beyond the end of input
&mov($E,&DWP(16,$tmp1));    # pre-load E
&jmp(&label("loop"));

&set_label("loop",16);

# copy input chunk to X, but reversing byte order!
# Four words at a time: load all four, byte-swap all four, store all four.
for ($i=0; $i<16; $i+=4) {
    foreach my $k (0..3) { &mov(($A,$B,$C,$D)[$k],&DWP(4*($i+$k),$T)); }
    foreach my $k (0..3) { &bswap(($A,$B,$C,$D)[$k]); }
    foreach my $k (0..3) { &mov(&swtmp($i+$k),($A,$B,$C,$D)[$k]); }
}
&mov(&wparam(1),$T);        # redundant in 1st spin

# load SHA_CTX; E is pre-loaded
foreach my $k (0..3) { &mov(($A,$B,$C,$D)[$k],&DWP(4*$k,$tmp1)); }

# 80 rounds; the register ring @V is rotated by one after every round.
for ($i=0; $i<80; $i++) {
    if    ($i<16) { &BODY_00_15($i,@V); }
    elsif ($i<20) { &BODY_16_19($i,@V); }
    elsif ($i<40) { &BODY_20_39($i,@V); }
    elsif ($i<60) { &BODY_40_59($i,@V); }
    else          { &BODY_20_39($i,@V); }
    unshift(@V,pop(@V));
}

die unless ($V[5] eq $D and $V[0] eq $E);   # double-check

&mov($tmp1,&wparam(0));     # re-load SHA_CTX*
&mov($D,&wparam(1));        # D is last "T" and is discarded

# E is last "A"...
foreach my $k (0..4) { &add(($E,$T,$A,$B,$C)[$k],&DWP(4*$k,$tmp1)); }

&mov(&DWP(0,$tmp1),$E);     # update SHA_CTX
&add($D,64);                # advance input pointer
&mov(&DWP(4,$tmp1),$T);
&cmp($D,&wparam(2));        # have we reached the end yet?
&mov(&DWP(8,$tmp1),$A);
&mov($E,$C);                # C is last "E" which needs to be "pre-loaded"
&mov(&DWP(12,$tmp1),$B);
&mov($T,$D);                # input pointer
&mov(&DWP(16,$tmp1),$C);
&jb(&label("loop"));

&stack_pop(16+3);
&function_end("sha1_block_data_order");
0c149802 AP |
430 | |
431 | if ($xmm) { | |
977f32e8 | 432 | if ($shaext) { |
619b9466 AP |
433 | ###################################################################### |
434 | # Intel SHA Extensions implementation of SHA1 update function. | |
435 | # | |
436 | my ($ctx,$inp,$num)=("edi","esi","ecx"); | |
437 | my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3)); | |
438 | my @MSG=map("xmm$_",(4..7)); | |
439 | ||
# Hand-encoders for the SHA instructions: each one emits the instruction
# as raw bytes through data_byte().  Only the register-to-register form
# with xmm0..xmm7 operands can be encoded.  Any other operand is a fatal
# error: silently emitting nothing would drop an instruction from the
# generated code.

# sha1rnds4 $dst,$src,$imm  ->  0F 3A CC /r ib
sub sha1rnds4 {
    my ($dst,$src,$imm)=@_;
    "$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/
        or die "sha1rnds4: unsupported operands '$dst,$src'";
    &data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm);
}

# Common encoder for the 0F 38 xx /r instructions; $opcodelet is the
# third opcode byte.
sub sha1op38 {
    my ($opcodelet,$dst,$src)=@_;
    "$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/
        or die sprintf("sha1op38(0x%02x): unsupported operands '%s,%s'",
                       $opcodelet,$dst,$src);
    &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);
}
sub sha1nexte   { sha1op38(0xc8,@_); }
sub sha1msg1    { sha1op38(0xc9,@_); }
sub sha1msg2    { sha1op38(0xca,@_); }
453 | ||
# Emit _sha1_block_data_order_shaext.  The dispatcher in
# sha1_block_data_order jumps straight to the "shaext_shortcut" label, so
# the PIC prologue above that label only runs on a direct call.
&function_begin("_sha1_block_data_order_shaext");
    &call   (&label("pic_point"));  # make it PIC!
    &set_label("pic_point");
    &blindpop($tmp1);
    &lea    ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("shaext_shortcut");
    &mov    ($ctx,&wparam(0));
    &mov    ("ebx","esp");          # original %esp, restored on exit
    &mov    ($inp,&wparam(1));
    &mov    ($num,&wparam(2));
    &sub    ("esp",32);

    &movdqu ($ABCD,&QWP(0,$ctx));
    &movd   ($E,&DWP(16,$ctx));
    &and    ("esp",-32);
    &movdqa ($BSWAP,&QWP(0x50,$tmp1));  # byte-n-word swap

    &movdqu (@MSG[0],&QWP(0,$inp));
    &pshufd ($ABCD,$ABCD,0b00011011);   # flip word order
    &movdqu (@MSG[1],&QWP(0x10,$inp));
    &pshufd ($E,$E,0b00011011);         # flip word order
    &movdqu (@MSG[2],&QWP(0x20,$inp));
    &pshufb (@MSG[0],$BSWAP);
    &movdqu (@MSG[3],&QWP(0x30,$inp));
    &pshufb (@MSG[1],$BSWAP);
    &pshufb (@MSG[2],$BSWAP);
    &pshufb (@MSG[3],$BSWAP);
    &jmp    (&label("loop_shaext"));

&set_label("loop_shaext",16);
    &dec        ($num);
    &lea        ("eax",&DWP(0x40,$inp));
    &movdqa     (&QWP(0,"esp"),$E);     # offload $E
    &paddd      ($E,@MSG[0]);
    &cmovne     ($inp,"eax");           # advance only if blocks remain
    &movdqa     (&QWP(16,"esp"),$ABCD); # offload $ABCD

# Each pass emits two sha1rnds4 (eight rounds) and rotates @MSG by two.
for($i=0;$i<20-4;$i+=2) {
    &sha1msg1   (@MSG[0],@MSG[1]);
    &movdqa     ($E_,$ABCD);
    &sha1rnds4  ($ABCD,$E,int($i/5));   # 0-3...
    &sha1nexte  ($E_,@MSG[1]);
    &pxor       (@MSG[0],@MSG[2]);
    &sha1msg1   (@MSG[1],@MSG[2]);
    &sha1msg2   (@MSG[0],@MSG[3]);

    &movdqa     ($E,$ABCD);
    &sha1rnds4  ($ABCD,$E_,int(($i+1)/5));
    &sha1nexte  ($E,@MSG[2]);
    &pxor       (@MSG[1],@MSG[3]);
    &sha1msg2   (@MSG[1],@MSG[0]);

    push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
}
    # Last 16 rounds, interleaved with loading and byte-swapping the
    # message words that the next loop iteration starts from.
    &movdqu     (@MSG[0],&QWP(0,$inp));
    &movdqa     ($E_,$ABCD);
    &sha1rnds4  ($ABCD,$E,3);           # 64-67
    &sha1nexte  ($E_,@MSG[1]);
    &movdqu     (@MSG[1],&QWP(0x10,$inp));
    &pshufb     (@MSG[0],$BSWAP);

    &movdqa     ($E,$ABCD);
    &sha1rnds4  ($ABCD,$E_,3);          # 68-71
    &sha1nexte  ($E,@MSG[2]);
    &movdqu     (@MSG[2],&QWP(0x20,$inp));
    &pshufb     (@MSG[1],$BSWAP);

    &movdqa     ($E_,$ABCD);
    &sha1rnds4  ($ABCD,$E,3);           # 72-75
    &sha1nexte  ($E_,@MSG[3]);
    &movdqu     (@MSG[3],&QWP(0x30,$inp));
    &pshufb     (@MSG[2],$BSWAP);

    &movdqa     ($E,$ABCD);
    &sha1rnds4  ($ABCD,$E_,3);          # 76-79
    &movdqa     ($E_,&QWP(0,"esp"));
    &pshufb     (@MSG[3],$BSWAP);
    &sha1nexte  ($E,$E_);
    &paddd      ($ABCD,&QWP(16,"esp"));

    &jnz        (&label("loop_shaext"));

    &pshufd ($ABCD,$ABCD,0b00011011);
    &pshufd ($E,$E,0b00011011);
    # The statement terminator below used to be missing, which made this
    # call and the following one parse as a single "movdqu(...) & movd(...)"
    # bitwise-AND expression.
    &movdqu (&QWP(0,$ctx),$ABCD);
    &movd   (&DWP(16,$ctx),$E);
    &mov    ("esp","ebx");
&function_end("_sha1_block_data_order_shaext");
542 | } | |
0c149802 AP |
543 | ###################################################################### |
544 | # The SSSE3 implementation. | |
545 | # | |
546 | # %xmm[0-7] are used as ring @X[] buffer containing quadruples of last | |
547 | # 32 elements of the message schedule or Xupdate outputs. First 4 | |
548 | # quadruples are simply byte-swapped input, next 4 are calculated | |
549 | # according to method originally suggested by Dean Gaudet (modulo | |
550 | # being implemented in SSSE3). Once 8 quadruples or 32 elements are | |
551 | # collected, it switches to routine proposed by Max Locktyukhin. | |
552 | # | |
553 | # Calculations inevitably require temporary registers, and there are | |
554 | # no %xmm registers left to spare. For this reason part of the ring | |
555 | # buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring | |
556 | # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - | |
557 | # X[-5], and X[4] - X[-4]... | |
558 | # | |
559 | # Another notable optimization is aggressive stack frame compression | |
560 | # aiming to minimize amount of 9-byte instructions... | |
561 | # | |
562 | # Yet another notable optimization is "jumping" $B variable. It means | |
563 | # that there is no register permanently allocated for $B value. This | |
564 | # allowed to eliminate one instruction from body_20_39... | |
565 | # | |
# Generator state for the SSSE3 code path.
my $Xi=4;                           # 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));     # pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;                            # hash round
my $rx=0;
my @T=($T,$tmp1);
my $inp;

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

&function_begin("_sha1_block_data_order_ssse3");
    &call   (&label("pic_point"));  # make it PIC!
    &set_label("pic_point");
    &blindpop($tmp1);
    &lea    ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("ssse3_shortcut");

    # Load the table at K_XX_XX: K_00_19, K_20_39, K_40_59, K_60_79 and
    # the pbswap mask, 16 bytes apart, into X[3], X[4], X[5], X[6], X[2].
    foreach my $load ([3,0],[4,16],[5,32],[6,48],[2,64]) {
        my ($slot,$offset)=@$load;
        &movdqa ($X[$slot],&QWP($offset,$tmp1));
    }

    &mov    ($E,&wparam(0));        # load argument block
    $inp=$T[1];
    &mov    ($inp,&wparam(1));
    &mov    ($D,&wparam(2));
    &mov    ($T[0],"esp");

    # stack frame layout
    #
    # +0    X[0]+K  X[1]+K  X[2]+K  X[3]+K  # XMM->IALU xfer area
    #       X[4]+K  X[5]+K  X[6]+K  X[7]+K
    #       X[8]+K  X[9]+K  X[10]+K X[11]+K
    #       X[12]+K X[13]+K X[14]+K X[15]+K
    #
    # +64   X[0]    X[1]    X[2]    X[3]    # XMM->XMM backtrace area
    #       X[4]    X[5]    X[6]    X[7]
    #       X[8]    X[9]    X[10]   X[11]   # even borrowed for K_00_19
    #
    # +112  K_20_39 K_20_39 K_20_39 K_20_39 # constants
    #       K_40_59 K_40_59 K_40_59 K_40_59
    #       K_60_79 K_60_79 K_60_79 K_60_79
    #       K_00_19 K_00_19 K_00_19 K_00_19
    #       pbswap mask
    #
    # +192  ctx                             # argument block
    # +196  inp
    # +200  end
    # +204  esp
    &sub    ("esp",208);
    &and    ("esp",-64);

    &movdqa (&QWP(112+0,"esp"),$X[4]);      # copy constants
    &movdqa (&QWP(112+16,"esp"),$X[5]);
    &movdqa (&QWP(112+32,"esp"),$X[6]);
    &shl    ($D,6);                         # len*64
    &movdqa (&QWP(112+48,"esp"),$X[3]);
    &add    ($D,$inp);                      # end of input
    &movdqa (&QWP(112+64,"esp"),$X[2]);
    &add    ($inp,64);
    &mov    (&DWP(192+0,"esp"),$E);         # save argument block
    &mov    (&DWP(192+4,"esp"),$inp);
    &mov    (&DWP(192+8,"esp"),$D);
    &mov    (&DWP(192+12,"esp"),$T[0]);     # save original %esp

    # load context; $E goes last because it also holds the context pointer
    foreach my $k (0..4) {
        &mov    (($A,$B,$C,$D,$E)[$k],&DWP(4*$k,$E));
    }
    &mov    ($T[0],$B);                     # magic seed

    # load input to %xmm[0-3]
    foreach my $k (0..3) {
        &movdqu ($X[($k-4)&7],&QWP(16*$k-64,$inp));
    }
    &pshufb ($X[-4&7],$X[2]);               # byte swap
    &pshufb ($X[-3&7],$X[2]);
    &pshufb ($X[-2&7],$X[2]);
    &movdqa (&QWP(112-16,"esp"),$X[3]);     # borrow last backtrace slot
    &pshufb ($X[-1&7],$X[2]);
    &paddd  ($X[-4&7],$X[3]);               # add K_00_19
    &paddd  ($X[-3&7],$X[3]);
    &paddd  ($X[-2&7],$X[3]);
    &movdqa (&QWP(0,"esp"),$X[-4&7]);       # X[]+K xfer to IALU
    &psubd  ($X[-4&7],$X[3]);               # restore X[]
    &movdqa (&QWP(0+16,"esp"),$X[-3&7]);
    &psubd  ($X[-3&7],$X[3]);
    &movdqa (&QWP(0+32,"esp"),$X[-2&7]);
    &mov    ($T[1],$C);
    &psubd  ($X[-2&7],$X[3]);
    &xor    ($T[1],$D);
    &pshufd ($X[0],$X[-4&7],0xee);          # was &movdqa (@X[0],@X[-3&7]);
    &and    ($T[0],$T[1]);
    &jmp    (&label("loop"));
662 | ||
663 | ###################################################################### | |
69687aa8 | 664 | # SSE instruction sequence is first broken to groups of independent |
0c149802 AP |
665 | # instructions, independent in respect to their inputs and shifter |
666 | # (not all architectures have more than one). Then IALU instructions | |
667 | # are "knitted in" between the SSE groups. Distance is maintained for | |
668 | # SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer | |
669 | # [which allegedly also implements SSSE3]... | |
670 | # | |
671 | # Temporary registers usage. X[2] is volatile at the entry and at the | |
672 | # end is restored from backtrace ring buffer. X[3] is expected to | |
69687aa8 | 673 | # contain current K_XX_XX constant and is used to calculate X[-1]+K |
0c149802 AP |
674 | # from previous round, it becomes volatile the moment the value is |
675 | # saved to stack for transfer to IALU. X[4] becomes volatile whenever | |
676 | # X[-4] is accumulated and offloaded to backtrace ring buffer, at the | |
677 | # end it is loaded with next K_XX_XX [which becomes X[3] in next | |
678 | # round]... | |
679 | # | |
# Xupdate_ssse3_16_31($body)
#
# $body is a code reference.  It is invoked four times and everything it
# returns is collected in @insns; those entries are then eval'ed a few at a
# time in between the SIMD instructions emitted below, and whatever is left
# over is eval'ed at the end.  On exit $Xi is incremented and the @X
# register ring is rotated by one position.
# NOTE(review): what the entries returned by $body contain is not visible
# here; presumably each one emits a single integer-unit instruction.
69687aa8 | 680 | sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 |
0c149802 AP |
681 | { use integer; |
682 | my $body = shift; | |
683 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | |
684 | my ($a,$b,$c,$d,$e); | |
# NOTE(review): $a..$e are declared but never assigned in this sub;
# presumably the eval'ed entries refer to them -- TODO confirm.
685 | ||
b217ca63 | 686 | eval(shift(@insns)); # ror |
0c149802 AP |
687 | eval(shift(@insns)); |
688 | eval(shift(@insns)); | |
b217ca63 | 689 | &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); |
0c149802 AP |
690 | &movdqa (@X[2],@X[-1&7]); |
691 | eval(shift(@insns)); | |
692 | eval(shift(@insns)); | |
693 | ||
694 | &paddd (@X[3],@X[-1&7]); | |
695 | &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer | |
b217ca63 | 696 | eval(shift(@insns)); # rol |
0c149802 AP |
697 | eval(shift(@insns)); |
698 | &psrldq (@X[2],4); # "X[-3]", 3 dwords | |
699 | eval(shift(@insns)); | |
700 | eval(shift(@insns)); | |
701 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | |
702 | eval(shift(@insns)); | |
b217ca63 | 703 | eval(shift(@insns)); # ror |
0c149802 AP |
704 | |
705 | &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" | |
706 | eval(shift(@insns)); | |
707 | eval(shift(@insns)); | |
708 | eval(shift(@insns)); | |
0c149802 AP |
709 | |
710 | &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" | |
711 | eval(shift(@insns)); | |
b217ca63 | 712 | eval(shift(@insns)); # rol |
0c149802 AP |
713 | &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU |
714 | eval(shift(@insns)); | |
715 | eval(shift(@insns)); | |
716 | ||
# The rotate-left by one is built from paddd (doubling, i.e. <<1), psrld 31
# and por; the dword extracted with pslldq is rotated left by two
# (psrld 30 / pslld 2) and xor-ed in.
717 | &movdqa (@X[4],@X[0]); | |
0c149802 AP |
718 | eval(shift(@insns)); |
719 | eval(shift(@insns)); | |
b217ca63 AP |
720 | eval(shift(@insns)); # ror |
721 | &movdqa (@X[2],@X[0]); | |
0c149802 AP |
722 | eval(shift(@insns)); |
723 | ||
724 | &pslldq (@X[4],12); # "X[0]"<<96, extract one dword | |
725 | &paddd (@X[0],@X[0]); | |
726 | eval(shift(@insns)); | |
727 | eval(shift(@insns)); | |
0c149802 AP |
728 | |
729 | &psrld (@X[2],31); | |
730 | eval(shift(@insns)); | |
b217ca63 | 731 | eval(shift(@insns)); # rol |
0c149802 AP |
732 | &movdqa (@X[3],@X[4]); |
733 | eval(shift(@insns)); | |
734 | eval(shift(@insns)); | |
b217ca63 | 735 | eval(shift(@insns)); |
0c149802 AP |
736 | |
737 | &psrld (@X[4],30); | |
0c149802 | 738 | eval(shift(@insns)); |
b217ca63 AP |
739 | eval(shift(@insns)); # ror |
740 | &por (@X[0],@X[2]); # "X[0]"<<<=1 | |
0c149802 AP |
741 | eval(shift(@insns)); |
742 | &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer | |
743 | eval(shift(@insns)); | |
744 | eval(shift(@insns)); | |
745 | ||
746 | &pslld (@X[3],2); | |
0c149802 | 747 | eval(shift(@insns)); |
b217ca63 AP |
748 | eval(shift(@insns)); # rol |
749 | &pxor (@X[0],@X[4]); | |
0c149802 AP |
750 | &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX |
751 | eval(shift(@insns)); | |
752 | eval(shift(@insns)); | |
753 | ||
754 | &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 | |
b217ca63 AP |
755 | &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) |
756 | &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); | |
0c149802 AP |
757 | eval(shift(@insns)); |
758 | eval(shift(@insns)); | |
759 | ||
760 | foreach (@insns) { eval; } # remaining instructions [if any] | |
761 | ||
762 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
763 | } | |
764 | ||
sub Xupdate_ssse3_32_79()
{ use integer;
  # One SSSE3 message-schedule step for rounds 32..79, with the scalar
  # snippets of four rounds threaded between the SIMD instructions.
  # The single argument is a code ref returning the snippet list for one
  # round; it is invoked four times.  On exit the enclosing $Xi is
  # advanced and the enclosing @X register ring is rotated by one.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 to 44 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	&punpcklqdq(@X[2],@X[-1&7]);		# compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
	for my $n (1..3) { eval(shift(@insns)); }

	&pxor	(@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	for my $n (1..2) { eval(shift(@insns)); }
	# take one more snippet here when the next one in line is a rotate-left
	eval(shift(@insns))	if ($insns[0] =~ /_rol/);
	if ($Xi%5) {
	    &movdqa	(@X[4],@X[3]);		# "perpetuate" K_XX_XX...
	} else {				# ... or load next one
	    &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	}
	eval(shift(@insns));
	&paddd	(@X[3],@X[-1&7]);
	eval(shift(@insns));

	&pxor	(@X[0],@X[2]);			# "X[0]"^="X[-6]"
	for my $n (1..4) { eval(shift(@insns)); }

	&movdqa	(@X[2],@X[0]);
	&movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	for my $n (1..4) { eval(shift(@insns)); }
	eval(shift(@insns))	if ($insns[0] =~ /_rol/);

	&pslld	(@X[0],2);
	for my $n (1..2) { eval(shift(@insns)); }
	&psrld	(@X[2],30);
	for my $n (1..6) { eval(shift(@insns)); }
	# two look-ahead checks: first on the snippet after next, then on the next
	eval(shift(@insns))	if ($insns[1] =~ /_rol/);
	eval(shift(@insns))	if ($insns[0] =~ /_rol/);

	&por	(@X[0],@X[2]);			# "X[0]"<<<=2
	for my $n (1..2) { eval(shift(@insns)); }
	&movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	for my $n (1..5) { eval(shift(@insns)); }
	&pshufd	(@X[3],@X[-1],0xee)	if ($Xi<19);	# was &movdqa	(@X[3],@X[0])
	eval(shift(@insns));

	eval($_) foreach (@insns);		# whatever snippets are still queued

  $Xi++;
  push(@X,shift(@X));				# "rotate" X[]
}
835 | ||
sub Xuplast_ssse3_80()
{ use integer;
  # Final SIMD step of a block: add K to the last schedule vector, hand
  # it to the integer unit, then either branch to "done" when the input
  # pointer equals the saved end pointer or start loading the next
  # 64-byte block.  The argument is a code ref returning one round's
  # snippets; it is invoked four times.  Resets the enclosing $Xi to 0.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	for my $n (1..7) { eval(shift(@insns)); }
	&paddd	(@X[3],@X[-1&7]);
	for my $n (1..4) { eval(shift(@insns)); }

	&movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	eval($_) foreach (@insns);		# remaining snippets

	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	for my $q (0..3) {			# load input, 16 bytes per register
	    &movdqu	(@X[($q-4)&7],&QWP(16*$q,$inp));
	}
	&add	($inp,64);
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}
876 | ||
sub Xloop_ssse3()
{ use integer;
  # Overlaps four scalar rounds with preparation of the next block's
  # input: byte-swap one freshly loaded vector, add the constant held in
  # @X[3] to another, store that sum to the transfer area and then
  # subtract the constant again.  The argument is a code ref returning
  # one round's snippets; it is invoked four times.  Advances $Xi.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	for my $n (1..7) { eval(shift(@insns)); }
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	for my $n (1..4) { eval(shift(@insns)); }
	&paddd	(@X[($Xi-4)&7],@X[3]);
	for my $n (1..4) { eval(shift(@insns)); }
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	for my $n (1..4) { eval(shift(@insns)); }
	&psubd	(@X[($Xi-4)&7],@X[3]);

	eval($_) foreach (@insns);

  $Xi++;
}
910 | ||
sub Xtail_ssse3()
{ use integer;
  # Scalar-only step: expand four rounds from the supplied generator
  # and emit them back to back, with no SIMD work in between.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	eval($_) foreach (@insns);
}
919 | ||
35c77b73 AP |
sub body_00_19 () {	# ((c^d)&b)^d
	# Returns the code snippets for one scalar round of this type; the
	# caller eval's them one at a time.  Expects @T[0]=(c^d)&b on start.
	# Once $rx has reached 19 the request is forwarded to body_20_39.
	return &body_20_39() if ($rx==19);
	$rx++;

	# trailer shared by every round: bump $j and rotate @V and @T
	my $advance = '$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));';

	return (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror ($b,$j?7:2);',			# $b>>>2
	'&xor (@T[0],$d);',
	'&mov (@T[1],$a);',			# $b in next round

	'&add ($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&xor ($b,$c);',			# $c^$d for next round

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&and (@T[1],$b);',			# ($b&($c^$d)) for next round

	'&xor ($b,$c);',			# restore $b
	'&add ($e,$a);' . $advance
	);
}
940 | ||
35c77b73 AP |
sub body_20_39 () {	# b^d^c
	# Returns the code snippets for one scalar round of this type; the
	# caller eval's them one at a time.  Expects @T[0]=b^d on entry.
	# Once $rx has reached 39 the request is forwarded to body_40_59.
	return &body_40_59() if ($rx==39);
	$rx++;

	# trailer shared by every round: bump $j and rotate @V and @T
	my $advance = '$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));';

	return (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&xor (@T[0],$d) if($j==19);'.
	'&xor (@T[0],$c) if($j> 19);',		# ($b^$d^$c)
	'&mov (@T[1],$a);',			# $b in next round

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor (@T[1],$c) if ($j< 79);',		# $b^$d for next round

	'&$_ror ($b,7);',			# $b>>>2
	'&add ($e,$a);' . $advance
	);
}
959 | ||
35c77b73 AP |
sub body_40_59 () {	# ((b^c)&(c^d))^c
	# Returns the code snippets for one scalar round of this type; the
	# caller eval's them one at a time.  Expects @T[0]=(b^c) and c^=d
	# on entry.  Unlike its siblings it never forwards elsewhere; it
	# only counts the call in $rx.
	$rx++;

	# trailer shared by every round: bump $j and rotate @V and @T
	my $advance = '$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));';

	return (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&and (@T[0],$c) if ($j>=40);',		# (b^c)&(c^d)
	'&xor ($c,$d) if ($j>=40);',		# restore $c

	'&$_ror ($b,7);',			# $b>>>2
	'&mov (@T[1],$a);',			# $b for next round
	'&xor (@T[0],$c);',

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor (@T[1],$c) if ($j==59);'.
	'&xor (@T[1],$b) if ($j< 59);',		# b^c for next round

	'&xor ($b,$c) if ($j< 59);',		# c^d for next round
	'&add ($e,$a);' . $advance
	);
}
b217ca63 AP |
982 | ###### |
sub bodyx_00_19 () {	# ((c^d)&b)^d
	# rorx/andn flavour of the round generator: returns the snippets
	# for one round, to be eval'ed by the caller.
	# Expects @T[0]=(b&c)^(~b&d) and $e+=X[]+K on start.
	# Once $rx has reached 19 the request is forwarded to bodyx_20_39.
	return &bodyx_20_39() if ($rx==19);
	$rx++;

	# trailer shared by every round: bump $j and rotate @V and @T
	my $advance = '$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));';

	return (
	'($a,$b,$c,$d,$e)=@V;'.
	'&rorx ($b,$b,2) if ($j==0);'.		# $b>>>2
	'&rorx ($b,@T[1],7) if ($j!=0);',	# $b>>>2
	'&lea ($e,&DWP(0,$e,@T[0]));',
	'&rorx (@T[0],$a,5);',

	'&andn (@T[1],$a,$c);',
	'&and ($a,$b)',
	'&add ($d,&DWP(4*(($j+1)&15),"esp"));',	# X[]+K xfer

	'&xor (@T[1],$a)',
	'&add ($e,@T[0]);' . $advance
	);
}
1002 | ||
sub bodyx_20_39 () {	# b^d^c
	# rorx flavour of the round generator: returns the snippets for one
	# round, to be eval'ed by the caller.  Expects $b=b^c^d on start.
	# Once $rx has reached 39 the request is forwarded to bodyx_40_59.
	return &bodyx_40_59() if ($rx==39);
	$rx++;

	# trailer shared by every round: bump $j and rotate @V and @T
	my $advance = '$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));';

	return (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,($j==19?@T[0]:$b))',
	'&rorx ($b,@T[1],7);',			# $b>>>2
	'&rorx (@T[0],$a,5);',

	'&xor ($a,$b) if ($j<79);',
	'&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);',	# X[]+K xfer
	'&xor ($a,$c) if ($j<79);',
	'&add ($e,@T[0]);' . $advance
	);
}
1019 | ||
sub bodyx_40_59 () {	# ((b^c)&(c^d))^c
	# rorx flavour of the round generator: returns the snippets for one
	# round, to be eval'ed one at a time by the caller.
	# Expects $b=((b^c)&(c^d))^c on start.
	# Once $rx has reached 59 the request is forwarded to bodyx_20_39.
	return &bodyx_20_39() if ($rx==59); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx (@T[0],$a,5)',
	'&lea ($e,&DWP(0,$e,$b))',
	'&rorx ($b,@T[1],7)',			# $b>>>2
	'&add ($d,&DWP(4*(($j+1)&15),"esp"))',	# X[]+K xfer

	'&mov (@T[1],$c)',
	'&xor ($a,$b)',				# b^c for next round
	'&xor (@T[1],$b)',			# c^d for next round

	'&and ($a,@T[1])',
	'&add ($e,@T[0])',
	# The ';' after the xor is required: this snippet is concatenated
	# with the round-advance code, and without a separator the string
	# eval fails to parse, so neither the xor nor the $j/@V/@T update
	# would take effect.
	'&xor ($a,$b);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
0c149802 AP |
1040 | |
# SSSE3 main loop: 16 SIMD-assisted steps of four rounds each, one
# closing step, then either three more steps overlapped with the next
# block's input preparation or a scalar-only tail at "done".
	&set_label("loop",16);
	for my $pass (1..4) { &Xupdate_ssse3_16_31(\&body_00_19); }
	&Xupdate_ssse3_32_79(\&body_00_19);
	for my $pass (1..5) { &Xupdate_ssse3_32_79(\&body_20_39); }
	for my $pass (1..5) { &Xupdate_ssse3_32_79(\&body_40_59); }
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	# remember generator state so the "done" path can replay from here
	$saved_j=$j;
	@saved_V=@V;

	for my $pass (1..3) { &Xloop_ssse3(\&body_20_39); }

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	($B,$C);
	&mov	(&DWP(12,@T[1]),$D);
	&xor	($B,$D);
	&mov	(&DWP(16,@T[1]),$E);
	&mov	(@T[1],@T[0]);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&and	(@T[0],$B);
	&mov	($B,$T[1]);

	&jmp	(&label("loop"));

	&set_label("done",16);
	$j=$saved_j;				# rewind to the state saved above
	@V=@saved_V;

	for my $pass (1..3) { &Xtail_ssse3(\&body_20_39); }

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);

	&function_end("_sha1_block_data_order_ssse3");

	$rx=0;					# reset
1108 | ||
0c149802 AP |
1109 | if ($ymm) { |
# Generator state and function prologue for the AVX code path.
	my $Xi=4;				# 4xSIMD Xupdate round, start pre-seeded
	my @X=map { "xmm$_" } (4..7,0..3);	# pre-seeded for $Xi=4
	my @V=($A,$B,$C,$D,$E);
	my $j=0;				# hash round
	my @T=($T,$tmp1);
	my $inp;

	# rotates are emitted as double-shifts of a register with itself
	my $_rol=sub { &shld(@_[0],@_) };
	my $_ror=sub { &shrd(@_[0],@_) };

	&function_begin("_sha1_block_data_order_avx");
	&call	(&label("pic_point"));		# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
	&set_label("avx_shortcut");
	&vzeroall();

	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);
	&and	("esp",-64);

	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	for my $q (0..3) {			# load input to %xmm[0-3]
	    &vmovdqu(@X[($q-4)&7],&QWP(16*$q-64,$inp));
	}
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
	for my $q (0..2) {			# add K_00_19
	    &vpaddd	(@X[$q],@X[($q-4)&7],@X[3]);
	}
	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
	&mov	(@T[1],$C);
	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
	&xor	(@T[1],$D);
	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
	&and	(@T[0],@T[1]);
	&jmp	(&label("loop"));
1202 | ||
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  # One AVX message-schedule step for rounds 16..31, with the scalar
  # snippets of four rounds threaded between the SIMD instructions.
  # The argument is a code ref returning one round's snippets; it is
  # invoked four times.  On exit the enclosing $Xi is advanced and the
  # enclosing @X register ring is rotated by one.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 40 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	for my $n (1..2) { eval(shift(@insns)); }
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	for my $n (1..2) { eval(shift(@insns)); }

	&vpaddd	(@X[3],@X[3],@X[-1&7]);
	&vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	for my $n (1..2) { eval(shift(@insns)); }
	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
	for my $n (1..2) { eval(shift(@insns)); }
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	for my $n (1..2) { eval(shift(@insns)); }

	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
	for my $n (1..2) { eval(shift(@insns)); }
	&vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	for my $n (1..2) { eval(shift(@insns)); }

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	for my $n (1..4) { eval(shift(@insns)); }

	&vpsrld	(@X[2],@X[0],31);
	for my $n (1..4) { eval(shift(@insns)); }

	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	for my $n (1..4) { eval(shift(@insns)); }

	&vpsrld	(@X[3],@X[4],30);
	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
	for my $n (1..4) { eval(shift(@insns)); }

	&vpslld	(@X[4],@X[4],2);
	&vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	for my $n (1..2) { eval(shift(@insns)); }
	&vpxor	(@X[0],@X[0],@X[3]);
	for my $n (1..4) { eval(shift(@insns)); }

	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
	for my $n (1..2) { eval(shift(@insns)); }
	&vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	for my $n (1..2) { eval(shift(@insns)); }

	eval($_) foreach (@insns);		# remaining snippets [if any]

  $Xi++;
  push(@X,shift(@X));				# "rotate" X[]
}
1280 | ||
sub Xupdate_avx_32_79()
{ use integer;
  # One AVX message-schedule step for rounds 32..79, with the scalar
  # snippets of four rounds threaded between the SIMD instructions.
  # The argument is a code ref returning one round's snippets; it is
  # invoked four times.  On exit the enclosing $Xi is advanced and the
  # enclosing @X register ring is rotated by one.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 to 44 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	for my $n (1..4) { eval(shift(@insns)); }

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	for my $n (1..2) { eval(shift(@insns)); }
	if ($Xi%5) {
	    &vmovdqa	(@X[4],@X[3]);		# "perpetuate" K_XX_XX...
	} else {				# ... or load next one
	    &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	}
	&vpaddd	(@X[3],@X[3],@X[-1&7]);
	for my $n (1..2) { eval(shift(@insns)); }

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
	for my $n (1..4) { eval(shift(@insns)); }

	&vpsrld	(@X[2],@X[0],30);
	&vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	for my $n (1..4) { eval(shift(@insns)); }

	&vpslld	(@X[0],@X[0],2);
	for my $n (1..8) { eval(shift(@insns)); }

	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=2
	for my $n (1..2) { eval(shift(@insns)); }
	&vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	for my $n (1..6) { eval(shift(@insns)); }

	eval($_) foreach (@insns);		# remaining snippets

  $Xi++;
  push(@X,shift(@X));				# "rotate" X[]
}
1345 | ||
sub Xuplast_avx_80()
{ use integer;
  # Final SIMD step of a block: add K to the last schedule vector, hand
  # it to the integer unit, then either branch to "done" when the input
  # pointer equals the saved end pointer or start loading the next
  # 64-byte block.  The argument is a code ref returning one round's
  # snippets; it is invoked four times.  Resets the enclosing $Xi to 0.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	eval(shift(@insns));
	&vpaddd	(@X[3],@X[3],@X[-1&7]);
	for my $n (1..4) { eval(shift(@insns)); }

	&vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	eval($_) foreach (@insns);		# remaining snippets

	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	for my $q (0..3) {			# load input, 16 bytes per register
	    &vmovdqu(@X[($q-4)&7],&QWP(16*$q,$inp));
	}
	&add	($inp,64);
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}
1380 | ||
sub Xloop_avx()
{ use integer;
  # Overlaps four scalar rounds with preparation of the next block's
  # input: byte-swap one freshly loaded vector, add the constant held in
  # @X[3] to another into a separate register, and store that sum to
  # the transfer area.  The argument is a code ref returning one round's
  # snippets; it is invoked four times.  Advances the enclosing $Xi.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	for my $n (1..2) { eval(shift(@insns)); }
	&vpshufb	(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	for my $n (1..2) { eval(shift(@insns)); }
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
	for my $n (1..4) { eval(shift(@insns)); }
	&vmovdqa	(&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
	for my $n (1..2) { eval(shift(@insns)); }

	eval($_) foreach (@insns);

  $Xi++;
}
1404 | ||
sub Xtail_avx()
{ use integer;
  # Scalar-only step: expand four rounds from the supplied generator
  # and emit them back to back, with no SIMD work in between.
  my $round = shift;
  my @insns = (&$round,&$round,&$round,&$round);	# 32 snippets
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets

	eval($_) foreach (@insns);
}
1413 | ||
# AVX main loop: 16 SIMD-assisted steps of four rounds each, one closing
# step, then either three more steps overlapped with the next block's
# input preparation or a scalar-only tail at "done".
	&set_label("loop",16);
	for my $pass (1..4) { &Xupdate_avx_16_31(\&body_00_19); }
	&Xupdate_avx_32_79(\&body_00_19);
	for my $pass (1..5) { &Xupdate_avx_32_79(\&body_20_39); }
	for my $pass (1..5) { &Xupdate_avx_32_79(\&body_40_59); }
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	# remember generator state so the "done" path can replay from here
	$saved_j=$j;
	@saved_V=@V;

	for my $pass (1..3) { &Xloop_avx(\&body_20_39); }

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	($B,$C);
	&mov	(&DWP(8,@T[1]),$C);
	&xor	($B,$D);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
	&mov	(@T[1],@T[0]);
	&and	(@T[0],$B);
	&mov	($B,@T[1]);

	&jmp	(&label("loop"));

	&set_label("done",16);
	$j=$saved_j;				# rewind to the state saved above
	@V=@saved_V;

	for my $pass (1..3) { &Xtail_avx(\&body_20_39); }

	&vzeroall();

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
	&function_end("_sha1_block_data_order_avx");
1479 | } | |
# Constant pool: each round constant is emitted four times in a row,
# followed by the pbswap mask row and a descending byte row.
	&set_label("K_XX_XX",64);
	foreach my $k (0x5a827999,		# K_00_19
		       0x6ed9eba1,		# K_20_39
		       0x8f1bbcdc,		# K_40_59
		       0xca62c1d6) {		# K_60_79
	    &data_word($k,$k,$k,$k);
	}
	&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
	&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0);
0c149802 | 1487 | } |
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# The generated assembly is written to STDOUT.  Buffered write errors
# only surface when the handle is closed, so a failure here must abort
# instead of leaving truncated output behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";