#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
# same instruction sequence used for both SHA-256 and SHA-512. In
# former case the instructions operate on 32-bit operands, while in
# latter - on 64-bit ones. All I had to do is to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. SSSE3
# code path was not attempted for SHA512, because improvement is not
# estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with exclusion
# for VIA Nano, but it has SHA512 instruction that is faster and
# should be used instead.] For reference, corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)	    7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)	    7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)	    12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;

# ---------------------------------------------------------------------------
# Command-line / toolchain setup.
# $output is the last argument if it looks like a file (it has an extension);
# $flavour is the first argument if it doesn't look like a file.
# ---------------------------------------------------------------------------
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler for AVX/AVX2 capability: $avx is 0 (none),
# 1 (AVX) or 2 (AVX2), derived from GNU as, nasm, ml64 or clang version.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

# Pipe all generated assembly through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Select SHA-512 vs SHA-256 parameters from the output file name:
# word size ($SZ), round constants table, per-round rotation amounts
# (Sigma/sigma per FIPS 180-4) and the eight state registers @ROT.
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

# Stack-frame slots for the saved arguments and the caller's %rsp;
# the frame holds 16 message words plus these four 8-byte slots.
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";
2337eb58 AP |
185 | |
186 | ||
# Emit one scalar SHA round (rounds 0..15 use the message word already in
# $T1).  Uses interleaved ror/xor sequences for Sigma0/Sigma1 and the
# "alternative Maj" trick: Maj(a,b,c) = Ch(a^b,c,b), with a^b carried in
# $a3 from the previous round.  $STRIDE advances $Tbl past the duplicated
# K[] rows every 16/$SZ rounds.  Called via & to bypass the () prototype.
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}
235 | ||
# Emit the message-schedule expansion for rounds 16..$rounds-1:
# X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] (all indices mod 16,
# kept in the 16-word ring buffer on the stack), then fall through to the
# common round body.  Also folds in the modulo-scheduled h+=Sigma0(a)
# deferred from the previous round.
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}
267 | ||
# ---------------------------------------------------------------------------
# Integer-only (non-SIMD) code path: function prologue with runtime CPU
# feature dispatch to the SHAEXT/XOP/AVX2/AVX/SSSE3 shortcuts, stack frame
# setup, 16 fully-unrolled input rounds, the 16..N schedule loop, state
# update, and CFI-annotated epilogue.
# ---------------------------------------------------------------------------
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	# rounds 0..15: load+byte-swap each input word, rotate @ROT after
	# each round so the register names follow the a..h permutation
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	$func,.-$func
___
418 | ||
# Round-constant tables (FIPS 180-4).  Each row is emitted twice so the
# SIMD code paths can load pre-duplicated constants; the trailing rows are
# the byte-swap / merge masks used by those paths.
if ($SZ==4) {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
} else {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
}
556 | ||
######################################################################
# SIMD code paths
#
if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	sha256_block_data_order_shaext,\@function,3
.align	64
sha256_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
# Win64 ABI requires %xmm6-%xmm10 to be preserved across the call.
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	nop
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	lea	0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	nop
	paddd	$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	nop
	paddd	$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
# Middle rounds 16..51 share one template; @MSG rotates each iteration.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$i*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	nop
	paddd	$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa	13*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd	$TMP,@MSG[2]

	movdqa	14*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa	$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	dec	$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH	# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}
a8f3b8b5 AP |
722 | {{{ |
723 | ||
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

# Catch-all for "&ror(...)"-style calls in the SIMD round descriptions:
# any undefined sub name is treated as a mnemonic and appended to $code,
# with a numeric last argument rendered as an immediate ($N) and the
# remaining operands emitted in AT&T (reversed) order.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
733 | ||
# Return the scalar round body as a list of code strings, one short
# instruction group per element, so the SIMD code paths can interleave
# them with vector instructions before eval'ing each step.  The first
# element rebinds a..h from @ROT; the last rotates @ROT and swaps the
# $a2/$a3 Maj accumulators for the next round.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a0,$e)',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
775 | ||
776 | ###################################################################### | |
777 | # SSSE3 code path | |
778 | # | |
779 | if ($SZ==4) { # SHA256 only | |
780 | my @X = map("%xmm$_",(0..3)); | |
781 | my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); | |
782 | ||
783 | $code.=<<___; | |
c4558efb | 784 | .type ${func}_ssse3,\@function,3 |
a8f3b8b5 AP |
785 | .align 64 |
786 | ${func}_ssse3: | |
399976c7 | 787 | .cfi_startproc |
a8f3b8b5 | 788 | .Lssse3_shortcut: |
384e6de4 | 789 | mov %rsp,%rax # copy %rsp |
399976c7 | 790 | .cfi_def_cfa_register %rax |
a8f3b8b5 | 791 | push %rbx |
399976c7 | 792 | .cfi_push %rbx |
a8f3b8b5 | 793 | push %rbp |
399976c7 | 794 | .cfi_push %rbp |
a8f3b8b5 | 795 | push %r12 |
399976c7 | 796 | .cfi_push %r12 |
a8f3b8b5 | 797 | push %r13 |
399976c7 | 798 | .cfi_push %r13 |
a8f3b8b5 | 799 | push %r14 |
399976c7 | 800 | .cfi_push %r14 |
a8f3b8b5 | 801 | push %r15 |
399976c7 | 802 | .cfi_push %r15 |
a8f3b8b5 AP |
803 | shl \$4,%rdx # num*16 |
804 | sub \$`$framesz+$win64*16*4`,%rsp | |
805 | lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |
806 | and \$-64,%rsp # align stack frame | |
807 | mov $ctx,$_ctx # save ctx, 1st arg | |
808 | mov $inp,$_inp # save inp, 2nd arh | |
809 | mov %rdx,$_end # save end pointer, "3rd" arg | |
384e6de4 | 810 | mov %rax,$_rsp # save copy of %rsp |
399976c7 | 811 | .cfi_cfa_expression $_rsp,deref,+8 |
a8f3b8b5 AP |
812 | ___ |
813 | $code.=<<___ if ($win64); | |
814 | movaps %xmm6,16*$SZ+32(%rsp) | |
815 | movaps %xmm7,16*$SZ+48(%rsp) | |
816 | movaps %xmm8,16*$SZ+64(%rsp) | |
817 | movaps %xmm9,16*$SZ+80(%rsp) | |
818 | ___ | |
819 | $code.=<<___; | |
820 | .Lprologue_ssse3: | |
821 | ||
822 | mov $SZ*0($ctx),$A | |
823 | mov $SZ*1($ctx),$B | |
824 | mov $SZ*2($ctx),$C | |
825 | mov $SZ*3($ctx),$D | |
826 | mov $SZ*4($ctx),$E | |
827 | mov $SZ*5($ctx),$F | |
828 | mov $SZ*6($ctx),$G | |
829 | mov $SZ*7($ctx),$H | |
830 | ___ | |
831 | ||
832 | $code.=<<___; | |
504bbcf3 AP |
833 | #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 |
834 | #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 | |
a8f3b8b5 AP |
835 | jmp .Lloop_ssse3 |
836 | .align 16 | |
837 | .Lloop_ssse3: | |
c4558efb | 838 | movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
a8f3b8b5 AP |
839 | movdqu 0x00($inp),@X[0] |
840 | movdqu 0x10($inp),@X[1] | |
841 | movdqu 0x20($inp),@X[2] | |
a8f3b8b5 | 842 | pshufb $t3,@X[0] |
619b9466 | 843 | movdqu 0x30($inp),@X[3] |
a8f3b8b5 AP |
844 | lea $TABLE(%rip),$Tbl |
845 | pshufb $t3,@X[1] | |
846 | movdqa 0x00($Tbl),$t0 | |
c4558efb | 847 | movdqa 0x20($Tbl),$t1 |
619b9466 | 848 | pshufb $t3,@X[2] |
a8f3b8b5 | 849 | paddd @X[0],$t0 |
c4558efb | 850 | movdqa 0x40($Tbl),$t2 |
a8f3b8b5 | 851 | pshufb $t3,@X[3] |
c4558efb | 852 | movdqa 0x60($Tbl),$t3 |
a8f3b8b5 AP |
853 | paddd @X[1],$t1 |
854 | paddd @X[2],$t2 | |
855 | paddd @X[3],$t3 | |
856 | movdqa $t0,0x00(%rsp) | |
857 | mov $A,$a1 | |
858 | movdqa $t1,0x10(%rsp) | |
859 | mov $B,$a3 | |
860 | movdqa $t2,0x20(%rsp) | |
861 | xor $C,$a3 # magic | |
862 | movdqa $t3,0x30(%rsp) | |
863 | mov $E,$a0 | |
864 | jmp .Lssse3_00_47 | |
865 | ||
866 | .align 16 | |
867 | .Lssse3_00_47: | |
147cca8f | 868 | sub \$`-16*2*$SZ`,$Tbl # size optimization |
a8f3b8b5 AP |
869 | ___ |
870 | sub Xupdate_256_SSSE3 () { | |
871 | ( | |
872 | '&movdqa ($t0,@X[1]);', | |
873 | '&movdqa ($t3,@X[3])', | |
874 | '&palignr ($t0,@X[0],$SZ)', # X[1..4] | |
875 | '&palignr ($t3,@X[2],$SZ);', # X[9..12] | |
876 | '&movdqa ($t1,$t0)', | |
877 | '&movdqa ($t2,$t0);', | |
878 | '&psrld ($t0,$sigma0[2])', | |
879 | '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] | |
880 | '&psrld ($t2,$sigma0[0])', | |
881 | '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] | |
882 | '&pslld ($t1,8*$SZ-$sigma0[1]);'. | |
883 | '&pxor ($t0,$t2)', | |
884 | '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. | |
885 | '&pxor ($t0,$t1)', | |
886 | '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. | |
887 | '&pxor ($t0,$t2);', | |
888 | '&movdqa ($t2,$t3)', | |
889 | '&pxor ($t0,$t1);', # sigma0(X[1..4]) | |
890 | '&psrld ($t3,$sigma1[2])', | |
891 | '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) | |
892 | '&psrlq ($t2,$sigma1[0])', | |
893 | '&pxor ($t3,$t2);', | |
894 | '&psrlq ($t2,$sigma1[1]-$sigma1[0])', | |
895 | '&pxor ($t3,$t2)', | |
896 | '&pshufb ($t3,$t4)', # sigma1(X[14..15]) | |
897 | '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) | |
898 | '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] | |
899 | '&movdqa ($t2,$t3);', | |
900 | '&psrld ($t3,$sigma1[2])', | |
901 | '&psrlq ($t2,$sigma1[0])', | |
902 | '&pxor ($t3,$t2);', | |
903 | '&psrlq ($t2,$sigma1[1]-$sigma1[0])', | |
904 | '&pxor ($t3,$t2);', | |
c4558efb | 905 | '&movdqa ($t2,16*2*$j."($Tbl)")', |
a8f3b8b5 AP |
906 | '&pshufb ($t3,$t5)', |
907 | '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) | |
908 | ); | |
909 | } | |
910 | ||
911 | sub SSSE3_256_00_47 () { | |
912 | my $j = shift; | |
913 | my $body = shift; | |
914 | my @X = @_; | |
915 | my @insns = (&$body,&$body,&$body,&$body); # 104 instructions | |
916 | ||
917 | if (0) { | |
918 | foreach (Xupdate_256_SSSE3()) { # 36 instructions | |
919 | eval; | |
920 | eval(shift(@insns)); | |
921 | eval(shift(@insns)); | |
922 | eval(shift(@insns)); | |
923 | } | |
c7f690c2 | 924 | } else { # squeeze extra 4% on Westmere and 19% on Atom |
a8f3b8b5 | 925 | eval(shift(@insns)); #@ |
a8f3b8b5 AP |
926 | &movdqa ($t0,@X[1]); |
927 | eval(shift(@insns)); | |
c7f690c2 | 928 | eval(shift(@insns)); |
a8f3b8b5 | 929 | &movdqa ($t3,@X[3]); |
c7f690c2 AP |
930 | eval(shift(@insns)); #@ |
931 | eval(shift(@insns)); | |
a8f3b8b5 AP |
932 | eval(shift(@insns)); |
933 | eval(shift(@insns)); #@ | |
934 | eval(shift(@insns)); | |
935 | &palignr ($t0,@X[0],$SZ); # X[1..4] | |
a8f3b8b5 | 936 | eval(shift(@insns)); |
a8f3b8b5 | 937 | eval(shift(@insns)); |
c7f690c2 | 938 | &palignr ($t3,@X[2],$SZ); # X[9..12] |
a8f3b8b5 AP |
939 | eval(shift(@insns)); |
940 | eval(shift(@insns)); | |
941 | eval(shift(@insns)); | |
942 | eval(shift(@insns)); #@ | |
a8f3b8b5 AP |
943 | &movdqa ($t1,$t0); |
944 | eval(shift(@insns)); | |
c7f690c2 | 945 | eval(shift(@insns)); |
a8f3b8b5 AP |
946 | &movdqa ($t2,$t0); |
947 | eval(shift(@insns)); #@ | |
948 | eval(shift(@insns)); | |
a8f3b8b5 AP |
949 | &psrld ($t0,$sigma0[2]); |
950 | eval(shift(@insns)); | |
951 | eval(shift(@insns)); | |
952 | eval(shift(@insns)); | |
953 | &paddd (@X[0],$t3); # X[0..3] += X[9..12] | |
a8f3b8b5 AP |
954 | eval(shift(@insns)); #@ |
955 | eval(shift(@insns)); | |
956 | &psrld ($t2,$sigma0[0]); | |
957 | eval(shift(@insns)); | |
958 | eval(shift(@insns)); | |
a8f3b8b5 AP |
959 | &pshufd ($t3,@X[3],0b11111010); # X[4..15] |
960 | eval(shift(@insns)); | |
c7f690c2 | 961 | eval(shift(@insns)); #@ |
a8f3b8b5 AP |
962 | &pslld ($t1,8*$SZ-$sigma0[1]); |
963 | eval(shift(@insns)); | |
c7f690c2 | 964 | eval(shift(@insns)); |
a8f3b8b5 AP |
965 | &pxor ($t0,$t2); |
966 | eval(shift(@insns)); #@ | |
967 | eval(shift(@insns)); | |
c7f690c2 | 968 | eval(shift(@insns)); |
a8f3b8b5 | 969 | eval(shift(@insns)); #@ |
c7f690c2 | 970 | &psrld ($t2,$sigma0[1]-$sigma0[0]); |
a8f3b8b5 AP |
971 | eval(shift(@insns)); |
972 | &pxor ($t0,$t1); | |
973 | eval(shift(@insns)); | |
974 | eval(shift(@insns)); | |
975 | &pslld ($t1,$sigma0[1]-$sigma0[0]); | |
976 | eval(shift(@insns)); | |
c7f690c2 | 977 | eval(shift(@insns)); |
a8f3b8b5 AP |
978 | &pxor ($t0,$t2); |
979 | eval(shift(@insns)); | |
980 | eval(shift(@insns)); #@ | |
a8f3b8b5 AP |
981 | &movdqa ($t2,$t3); |
982 | eval(shift(@insns)); | |
a8f3b8b5 AP |
983 | eval(shift(@insns)); |
984 | &pxor ($t0,$t1); # sigma0(X[1..4]) | |
c7f690c2 | 985 | eval(shift(@insns)); #@ |
a8f3b8b5 AP |
986 | eval(shift(@insns)); |
987 | eval(shift(@insns)); | |
988 | &psrld ($t3,$sigma1[2]); | |
989 | eval(shift(@insns)); | |
990 | eval(shift(@insns)); | |
991 | &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) | |
a8f3b8b5 AP |
992 | eval(shift(@insns)); #@ |
993 | eval(shift(@insns)); | |
a8f3b8b5 AP |
994 | &psrlq ($t2,$sigma1[0]); |
995 | eval(shift(@insns)); | |
a8f3b8b5 AP |
996 | eval(shift(@insns)); |
997 | eval(shift(@insns)); | |
998 | &pxor ($t3,$t2); | |
c7f690c2 AP |
999 | eval(shift(@insns)); #@ |
1000 | eval(shift(@insns)); | |
a8f3b8b5 AP |
1001 | eval(shift(@insns)); |
1002 | eval(shift(@insns)); #@ | |
1003 | &psrlq ($t2,$sigma1[1]-$sigma1[0]); | |
1004 | eval(shift(@insns)); | |
a8f3b8b5 AP |
1005 | eval(shift(@insns)); |
1006 | &pxor ($t3,$t2); | |
c7f690c2 | 1007 | eval(shift(@insns)); #@ |
a8f3b8b5 AP |
1008 | eval(shift(@insns)); |
1009 | eval(shift(@insns)); | |
504bbcf3 AP |
1010 | #&pshufb ($t3,$t4); # sigma1(X[14..15]) |
1011 | &pshufd ($t3,$t3,0b10000000); | |
a8f3b8b5 | 1012 | eval(shift(@insns)); |
c7f690c2 | 1013 | eval(shift(@insns)); |
a8f3b8b5 | 1014 | eval(shift(@insns)); |
504bbcf3 | 1015 | &psrldq ($t3,8); |
a8f3b8b5 AP |
1016 | eval(shift(@insns)); |
1017 | eval(shift(@insns)); #@ | |
c7f690c2 AP |
1018 | eval(shift(@insns)); |
1019 | eval(shift(@insns)); | |
1020 | eval(shift(@insns)); #@ | |
a8f3b8b5 AP |
1021 | &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) |
1022 | eval(shift(@insns)); | |
a8f3b8b5 AP |
1023 | eval(shift(@insns)); |
1024 | eval(shift(@insns)); | |
c7f690c2 | 1025 | &pshufd ($t3,@X[0],0b01010000); # X[16..17] |
a8f3b8b5 | 1026 | eval(shift(@insns)); |
c7f690c2 | 1027 | eval(shift(@insns)); #@ |
a8f3b8b5 AP |
1028 | eval(shift(@insns)); |
1029 | &movdqa ($t2,$t3); | |
1030 | eval(shift(@insns)); | |
a8f3b8b5 AP |
1031 | eval(shift(@insns)); |
1032 | &psrld ($t3,$sigma1[2]); | |
1033 | eval(shift(@insns)); | |
a8f3b8b5 | 1034 | eval(shift(@insns)); #@ |
c7f690c2 | 1035 | &psrlq ($t2,$sigma1[0]); |
a8f3b8b5 AP |
1036 | eval(shift(@insns)); |
1037 | eval(shift(@insns)); | |
1038 | &pxor ($t3,$t2); | |
c7f690c2 AP |
1039 | eval(shift(@insns)); #@ |
1040 | eval(shift(@insns)); | |
a8f3b8b5 AP |
1041 | eval(shift(@insns)); |
1042 | eval(shift(@insns)); #@ | |
1043 | eval(shift(@insns)); | |
1044 | &psrlq ($t2,$sigma1[1]-$sigma1[0]); | |
a8f3b8b5 AP |
1045 | eval(shift(@insns)); |
1046 | eval(shift(@insns)); | |
1047 | eval(shift(@insns)); | |
1048 | &pxor ($t3,$t2); | |
1049 | eval(shift(@insns)); | |
1050 | eval(shift(@insns)); | |
a8f3b8b5 | 1051 | eval(shift(@insns)); #@ |
504bbcf3 AP |
1052 | #&pshufb ($t3,$t5); |
1053 | &pshufd ($t3,$t3,0b00001000); | |
a8f3b8b5 | 1054 | eval(shift(@insns)); |
c7f690c2 AP |
1055 | eval(shift(@insns)); |
1056 | &movdqa ($t2,16*2*$j."($Tbl)"); | |
a8f3b8b5 AP |
1057 | eval(shift(@insns)); #@ |
1058 | eval(shift(@insns)); | |
504bbcf3 | 1059 | &pslldq ($t3,8); |
a8f3b8b5 AP |
1060 | eval(shift(@insns)); |
1061 | eval(shift(@insns)); | |
a8f3b8b5 | 1062 | eval(shift(@insns)); |
c7f690c2 AP |
1063 | &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) |
1064 | eval(shift(@insns)); #@ | |
a8f3b8b5 AP |
1065 | eval(shift(@insns)); |
1066 | eval(shift(@insns)); | |
1067 | } | |
1068 | &paddd ($t2,@X[0]); | |
1069 | foreach (@insns) { eval; } # remaining instructions | |
1070 | &movdqa (16*$j."(%rsp)",$t2); | |
1071 | } | |
1072 | ||
1073 | for ($i=0,$j=0; $j<4; $j++) { | |
1074 | &SSSE3_256_00_47($j,\&body_00_15,@X); | |
1075 | push(@X,shift(@X)); # rotate(@X) | |
1076 | } | |
c4558efb | 1077 | &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); |
a8f3b8b5 AP |
1078 | &jne (".Lssse3_00_47"); |
1079 | ||
1080 | for ($i=0; $i<16; ) { | |
1081 | foreach(body_00_15()) { eval; } | |
1082 | } | |
1083 | $code.=<<___; | |
1084 | mov $_ctx,$ctx | |
1085 | mov $a1,$A | |
1086 | ||
1087 | add $SZ*0($ctx),$A | |
1088 | lea 16*$SZ($inp),$inp | |
1089 | add $SZ*1($ctx),$B | |
1090 | add $SZ*2($ctx),$C | |
1091 | add $SZ*3($ctx),$D | |
1092 | add $SZ*4($ctx),$E | |
1093 | add $SZ*5($ctx),$F | |
1094 | add $SZ*6($ctx),$G | |
1095 | add $SZ*7($ctx),$H | |
1096 | ||
1097 | cmp $_end,$inp | |
1098 | ||
1099 | mov $A,$SZ*0($ctx) | |
1100 | mov $B,$SZ*1($ctx) | |
1101 | mov $C,$SZ*2($ctx) | |
1102 | mov $D,$SZ*3($ctx) | |
1103 | mov $E,$SZ*4($ctx) | |
1104 | mov $F,$SZ*5($ctx) | |
1105 | mov $G,$SZ*6($ctx) | |
1106 | mov $H,$SZ*7($ctx) | |
1107 | jb .Lloop_ssse3 | |
1108 | ||
1109 | mov $_rsp,%rsi | |
399976c7 | 1110 | .cfi_def_cfa %rsi,8 |
a8f3b8b5 AP |
1111 | ___ |
1112 | $code.=<<___ if ($win64); | |
1113 | movaps 16*$SZ+32(%rsp),%xmm6 | |
1114 | movaps 16*$SZ+48(%rsp),%xmm7 | |
1115 | movaps 16*$SZ+64(%rsp),%xmm8 | |
1116 | movaps 16*$SZ+80(%rsp),%xmm9 | |
1117 | ___ | |
1118 | $code.=<<___; | |
384e6de4 | 1119 | mov -48(%rsi),%r15 |
399976c7 | 1120 | .cfi_restore %r15 |
384e6de4 | 1121 | mov -40(%rsi),%r14 |
399976c7 | 1122 | .cfi_restore %r14 |
384e6de4 | 1123 | mov -32(%rsi),%r13 |
399976c7 | 1124 | .cfi_restore %r13 |
384e6de4 | 1125 | mov -24(%rsi),%r12 |
399976c7 | 1126 | .cfi_restore %r12 |
384e6de4 | 1127 | mov -16(%rsi),%rbp |
399976c7 | 1128 | .cfi_restore %rbp |
384e6de4 | 1129 | mov -8(%rsi),%rbx |
399976c7 | 1130 | .cfi_restore %rbx |
384e6de4 | 1131 | lea (%rsi),%rsp |
399976c7 | 1132 | .cfi_def_cfa_register %rsp |
a8f3b8b5 AP |
1133 | .Lepilogue_ssse3: |
1134 | ret | |
399976c7 | 1135 | .cfi_endproc |
a8f3b8b5 AP |
1136 | .size ${func}_ssse3,.-${func}_ssse3 |
1137 | ___ | |
1138 | } | |
1139 | ||
1140 | if ($avx) {{ | |
1141 | ###################################################################### | |
1142 | # XOP code path | |
1143 | # | |
f6ff1aa8 | 1144 | if ($SZ==8) { # SHA512 only |
a8f3b8b5 | 1145 | $code.=<<___; |
c4558efb | 1146 | .type ${func}_xop,\@function,3 |
a8f3b8b5 AP |
1147 | .align 64 |
1148 | ${func}_xop: | |
399976c7 | 1149 | .cfi_startproc |
a8f3b8b5 | 1150 | .Lxop_shortcut: |
384e6de4 | 1151 | mov %rsp,%rax # copy %rsp |
399976c7 | 1152 | .cfi_def_cfa_register %rax |
a8f3b8b5 | 1153 | push %rbx |
399976c7 | 1154 | .cfi_push %rbx |
a8f3b8b5 | 1155 | push %rbp |
399976c7 | 1156 | .cfi_push %rbp |
a8f3b8b5 | 1157 | push %r12 |
399976c7 | 1158 | .cfi_push %r12 |
a8f3b8b5 | 1159 | push %r13 |
399976c7 | 1160 | .cfi_push %r13 |
a8f3b8b5 | 1161 | push %r14 |
399976c7 | 1162 | .cfi_push %r14 |
a8f3b8b5 | 1163 | push %r15 |
399976c7 | 1164 | .cfi_push %r15 |
a8f3b8b5 AP |
1165 | shl \$4,%rdx # num*16 |
1166 | sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp | |
1167 | lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |
1168 | and \$-64,%rsp # align stack frame | |
1169 | mov $ctx,$_ctx # save ctx, 1st arg | |
1170 | mov $inp,$_inp # save inp, 2nd arh | |
1171 | mov %rdx,$_end # save end pointer, "3rd" arg | |
384e6de4 | 1172 | mov %rax,$_rsp # save copy of %rsp |
399976c7 | 1173 | .cfi_cfa_expression $_rsp,deref,+8 |
a8f3b8b5 AP |
1174 | ___ |
1175 | $code.=<<___ if ($win64); | |
1176 | movaps %xmm6,16*$SZ+32(%rsp) | |
1177 | movaps %xmm7,16*$SZ+48(%rsp) | |
1178 | movaps %xmm8,16*$SZ+64(%rsp) | |
1179 | movaps %xmm9,16*$SZ+80(%rsp) | |
1180 | ___ | |
1181 | $code.=<<___ if ($win64 && $SZ>4); | |
1182 | movaps %xmm10,16*$SZ+96(%rsp) | |
1183 | movaps %xmm11,16*$SZ+112(%rsp) | |
1184 | ___ | |
1185 | $code.=<<___; | |
1186 | .Lprologue_xop: | |
1187 | ||
00678437 | 1188 | vzeroupper |
a8f3b8b5 AP |
1189 | mov $SZ*0($ctx),$A |
1190 | mov $SZ*1($ctx),$B | |
1191 | mov $SZ*2($ctx),$C | |
1192 | mov $SZ*3($ctx),$D | |
1193 | mov $SZ*4($ctx),$E | |
1194 | mov $SZ*5($ctx),$F | |
1195 | mov $SZ*6($ctx),$G | |
1196 | mov $SZ*7($ctx),$H | |
1197 | jmp .Lloop_xop | |
1198 | ___ | |
1199 | if ($SZ==4) { # SHA256 | |
1200 | my @X = map("%xmm$_",(0..3)); | |
1201 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); | |
1202 | ||
1203 | $code.=<<___; | |
1204 | .align 16 | |
1205 | .Lloop_xop: | |
c4558efb | 1206 | vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
a8f3b8b5 AP |
1207 | vmovdqu 0x00($inp),@X[0] |
1208 | vmovdqu 0x10($inp),@X[1] | |
1209 | vmovdqu 0x20($inp),@X[2] | |
1210 | vmovdqu 0x30($inp),@X[3] | |
1211 | vpshufb $t3,@X[0],@X[0] | |
1212 | lea $TABLE(%rip),$Tbl | |
1213 | vpshufb $t3,@X[1],@X[1] | |
1214 | vpshufb $t3,@X[2],@X[2] | |
1215 | vpaddd 0x00($Tbl),@X[0],$t0 | |
1216 | vpshufb $t3,@X[3],@X[3] | |
c4558efb AP |
1217 | vpaddd 0x20($Tbl),@X[1],$t1 |
1218 | vpaddd 0x40($Tbl),@X[2],$t2 | |
1219 | vpaddd 0x60($Tbl),@X[3],$t3 | |
a8f3b8b5 AP |
1220 | vmovdqa $t0,0x00(%rsp) |
1221 | mov $A,$a1 | |
1222 | vmovdqa $t1,0x10(%rsp) | |
1223 | mov $B,$a3 | |
1224 | vmovdqa $t2,0x20(%rsp) | |
1225 | xor $C,$a3 # magic | |
1226 | vmovdqa $t3,0x30(%rsp) | |
1227 | mov $E,$a0 | |
1228 | jmp .Lxop_00_47 | |
1229 | ||
1230 | .align 16 | |
1231 | .Lxop_00_47: | |
147cca8f | 1232 | sub \$`-16*2*$SZ`,$Tbl # size optimization |
a8f3b8b5 AP |
1233 | ___ |
1234 | sub XOP_256_00_47 () { | |
1235 | my $j = shift; | |
1236 | my $body = shift; | |
1237 | my @X = @_; | |
1238 | my @insns = (&$body,&$body,&$body,&$body); # 104 instructions | |
1239 | ||
1240 | &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] | |
1241 | eval(shift(@insns)); | |
1242 | eval(shift(@insns)); | |
1243 | &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] | |
1244 | eval(shift(@insns)); | |
1245 | eval(shift(@insns)); | |
1246 | &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); | |
1247 | eval(shift(@insns)); | |
1248 | eval(shift(@insns)); | |
1249 | &vpsrld ($t0,$t0,$sigma0[2]); | |
1250 | eval(shift(@insns)); | |
1251 | eval(shift(@insns)); | |
1252 | &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] | |
1253 | eval(shift(@insns)); | |
1254 | eval(shift(@insns)); | |
1255 | eval(shift(@insns)); | |
1256 | eval(shift(@insns)); | |
1257 | &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); | |
1258 | eval(shift(@insns)); | |
1259 | eval(shift(@insns)); | |
1260 | &vpxor ($t0,$t0,$t1); | |
1261 | eval(shift(@insns)); | |
1262 | eval(shift(@insns)); | |
1263 | eval(shift(@insns)); | |
1264 | eval(shift(@insns)); | |
1265 | &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); | |
1266 | eval(shift(@insns)); | |
1267 | eval(shift(@insns)); | |
1268 | &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) | |
1269 | eval(shift(@insns)); | |
1270 | eval(shift(@insns)); | |
1271 | &vpsrld ($t2,@X[3],$sigma1[2]); | |
1272 | eval(shift(@insns)); | |
1273 | eval(shift(@insns)); | |
1274 | &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) | |
1275 | eval(shift(@insns)); | |
1276 | eval(shift(@insns)); | |
1277 | &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); | |
1278 | eval(shift(@insns)); | |
1279 | eval(shift(@insns)); | |
1280 | &vpxor ($t3,$t3,$t2); | |
1281 | eval(shift(@insns)); | |
1282 | eval(shift(@insns)); | |
1283 | eval(shift(@insns)); | |
1284 | eval(shift(@insns)); | |
1285 | &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) | |
1286 | eval(shift(@insns)); | |
1287 | eval(shift(@insns)); | |
1288 | eval(shift(@insns)); | |
1289 | eval(shift(@insns)); | |
1290 | &vpsrldq ($t3,$t3,8); | |
1291 | eval(shift(@insns)); | |
1292 | eval(shift(@insns)); | |
1293 | eval(shift(@insns)); | |
1294 | eval(shift(@insns)); | |
1295 | &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) | |
1296 | eval(shift(@insns)); | |
1297 | eval(shift(@insns)); | |
1298 | eval(shift(@insns)); | |
1299 | eval(shift(@insns)); | |
1300 | &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); | |
1301 | eval(shift(@insns)); | |
1302 | eval(shift(@insns)); | |
1303 | &vpsrld ($t2,@X[0],$sigma1[2]); | |
1304 | eval(shift(@insns)); | |
1305 | eval(shift(@insns)); | |
1306 | &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); | |
1307 | eval(shift(@insns)); | |
1308 | eval(shift(@insns)); | |
1309 | &vpxor ($t3,$t3,$t2); | |
1310 | eval(shift(@insns)); | |
1311 | eval(shift(@insns)); | |
1312 | eval(shift(@insns)); | |
1313 | eval(shift(@insns)); | |
1314 | &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) | |
1315 | eval(shift(@insns)); | |
1316 | eval(shift(@insns)); | |
1317 | eval(shift(@insns)); | |
1318 | eval(shift(@insns)); | |
1319 | &vpslldq ($t3,$t3,8); # 22 instructions | |
1320 | eval(shift(@insns)); | |
1321 | eval(shift(@insns)); | |
1322 | eval(shift(@insns)); | |
1323 | eval(shift(@insns)); | |
1324 | &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) | |
1325 | eval(shift(@insns)); | |
1326 | eval(shift(@insns)); | |
1327 | eval(shift(@insns)); | |
1328 | eval(shift(@insns)); | |
c4558efb | 1329 | &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); |
a8f3b8b5 AP |
1330 | foreach (@insns) { eval; } # remaining instructions |
1331 | &vmovdqa (16*$j."(%rsp)",$t2); | |
1332 | } | |
1333 | ||
1334 | for ($i=0,$j=0; $j<4; $j++) { | |
1335 | &XOP_256_00_47($j,\&body_00_15,@X); | |
1336 | push(@X,shift(@X)); # rotate(@X) | |
1337 | } | |
c4558efb | 1338 | &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); |
a8f3b8b5 AP |
1339 | &jne (".Lxop_00_47"); |
1340 | ||
1341 | for ($i=0; $i<16; ) { | |
1342 | foreach(body_00_15()) { eval; } | |
1343 | } | |
1344 | ||
1345 | } else { # SHA512 | |
1346 | my @X = map("%xmm$_",(0..7)); | |
1347 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); | |
1348 | ||
1349 | $code.=<<___; | |
1350 | .align 16 | |
1351 | .Lloop_xop: | |
c4558efb | 1352 | vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
a8f3b8b5 | 1353 | vmovdqu 0x00($inp),@X[0] |
c4558efb | 1354 | lea $TABLE+0x80(%rip),$Tbl # size optimization |
a8f3b8b5 AP |
1355 | vmovdqu 0x10($inp),@X[1] |
1356 | vmovdqu 0x20($inp),@X[2] | |
1357 | vpshufb $t3,@X[0],@X[0] | |
1358 | vmovdqu 0x30($inp),@X[3] | |
1359 | vpshufb $t3,@X[1],@X[1] | |
1360 | vmovdqu 0x40($inp),@X[4] | |
1361 | vpshufb $t3,@X[2],@X[2] | |
1362 | vmovdqu 0x50($inp),@X[5] | |
1363 | vpshufb $t3,@X[3],@X[3] | |
1364 | vmovdqu 0x60($inp),@X[6] | |
1365 | vpshufb $t3,@X[4],@X[4] | |
1366 | vmovdqu 0x70($inp),@X[7] | |
1367 | vpshufb $t3,@X[5],@X[5] | |
c4558efb | 1368 | vpaddq -0x80($Tbl),@X[0],$t0 |
a8f3b8b5 | 1369 | vpshufb $t3,@X[6],@X[6] |
c4558efb | 1370 | vpaddq -0x60($Tbl),@X[1],$t1 |
a8f3b8b5 | 1371 | vpshufb $t3,@X[7],@X[7] |
c4558efb AP |
1372 | vpaddq -0x40($Tbl),@X[2],$t2 |
1373 | vpaddq -0x20($Tbl),@X[3],$t3 | |
a8f3b8b5 | 1374 | vmovdqa $t0,0x00(%rsp) |
c4558efb | 1375 | vpaddq 0x00($Tbl),@X[4],$t0 |
a8f3b8b5 | 1376 | vmovdqa $t1,0x10(%rsp) |
c4558efb | 1377 | vpaddq 0x20($Tbl),@X[5],$t1 |
a8f3b8b5 | 1378 | vmovdqa $t2,0x20(%rsp) |
c4558efb | 1379 | vpaddq 0x40($Tbl),@X[6],$t2 |
a8f3b8b5 | 1380 | vmovdqa $t3,0x30(%rsp) |
c4558efb | 1381 | vpaddq 0x60($Tbl),@X[7],$t3 |
a8f3b8b5 AP |
1382 | vmovdqa $t0,0x40(%rsp) |
1383 | mov $A,$a1 | |
1384 | vmovdqa $t1,0x50(%rsp) | |
1385 | mov $B,$a3 | |
1386 | vmovdqa $t2,0x60(%rsp) | |
1387 | xor $C,$a3 # magic | |
1388 | vmovdqa $t3,0x70(%rsp) | |
1389 | mov $E,$a0 | |
1390 | jmp .Lxop_00_47 | |
1391 | ||
1392 | .align 16 | |
1393 | .Lxop_00_47: | |
147cca8f | 1394 | add \$`16*2*$SZ`,$Tbl |
a8f3b8b5 AP |
1395 | ___ |
1396 | sub XOP_512_00_47 () { | |
1397 | my $j = shift; | |
1398 | my $body = shift; | |
1399 | my @X = @_; | |
1400 | my @insns = (&$body,&$body); # 52 instructions | |
1401 | ||
1402 | &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] | |
1403 | eval(shift(@insns)); | |
1404 | eval(shift(@insns)); | |
1405 | &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] | |
1406 | eval(shift(@insns)); | |
1407 | eval(shift(@insns)); | |
1408 | &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); | |
1409 | eval(shift(@insns)); | |
1410 | eval(shift(@insns)); | |
1411 | &vpsrlq ($t0,$t0,$sigma0[2]); | |
1412 | eval(shift(@insns)); | |
1413 | eval(shift(@insns)); | |
1414 | &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] | |
1415 | eval(shift(@insns)); | |
1416 | eval(shift(@insns)); | |
1417 | eval(shift(@insns)); | |
1418 | eval(shift(@insns)); | |
1419 | &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); | |
1420 | eval(shift(@insns)); | |
1421 | eval(shift(@insns)); | |
1422 | &vpxor ($t0,$t0,$t1); | |
1423 | eval(shift(@insns)); | |
1424 | eval(shift(@insns)); | |
1425 | eval(shift(@insns)); | |
1426 | eval(shift(@insns)); | |
1427 | &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); | |
1428 | eval(shift(@insns)); | |
1429 | eval(shift(@insns)); | |
1430 | &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) | |
1431 | eval(shift(@insns)); | |
1432 | eval(shift(@insns)); | |
1433 | &vpsrlq ($t2,@X[7],$sigma1[2]); | |
1434 | eval(shift(@insns)); | |
1435 | eval(shift(@insns)); | |
1436 | &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) | |
1437 | eval(shift(@insns)); | |
1438 | eval(shift(@insns)); | |
1439 | &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); | |
1440 | eval(shift(@insns)); | |
1441 | eval(shift(@insns)); | |
1442 | &vpxor ($t3,$t3,$t2); | |
1443 | eval(shift(@insns)); | |
1444 | eval(shift(@insns)); | |
1445 | eval(shift(@insns)); | |
1446 | eval(shift(@insns)); | |
1447 | &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) | |
1448 | eval(shift(@insns)); | |
1449 | eval(shift(@insns)); | |
1450 | eval(shift(@insns)); | |
1451 | eval(shift(@insns)); | |
1452 | &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) | |
1453 | eval(shift(@insns)); | |
1454 | eval(shift(@insns)); | |
1455 | eval(shift(@insns)); | |
1456 | eval(shift(@insns)); | |
c4558efb | 1457 | &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); |
a8f3b8b5 AP |
1458 | foreach (@insns) { eval; } # remaining instructions |
1459 | &vmovdqa (16*$j."(%rsp)",$t2); | |
1460 | } | |
1461 | ||
1462 | for ($i=0,$j=0; $j<8; $j++) { | |
1463 | &XOP_512_00_47($j,\&body_00_15,@X); | |
1464 | push(@X,shift(@X)); # rotate(@X) | |
1465 | } | |
c4558efb | 1466 | &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); |
a8f3b8b5 AP |
1467 | &jne (".Lxop_00_47"); |
1468 | ||
1469 | for ($i=0; $i<16; ) { | |
1470 | foreach(body_00_15()) { eval; } | |
1471 | } | |
1472 | } | |
1473 | $code.=<<___; | |
1474 | mov $_ctx,$ctx | |
1475 | mov $a1,$A | |
1476 | ||
1477 | add $SZ*0($ctx),$A | |
1478 | lea 16*$SZ($inp),$inp | |
1479 | add $SZ*1($ctx),$B | |
1480 | add $SZ*2($ctx),$C | |
1481 | add $SZ*3($ctx),$D | |
1482 | add $SZ*4($ctx),$E | |
1483 | add $SZ*5($ctx),$F | |
1484 | add $SZ*6($ctx),$G | |
1485 | add $SZ*7($ctx),$H | |
1486 | ||
1487 | cmp $_end,$inp | |
1488 | ||
1489 | mov $A,$SZ*0($ctx) | |
1490 | mov $B,$SZ*1($ctx) | |
1491 | mov $C,$SZ*2($ctx) | |
1492 | mov $D,$SZ*3($ctx) | |
1493 | mov $E,$SZ*4($ctx) | |
1494 | mov $F,$SZ*5($ctx) | |
1495 | mov $G,$SZ*6($ctx) | |
1496 | mov $H,$SZ*7($ctx) | |
1497 | jb .Lloop_xop | |
1498 | ||
1499 | mov $_rsp,%rsi | |
399976c7 | 1500 | .cfi_def_cfa %rsi,8 |
00678437 | 1501 | vzeroupper |
a8f3b8b5 AP |
1502 | ___ |
1503 | $code.=<<___ if ($win64); | |
1504 | movaps 16*$SZ+32(%rsp),%xmm6 | |
1505 | movaps 16*$SZ+48(%rsp),%xmm7 | |
1506 | movaps 16*$SZ+64(%rsp),%xmm8 | |
1507 | movaps 16*$SZ+80(%rsp),%xmm9 | |
1508 | ___ | |
1509 | $code.=<<___ if ($win64 && $SZ>4); | |
1510 | movaps 16*$SZ+96(%rsp),%xmm10 | |
1511 | movaps 16*$SZ+112(%rsp),%xmm11 | |
1512 | ___ | |
1513 | $code.=<<___; | |
384e6de4 | 1514 | mov -48(%rsi),%r15 |
399976c7 | 1515 | .cfi_restore %r15 |
384e6de4 | 1516 | mov -40(%rsi),%r14 |
399976c7 | 1517 | .cfi_restore %r14 |
384e6de4 | 1518 | mov -32(%rsi),%r13 |
399976c7 | 1519 | .cfi_restore %r13 |
384e6de4 | 1520 | mov -24(%rsi),%r12 |
399976c7 | 1521 | .cfi_restore %r12 |
384e6de4 | 1522 | mov -16(%rsi),%rbp |
399976c7 | 1523 | .cfi_restore %rbp |
384e6de4 | 1524 | mov -8(%rsi),%rbx |
399976c7 | 1525 | .cfi_restore %rbx |
384e6de4 | 1526 | lea (%rsi),%rsp |
399976c7 | 1527 | .cfi_def_cfa_register %rsp |
a8f3b8b5 AP |
1528 | .Lepilogue_xop: |
1529 | ret | |
399976c7 | 1530 | .cfi_endproc |
a8f3b8b5 AP |
1531 | .size ${func}_xop,.-${func}_xop |
1532 | ___ | |
1533 | } | |
1534 | ###################################################################### | |
1535 | # AVX+shrd code path | |
1536 | # | |
1537 | local *ror = sub { &shrd(@_[0],@_) }; | |
1538 | ||
1539 | $code.=<<___; | |
c4558efb | 1540 | .type ${func}_avx,\@function,3 |
a8f3b8b5 AP |
1541 | .align 64 |
1542 | ${func}_avx: | |
399976c7 | 1543 | .cfi_startproc |
a8f3b8b5 | 1544 | .Lavx_shortcut: |
384e6de4 | 1545 | mov %rsp,%rax # copy %rsp |
399976c7 | 1546 | .cfi_def_cfa_register %rax |
a8f3b8b5 | 1547 | push %rbx |
399976c7 | 1548 | .cfi_push %rbx |
a8f3b8b5 | 1549 | push %rbp |
399976c7 | 1550 | .cfi_push %rbp |
a8f3b8b5 | 1551 | push %r12 |
399976c7 | 1552 | .cfi_push %r12 |
a8f3b8b5 | 1553 | push %r13 |
399976c7 | 1554 | .cfi_push %r13 |
a8f3b8b5 | 1555 | push %r14 |
399976c7 | 1556 | .cfi_push %r14 |
a8f3b8b5 | 1557 | push %r15 |
399976c7 | 1558 | .cfi_push %r15 |
a8f3b8b5 AP |
1559 | shl \$4,%rdx # num*16 |
1560 | sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp | |
1561 | lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |
1562 | and \$-64,%rsp # align stack frame | |
1563 | mov $ctx,$_ctx # save ctx, 1st arg | |
1564 | mov $inp,$_inp # save inp, 2nd arh | |
1565 | mov %rdx,$_end # save end pointer, "3rd" arg | |
384e6de4 | 1566 | mov %rax,$_rsp # save copy of %rsp |
399976c7 | 1567 | .cfi_cfa_expression $_rsp,deref,+8 |
a8f3b8b5 AP |
1568 | ___ |
1569 | $code.=<<___ if ($win64); | |
1570 | movaps %xmm6,16*$SZ+32(%rsp) | |
1571 | movaps %xmm7,16*$SZ+48(%rsp) | |
1572 | movaps %xmm8,16*$SZ+64(%rsp) | |
1573 | movaps %xmm9,16*$SZ+80(%rsp) | |
1574 | ___ | |
1575 | $code.=<<___ if ($win64 && $SZ>4); | |
1576 | movaps %xmm10,16*$SZ+96(%rsp) | |
1577 | movaps %xmm11,16*$SZ+112(%rsp) | |
1578 | ___ | |
1579 | $code.=<<___; | |
1580 | .Lprologue_avx: | |
1581 | ||
00678437 | 1582 | vzeroupper |
a8f3b8b5 AP |
1583 | mov $SZ*0($ctx),$A |
1584 | mov $SZ*1($ctx),$B | |
1585 | mov $SZ*2($ctx),$C | |
1586 | mov $SZ*3($ctx),$D | |
1587 | mov $SZ*4($ctx),$E | |
1588 | mov $SZ*5($ctx),$F | |
1589 | mov $SZ*6($ctx),$G | |
1590 | mov $SZ*7($ctx),$H | |
1591 | ___ | |
1592 | if ($SZ==4) { # SHA256 | |
1593 | my @X = map("%xmm$_",(0..3)); | |
1594 | my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); | |
1595 | ||
1596 | $code.=<<___; | |
c4558efb AP |
1597 | vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 |
1598 | vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 | |
a8f3b8b5 AP |
1599 | jmp .Lloop_avx |
1600 | .align 16 | |
1601 | .Lloop_avx: | |
c4558efb | 1602 | vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
a8f3b8b5 AP |
1603 | vmovdqu 0x00($inp),@X[0] |
1604 | vmovdqu 0x10($inp),@X[1] | |
1605 | vmovdqu 0x20($inp),@X[2] | |
1606 | vmovdqu 0x30($inp),@X[3] | |
1607 | vpshufb $t3,@X[0],@X[0] | |
1608 | lea $TABLE(%rip),$Tbl | |
1609 | vpshufb $t3,@X[1],@X[1] | |
1610 | vpshufb $t3,@X[2],@X[2] | |
1611 | vpaddd 0x00($Tbl),@X[0],$t0 | |
1612 | vpshufb $t3,@X[3],@X[3] | |
c4558efb AP |
1613 | vpaddd 0x20($Tbl),@X[1],$t1 |
1614 | vpaddd 0x40($Tbl),@X[2],$t2 | |
1615 | vpaddd 0x60($Tbl),@X[3],$t3 | |
a8f3b8b5 AP |
1616 | vmovdqa $t0,0x00(%rsp) |
1617 | mov $A,$a1 | |
1618 | vmovdqa $t1,0x10(%rsp) | |
1619 | mov $B,$a3 | |
1620 | vmovdqa $t2,0x20(%rsp) | |
1621 | xor $C,$a3 # magic | |
1622 | vmovdqa $t3,0x30(%rsp) | |
1623 | mov $E,$a0 | |
1624 | jmp .Lavx_00_47 | |
1625 | ||
1626 | .align 16 | |
1627 | .Lavx_00_47: | |
147cca8f | 1628 | sub \$`-16*2*$SZ`,$Tbl # size optimization |
a8f3b8b5 AP |
1629 | ___ |
1630 | sub Xupdate_256_AVX () { | |
1631 | ( | |
1632 | '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] | |
1633 | '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] | |
1634 | '&vpsrld ($t2,$t0,$sigma0[0]);', | |
1635 | '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] | |
1636 | '&vpsrld ($t3,$t0,$sigma0[2])', | |
1637 | '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', | |
1638 | '&vpxor ($t0,$t3,$t2)', | |
1639 | '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] | |
1640 | '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', | |
1641 | '&vpxor ($t0,$t0,$t1)', | |
1642 | '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', | |
1643 | '&vpxor ($t0,$t0,$t2)', | |
1644 | '&vpsrld ($t2,$t3,$sigma1[2]);', | |
1645 | '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) | |
1646 | '&vpsrlq ($t3,$t3,$sigma1[0]);', | |
1647 | '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) | |
1648 | '&vpxor ($t2,$t2,$t3);', | |
1649 | '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', | |
1650 | '&vpxor ($t2,$t2,$t3)', | |
1651 | '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) | |
1652 | '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) | |
1653 | '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] | |
1654 | '&vpsrld ($t2,$t3,$sigma1[2])', | |
1655 | '&vpsrlq ($t3,$t3,$sigma1[0])', | |
1656 | '&vpxor ($t2,$t2,$t3);', | |
1657 | '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', | |
1658 | '&vpxor ($t2,$t2,$t3)', | |
1659 | '&vpshufb ($t2,$t2,$t5)', | |
1660 | '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) | |
1661 | ); | |
1662 | } | |
1663 | ||
1664 | sub AVX_256_00_47 () { | |
1665 | my $j = shift; | |
1666 | my $body = shift; | |
1667 | my @X = @_; | |
1668 | my @insns = (&$body,&$body,&$body,&$body); # 104 instructions | |
1669 | ||
1670 | foreach (Xupdate_256_AVX()) { # 29 instructions | |
1671 | eval; | |
1672 | eval(shift(@insns)); | |
1673 | eval(shift(@insns)); | |
1674 | eval(shift(@insns)); | |
1675 | } | |
c4558efb | 1676 | &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); |
a8f3b8b5 AP |
1677 | foreach (@insns) { eval; } # remaining instructions |
1678 | &vmovdqa (16*$j."(%rsp)",$t2); | |
1679 | } | |
1680 | ||
1681 | for ($i=0,$j=0; $j<4; $j++) { | |
1682 | &AVX_256_00_47($j,\&body_00_15,@X); | |
1683 | push(@X,shift(@X)); # rotate(@X) | |
1684 | } | |
c4558efb | 1685 | &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); |
a8f3b8b5 AP |
1686 | &jne (".Lavx_00_47"); |
1687 | ||
1688 | for ($i=0; $i<16; ) { | |
1689 | foreach(body_00_15()) { eval; } | |
1690 | } | |
1691 | ||
1692 | } else { # SHA512 | |
1693 | my @X = map("%xmm$_",(0..7)); | |
1694 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); | |
1695 | ||
1696 | $code.=<<___; | |
1697 | jmp .Lloop_avx | |
1698 | .align 16 | |
1699 | .Lloop_avx: | |
c4558efb | 1700 | vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
a8f3b8b5 | 1701 | vmovdqu 0x00($inp),@X[0] |
c4558efb | 1702 | lea $TABLE+0x80(%rip),$Tbl # size optimization |
a8f3b8b5 AP |
1703 | vmovdqu 0x10($inp),@X[1] |
1704 | vmovdqu 0x20($inp),@X[2] | |
1705 | vpshufb $t3,@X[0],@X[0] | |
1706 | vmovdqu 0x30($inp),@X[3] | |
1707 | vpshufb $t3,@X[1],@X[1] | |
1708 | vmovdqu 0x40($inp),@X[4] | |
1709 | vpshufb $t3,@X[2],@X[2] | |
1710 | vmovdqu 0x50($inp),@X[5] | |
1711 | vpshufb $t3,@X[3],@X[3] | |
1712 | vmovdqu 0x60($inp),@X[6] | |
1713 | vpshufb $t3,@X[4],@X[4] | |
1714 | vmovdqu 0x70($inp),@X[7] | |
1715 | vpshufb $t3,@X[5],@X[5] | |
c4558efb | 1716 | vpaddq -0x80($Tbl),@X[0],$t0 |
a8f3b8b5 | 1717 | vpshufb $t3,@X[6],@X[6] |
c4558efb | 1718 | vpaddq -0x60($Tbl),@X[1],$t1 |
a8f3b8b5 | 1719 | vpshufb $t3,@X[7],@X[7] |
c4558efb AP |
1720 | vpaddq -0x40($Tbl),@X[2],$t2 |
1721 | vpaddq -0x20($Tbl),@X[3],$t3 | |
a8f3b8b5 | 1722 | vmovdqa $t0,0x00(%rsp) |
c4558efb | 1723 | vpaddq 0x00($Tbl),@X[4],$t0 |
a8f3b8b5 | 1724 | vmovdqa $t1,0x10(%rsp) |
c4558efb | 1725 | vpaddq 0x20($Tbl),@X[5],$t1 |
a8f3b8b5 | 1726 | vmovdqa $t2,0x20(%rsp) |
c4558efb | 1727 | vpaddq 0x40($Tbl),@X[6],$t2 |
a8f3b8b5 | 1728 | vmovdqa $t3,0x30(%rsp) |
c4558efb | 1729 | vpaddq 0x60($Tbl),@X[7],$t3 |
a8f3b8b5 AP |
1730 | vmovdqa $t0,0x40(%rsp) |
1731 | mov $A,$a1 | |
1732 | vmovdqa $t1,0x50(%rsp) | |
1733 | mov $B,$a3 | |
1734 | vmovdqa $t2,0x60(%rsp) | |
1735 | xor $C,$a3 # magic | |
1736 | vmovdqa $t3,0x70(%rsp) | |
1737 | mov $E,$a0 | |
1738 | jmp .Lavx_00_47 | |
1739 | ||
1740 | .align 16 | |
1741 | .Lavx_00_47: | |
147cca8f | 1742 | add \$`16*2*$SZ`,$Tbl |
a8f3b8b5 AP |
1743 | ___ |
1744 | sub Xupdate_512_AVX () { | |
1745 | ( | |
1746 | '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] | |
1747 | '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] | |
c4558efb AP |
1748 | '&vpsrlq ($t2,$t0,$sigma0[0])', |
1749 | '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] | |
a8f3b8b5 AP |
1750 | '&vpsrlq ($t3,$t0,$sigma0[2])', |
1751 | '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', | |
1752 | '&vpxor ($t0,$t3,$t2)', | |
1753 | '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', | |
1754 | '&vpxor ($t0,$t0,$t1)', | |
1755 | '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', | |
1756 | '&vpxor ($t0,$t0,$t2)', | |
1757 | '&vpsrlq ($t3,@X[7],$sigma1[2]);', | |
1758 | '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) | |
c4558efb | 1759 | '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', |
a8f3b8b5 AP |
1760 | '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) |
1761 | '&vpsrlq ($t1,@X[7],$sigma1[0]);', | |
1762 | '&vpxor ($t3,$t3,$t2)', | |
1763 | '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', | |
1764 | '&vpxor ($t3,$t3,$t1)', | |
1765 | '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', | |
1766 | '&vpxor ($t3,$t3,$t2)', | |
1767 | '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) | |
1768 | '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) | |
1769 | ); | |
1770 | } | |
1771 | ||
1772 | sub AVX_512_00_47 () { | |
1773 | my $j = shift; | |
1774 | my $body = shift; | |
1775 | my @X = @_; | |
1776 | my @insns = (&$body,&$body); # 52 instructions | |
1777 | ||
1778 | foreach (Xupdate_512_AVX()) { # 23 instructions | |
1779 | eval; | |
1780 | eval(shift(@insns)); | |
1781 | eval(shift(@insns)); | |
1782 | } | |
c4558efb | 1783 | &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); |
a8f3b8b5 AP |
1784 | foreach (@insns) { eval; } # remaining instructions |
1785 | &vmovdqa (16*$j."(%rsp)",$t2); | |
1786 | } | |
1787 | ||
1788 | for ($i=0,$j=0; $j<8; $j++) { | |
1789 | &AVX_512_00_47($j,\&body_00_15,@X); | |
1790 | push(@X,shift(@X)); # rotate(@X) | |
1791 | } | |
c4558efb | 1792 | &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); |
a8f3b8b5 AP |
1793 | &jne (".Lavx_00_47"); |
1794 | ||
1795 | for ($i=0; $i<16; ) { | |
1796 | foreach(body_00_15()) { eval; } | |
1797 | } | |
1798 | } | |
1799 | $code.=<<___; | |
1800 | mov $_ctx,$ctx | |
1801 | mov $a1,$A | |
1802 | ||
1803 | add $SZ*0($ctx),$A | |
1804 | lea 16*$SZ($inp),$inp | |
1805 | add $SZ*1($ctx),$B | |
1806 | add $SZ*2($ctx),$C | |
1807 | add $SZ*3($ctx),$D | |
1808 | add $SZ*4($ctx),$E | |
1809 | add $SZ*5($ctx),$F | |
1810 | add $SZ*6($ctx),$G | |
1811 | add $SZ*7($ctx),$H | |
1812 | ||
1813 | cmp $_end,$inp | |
1814 | ||
1815 | mov $A,$SZ*0($ctx) | |
1816 | mov $B,$SZ*1($ctx) | |
1817 | mov $C,$SZ*2($ctx) | |
1818 | mov $D,$SZ*3($ctx) | |
1819 | mov $E,$SZ*4($ctx) | |
1820 | mov $F,$SZ*5($ctx) | |
1821 | mov $G,$SZ*6($ctx) | |
1822 | mov $H,$SZ*7($ctx) | |
1823 | jb .Lloop_avx | |
1824 | ||
1825 | mov $_rsp,%rsi | |
399976c7 | 1826 | .cfi_def_cfa %rsi,8 |
00678437 | 1827 | vzeroupper |
a8f3b8b5 AP |
1828 | ___ |
1829 | $code.=<<___ if ($win64); | |
1830 | movaps 16*$SZ+32(%rsp),%xmm6 | |
1831 | movaps 16*$SZ+48(%rsp),%xmm7 | |
1832 | movaps 16*$SZ+64(%rsp),%xmm8 | |
1833 | movaps 16*$SZ+80(%rsp),%xmm9 | |
1834 | ___ | |
1835 | $code.=<<___ if ($win64 && $SZ>4); | |
1836 | movaps 16*$SZ+96(%rsp),%xmm10 | |
1837 | movaps 16*$SZ+112(%rsp),%xmm11 | |
1838 | ___ | |
1839 | $code.=<<___; | |
384e6de4 | 1840 | mov -48(%rsi),%r15 |
399976c7 | 1841 | .cfi_restore %r15 |
384e6de4 | 1842 | mov -40(%rsi),%r14 |
399976c7 | 1843 | .cfi_restore %r14 |
384e6de4 | 1844 | mov -32(%rsi),%r13 |
399976c7 | 1845 | .cfi_restore %r13 |
384e6de4 | 1846 | mov -24(%rsi),%r12 |
399976c7 | 1847 | .cfi_restore %r12 |
384e6de4 | 1848 | mov -16(%rsi),%rbp |
399976c7 | 1849 | .cfi_restore %rbp |
384e6de4 | 1850 | mov -8(%rsi),%rbx |
399976c7 | 1851 | .cfi_restore %rbx |
384e6de4 | 1852 | lea (%rsi),%rsp |
399976c7 | 1853 | .cfi_def_cfa_register %rsp |
a8f3b8b5 AP |
1854 | .Lepilogue_avx: |
1855 | ret | |
399976c7 | 1856 | .cfi_endproc |
a8f3b8b5 AP |
1857 | .size ${func}_avx,.-${func}_avx |
1858 | ___ | |
c4558efb AP |
1859 | |
1860 | if ($avx>1) {{ | |
1861 | ###################################################################### | |
1862 | # AVX2+BMI code path | |
1863 | # | |
609b0852 | 1864 | my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp |
c4558efb AP |
1865 | my $PUSH8=8*2*$SZ; |
1866 | use integer; | |
1867 | ||
1868 | sub bodyx_00_15 () { | |
1869 | # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f | |
1870 | ( | |
1871 | '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. | |
1872 | ||
1873 | '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] | |
1874 | '&and ($a4,$e)', # f&e | |
1875 | '&rorx ($a0,$e,$Sigma1[2])', | |
1876 | '&rorx ($a2,$e,$Sigma1[1])', | |
1877 | ||
1878 | '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past | |
1879 | '&lea ($h,"($h,$a4)")', | |
1880 | '&andn ($a4,$e,$g)', # ~e&g | |
1881 | '&xor ($a0,$a2)', | |
1882 | ||
1883 | '&rorx ($a1,$e,$Sigma1[0])', | |
1884 | '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) | |
1885 | '&xor ($a0,$a1)', # Sigma1(e) | |
1886 | '&mov ($a2,$a)', | |
1887 | ||
1888 | '&rorx ($a4,$a,$Sigma0[2])', | |
1889 | '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) | |
1890 | '&xor ($a2,$b)', # a^b, b^c in next round | |
1891 | '&rorx ($a1,$a,$Sigma0[1])', | |
1892 | ||
1893 | '&rorx ($a0,$a,$Sigma0[0])', | |
1894 | '&lea ($d,"($d,$h)")', # d+=h | |
1895 | '&and ($a3,$a2)', # (b^c)&(a^b) | |
1896 | '&xor ($a1,$a4)', | |
1897 | ||
1898 | '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) | |
1899 | '&xor ($a1,$a0)', # Sigma0(a) | |
1900 | '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) | |
1901 | '&mov ($a4,$e)', # copy of f in future | |
1902 | ||
1903 | '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' | |
1904 | ); | |
1905 | # and at the finish one has to $a+=$a1 | |
1906 | } | |
1907 | ||
1908 | $code.=<<___; | |
1909 | .type ${func}_avx2,\@function,3 | |
1910 | .align 64 | |
1911 | ${func}_avx2: | |
399976c7 | 1912 | .cfi_startproc |
c4558efb | 1913 | .Lavx2_shortcut: |
384e6de4 | 1914 | mov %rsp,%rax # copy %rsp |
399976c7 | 1915 | .cfi_def_cfa_register %rax |
c4558efb | 1916 | push %rbx |
399976c7 | 1917 | .cfi_push %rbx |
c4558efb | 1918 | push %rbp |
399976c7 | 1919 | .cfi_push %rbp |
c4558efb | 1920 | push %r12 |
399976c7 | 1921 | .cfi_push %r12 |
c4558efb | 1922 | push %r13 |
399976c7 | 1923 | .cfi_push %r13 |
c4558efb | 1924 | push %r14 |
399976c7 | 1925 | .cfi_push %r14 |
c4558efb | 1926 | push %r15 |
399976c7 | 1927 | .cfi_push %r15 |
c4558efb AP |
1928 | sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp |
1929 | shl \$4,%rdx # num*16 | |
1930 | and \$-256*$SZ,%rsp # align stack frame | |
1931 | lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |
1932 | add \$`2*$SZ*($rounds-8)`,%rsp | |
1933 | mov $ctx,$_ctx # save ctx, 1st arg | |
1934 | mov $inp,$_inp # save inp, 2nd arh | |
1935 | mov %rdx,$_end # save end pointer, "3rd" arg | |
384e6de4 | 1936 | mov %rax,$_rsp # save copy of %rsp |
399976c7 | 1937 | .cfi_cfa_expression $_rsp,deref,+8 |
c4558efb AP |
1938 | ___ |
1939 | $code.=<<___ if ($win64); | |
1940 | movaps %xmm6,16*$SZ+32(%rsp) | |
1941 | movaps %xmm7,16*$SZ+48(%rsp) | |
1942 | movaps %xmm8,16*$SZ+64(%rsp) | |
1943 | movaps %xmm9,16*$SZ+80(%rsp) | |
1944 | ___ | |
1945 | $code.=<<___ if ($win64 && $SZ>4); | |
1946 | movaps %xmm10,16*$SZ+96(%rsp) | |
1947 | movaps %xmm11,16*$SZ+112(%rsp) | |
1948 | ___ | |
1949 | $code.=<<___; | |
1950 | .Lprologue_avx2: | |
1951 | ||
00678437 | 1952 | vzeroupper |
c4558efb AP |
1953 | sub \$-16*$SZ,$inp # inp++, size optimization |
1954 | mov $SZ*0($ctx),$A | |
504bbcf3 | 1955 | mov $inp,%r12 # borrow $T1 |
c4558efb AP |
1956 | mov $SZ*1($ctx),$B |
1957 | cmp %rdx,$inp # $_end | |
1958 | mov $SZ*2($ctx),$C | |
504bbcf3 | 1959 | cmove %rsp,%r12 # next block or random data |
c4558efb AP |
1960 | mov $SZ*3($ctx),$D |
1961 | mov $SZ*4($ctx),$E | |
1962 | mov $SZ*5($ctx),$F | |
1963 | mov $SZ*6($ctx),$G | |
1964 | mov $SZ*7($ctx),$H | |
1965 | ___ | |
1966 | if ($SZ==4) { # SHA256 | |
1967 | my @X = map("%ymm$_",(0..3)); | |
1968 | my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); | |
1969 | ||
1970 | $code.=<<___; | |
1971 | vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 | |
1972 | vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 | |
1973 | jmp .Loop_avx2 | |
1974 | .align 16 | |
1975 | .Loop_avx2: | |
c4558efb | 1976 | vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
504bbcf3 AP |
1977 | vmovdqu -16*$SZ+0($inp),%xmm0 |
1978 | vmovdqu -16*$SZ+16($inp),%xmm1 | |
1979 | vmovdqu -16*$SZ+32($inp),%xmm2 | |
1980 | vmovdqu -16*$SZ+48($inp),%xmm3 | |
c4558efb | 1981 | #mov $inp,$_inp # offload $inp |
504bbcf3 AP |
1982 | vinserti128 \$1,(%r12),@X[0],@X[0] |
1983 | vinserti128 \$1,16(%r12),@X[1],@X[1] | |
1984 | vpshufb $t3,@X[0],@X[0] | |
1985 | vinserti128 \$1,32(%r12),@X[2],@X[2] | |
1986 | vpshufb $t3,@X[1],@X[1] | |
1987 | vinserti128 \$1,48(%r12),@X[3],@X[3] | |
c4558efb AP |
1988 | |
1989 | lea $TABLE(%rip),$Tbl | |
c4558efb AP |
1990 | vpshufb $t3,@X[2],@X[2] |
1991 | vpaddd 0x00($Tbl),@X[0],$t0 | |
1992 | vpshufb $t3,@X[3],@X[3] | |
1993 | vpaddd 0x20($Tbl),@X[1],$t1 | |
1994 | vpaddd 0x40($Tbl),@X[2],$t2 | |
1995 | vpaddd 0x60($Tbl),@X[3],$t3 | |
1996 | vmovdqa $t0,0x00(%rsp) | |
1997 | xor $a1,$a1 | |
1998 | vmovdqa $t1,0x20(%rsp) | |
9ce91035 BE |
1999 | ___ |
2000 | $code.=<<___ if (!$win64); | |
2001 | # temporarily use %rdi as frame pointer | |
2002 | mov $_rsp,%rdi | |
2003 | .cfi_def_cfa %rdi,8 | |
2004 | ___ | |
2005 | $code.=<<___; | |
c4558efb | 2006 | lea -$PUSH8(%rsp),%rsp |
9ce91035 BE |
2007 | ___ |
2008 | $code.=<<___ if (!$win64); | |
2009 | # the frame info is at $_rsp, but the stack is moving... | |
2010 | # so a second frame pointer is saved at -8(%rsp) | |
2011 | # that is in the red zone | |
2012 | mov %rdi,-8(%rsp) | |
2013 | .cfi_cfa_expression %rsp-8,deref,+8 | |
2014 | ___ | |
2015 | $code.=<<___; | |
c4558efb AP |
2016 | mov $B,$a3 |
2017 | vmovdqa $t2,0x00(%rsp) | |
2018 | xor $C,$a3 # magic | |
2019 | vmovdqa $t3,0x20(%rsp) | |
2020 | mov $F,$a4 | |
2021 | sub \$-16*2*$SZ,$Tbl # size optimization | |
2022 | jmp .Lavx2_00_47 | |
2023 | ||
2024 | .align 16 | |
2025 | .Lavx2_00_47: | |
2026 | ___ | |
2027 | ||
2028 | sub AVX2_256_00_47 () { | |
2029 | my $j = shift; | |
2030 | my $body = shift; | |
2031 | my @X = @_; | |
2032 | my @insns = (&$body,&$body,&$body,&$body); # 96 instructions | |
2033 | my $base = "+2*$PUSH8(%rsp)"; | |
2034 | ||
9ce91035 BE |
2035 | if (($j%2)==0) { |
2036 | &lea ("%rsp","-$PUSH8(%rsp)"); | |
2037 | $code.=<<___ if (!$win64); | |
2038 | .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 | |
2039 | # copy secondary frame pointer to new location again at -8(%rsp) | |
2040 | pushq $PUSH8-8(%rsp) | |
2041 | .cfi_cfa_expression %rsp,deref,+8 | |
2042 | lea 8(%rsp),%rsp | |
2043 | .cfi_cfa_expression %rsp-8,deref,+8 | |
2044 | ___ | |
2045 | } | |
2046 | ||
c4558efb AP |
2047 | foreach (Xupdate_256_AVX()) { # 29 instructions |
2048 | eval; | |
2049 | eval(shift(@insns)); | |
2050 | eval(shift(@insns)); | |
2051 | eval(shift(@insns)); | |
2052 | } | |
2053 | &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); | |
2054 | foreach (@insns) { eval; } # remaining instructions | |
2055 | &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); | |
2056 | } | |
2057 | ||
2058 | for ($i=0,$j=0; $j<4; $j++) { | |
2059 | &AVX2_256_00_47($j,\&bodyx_00_15,@X); | |
2060 | push(@X,shift(@X)); # rotate(@X) | |
2061 | } | |
2062 | &lea ($Tbl,16*2*$SZ."($Tbl)"); | |
2063 | &cmpb (($SZ-1)."($Tbl)",0); | |
2064 | &jne (".Lavx2_00_47"); | |
2065 | ||
2066 | for ($i=0; $i<16; ) { | |
2067 | my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; | |
2068 | foreach(bodyx_00_15()) { eval; } | |
2069 | } | |
2070 | } else { # SHA512 | |
2071 | my @X = map("%ymm$_",(0..7)); | |
2072 | my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); | |
2073 | ||
2074 | $code.=<<___; | |
2075 | jmp .Loop_avx2 | |
2076 | .align 16 | |
2077 | .Loop_avx2: | |
504bbcf3 AP |
2078 | vmovdqu -16*$SZ($inp),%xmm0 |
2079 | vmovdqu -16*$SZ+16($inp),%xmm1 | |
2080 | vmovdqu -16*$SZ+32($inp),%xmm2 | |
c4558efb | 2081 | lea $TABLE+0x80(%rip),$Tbl # size optimization |
504bbcf3 AP |
2082 | vmovdqu -16*$SZ+48($inp),%xmm3 |
2083 | vmovdqu -16*$SZ+64($inp),%xmm4 | |
2084 | vmovdqu -16*$SZ+80($inp),%xmm5 | |
2085 | vmovdqu -16*$SZ+96($inp),%xmm6 | |
2086 | vmovdqu -16*$SZ+112($inp),%xmm7 | |
2087 | #mov $inp,$_inp # offload $inp | |
2088 | vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 | |
2089 | vinserti128 \$1,(%r12),@X[0],@X[0] | |
2090 | vinserti128 \$1,16(%r12),@X[1],@X[1] | |
2091 | vpshufb $t2,@X[0],@X[0] | |
2092 | vinserti128 \$1,32(%r12),@X[2],@X[2] | |
2093 | vpshufb $t2,@X[1],@X[1] | |
2094 | vinserti128 \$1,48(%r12),@X[3],@X[3] | |
2095 | vpshufb $t2,@X[2],@X[2] | |
2096 | vinserti128 \$1,64(%r12),@X[4],@X[4] | |
2097 | vpshufb $t2,@X[3],@X[3] | |
2098 | vinserti128 \$1,80(%r12),@X[5],@X[5] | |
2099 | vpshufb $t2,@X[4],@X[4] | |
2100 | vinserti128 \$1,96(%r12),@X[6],@X[6] | |
2101 | vpshufb $t2,@X[5],@X[5] | |
2102 | vinserti128 \$1,112(%r12),@X[7],@X[7] | |
2103 | ||
c4558efb AP |
2104 | vpaddq -0x80($Tbl),@X[0],$t0 |
2105 | vpshufb $t2,@X[6],@X[6] | |
2106 | vpaddq -0x60($Tbl),@X[1],$t1 | |
2107 | vpshufb $t2,@X[7],@X[7] | |
2108 | vpaddq -0x40($Tbl),@X[2],$t2 | |
2109 | vpaddq -0x20($Tbl),@X[3],$t3 | |
2110 | vmovdqa $t0,0x00(%rsp) | |
2111 | vpaddq 0x00($Tbl),@X[4],$t0 | |
2112 | vmovdqa $t1,0x20(%rsp) | |
2113 | vpaddq 0x20($Tbl),@X[5],$t1 | |
2114 | vmovdqa $t2,0x40(%rsp) | |
2115 | vpaddq 0x40($Tbl),@X[6],$t2 | |
2116 | vmovdqa $t3,0x60(%rsp) | |
9ce91035 BE |
2117 | ___ |
2118 | $code.=<<___ if (!$win64); | |
2119 | # temporarily use %rdi as frame pointer | |
2120 | mov $_rsp,%rdi | |
2121 | .cfi_def_cfa %rdi,8 | |
2122 | ___ | |
2123 | $code.=<<___; | |
c4558efb | 2124 | lea -$PUSH8(%rsp),%rsp |
9ce91035 BE |
2125 | ___ |
2126 | $code.=<<___ if (!$win64); | |
2127 | # the frame info is at $_rsp, but the stack is moving... | |
2128 | # so a second frame pointer is saved at -8(%rsp) | |
2129 | # that is in the red zone | |
2130 | mov %rdi,-8(%rsp) | |
2131 | .cfi_cfa_expression %rsp-8,deref,+8 | |
2132 | ___ | |
2133 | $code.=<<___; | |
c4558efb AP |
2134 | vpaddq 0x60($Tbl),@X[7],$t3 |
2135 | vmovdqa $t0,0x00(%rsp) | |
2136 | xor $a1,$a1 | |
2137 | vmovdqa $t1,0x20(%rsp) | |
2138 | mov $B,$a3 | |
2139 | vmovdqa $t2,0x40(%rsp) | |
2140 | xor $C,$a3 # magic | |
2141 | vmovdqa $t3,0x60(%rsp) | |
2142 | mov $F,$a4 | |
2143 | add \$16*2*$SZ,$Tbl | |
2144 | jmp .Lavx2_00_47 | |
2145 | ||
2146 | .align 16 | |
2147 | .Lavx2_00_47: | |
2148 | ___ | |
2149 | ||
2150 | sub AVX2_512_00_47 () { | |
2151 | my $j = shift; | |
2152 | my $body = shift; | |
2153 | my @X = @_; | |
2154 | my @insns = (&$body,&$body); # 48 instructions | |
2155 | my $base = "+2*$PUSH8(%rsp)"; | |
2156 | ||
9ce91035 BE |
2157 | if (($j%4)==0) { |
2158 | &lea ("%rsp","-$PUSH8(%rsp)"); | |
2159 | $code.=<<___ if (!$win64); | |
2160 | .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 | |
2161 | # copy secondary frame pointer to new location again at -8(%rsp) | |
2162 | pushq $PUSH8-8(%rsp) | |
2163 | .cfi_cfa_expression %rsp,deref,+8 | |
2164 | lea 8(%rsp),%rsp | |
2165 | .cfi_cfa_expression %rsp-8,deref,+8 | |
2166 | ___ | |
2167 | } | |
2168 | ||
c4558efb AP |
2169 | foreach (Xupdate_512_AVX()) { # 23 instructions |
2170 | eval; | |
2171 | if ($_ !~ /\;$/) { | |
2172 | eval(shift(@insns)); | |
2173 | eval(shift(@insns)); | |
2174 | eval(shift(@insns)); | |
2175 | } | |
2176 | } | |
2177 | &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); | |
2178 | foreach (@insns) { eval; } # remaining instructions | |
2179 | &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); | |
2180 | } | |
2181 | ||
2182 | for ($i=0,$j=0; $j<8; $j++) { | |
2183 | &AVX2_512_00_47($j,\&bodyx_00_15,@X); | |
2184 | push(@X,shift(@X)); # rotate(@X) | |
2185 | } | |
2186 | &lea ($Tbl,16*2*$SZ."($Tbl)"); | |
2187 | &cmpb (($SZ-1-0x80)."($Tbl)",0); | |
2188 | &jne (".Lavx2_00_47"); | |
2189 | ||
2190 | for ($i=0; $i<16; ) { | |
2191 | my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; | |
2192 | foreach(bodyx_00_15()) { eval; } | |
2193 | } | |
2194 | } | |
2195 | $code.=<<___; | |
2196 | mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx | |
2197 | add $a1,$A | |
2198 | #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp | |
2199 | lea `2*$SZ*($rounds-8)`(%rsp),$Tbl | |
2200 | ||
2201 | add $SZ*0($ctx),$A | |
2202 | add $SZ*1($ctx),$B | |
2203 | add $SZ*2($ctx),$C | |
2204 | add $SZ*3($ctx),$D | |
2205 | add $SZ*4($ctx),$E | |
2206 | add $SZ*5($ctx),$F | |
2207 | add $SZ*6($ctx),$G | |
2208 | add $SZ*7($ctx),$H | |
2209 | ||
2210 | mov $A,$SZ*0($ctx) | |
2211 | mov $B,$SZ*1($ctx) | |
2212 | mov $C,$SZ*2($ctx) | |
2213 | mov $D,$SZ*3($ctx) | |
2214 | mov $E,$SZ*4($ctx) | |
2215 | mov $F,$SZ*5($ctx) | |
2216 | mov $G,$SZ*6($ctx) | |
2217 | mov $H,$SZ*7($ctx) | |
2218 | ||
2219 | cmp `$PUSH8+2*8`($Tbl),$inp # $_end | |
2220 | je .Ldone_avx2 | |
2221 | ||
2222 | xor $a1,$a1 | |
2223 | mov $B,$a3 | |
2224 | xor $C,$a3 # magic | |
2225 | mov $F,$a4 | |
2226 | jmp .Lower_avx2 | |
2227 | .align 16 | |
2228 | .Lower_avx2: | |
2229 | ___ | |
2230 | for ($i=0; $i<8; ) { | |
2231 | my $base="+16($Tbl)"; | |
2232 | foreach(bodyx_00_15()) { eval; } | |
2233 | } | |
2234 | $code.=<<___; | |
2235 | lea -$PUSH8($Tbl),$Tbl | |
2236 | cmp %rsp,$Tbl | |
2237 | jae .Lower_avx2 | |
2238 | ||
2239 | mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx | |
2240 | add $a1,$A | |
2241 | #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp | |
2242 | lea `2*$SZ*($rounds-8)`(%rsp),%rsp | |
9ce91035 BE |
2243 | # restore frame pointer to original location at $_rsp |
2244 | .cfi_cfa_expression $_rsp,deref,+8 | |
c4558efb AP |
2245 | |
2246 | add $SZ*0($ctx),$A | |
2247 | add $SZ*1($ctx),$B | |
2248 | add $SZ*2($ctx),$C | |
2249 | add $SZ*3($ctx),$D | |
2250 | add $SZ*4($ctx),$E | |
2251 | add $SZ*5($ctx),$F | |
2252 | lea `2*16*$SZ`($inp),$inp # inp+=2 | |
2253 | add $SZ*6($ctx),$G | |
504bbcf3 | 2254 | mov $inp,%r12 |
c4558efb AP |
2255 | add $SZ*7($ctx),$H |
2256 | cmp $_end,$inp | |
2257 | ||
2258 | mov $A,$SZ*0($ctx) | |
504bbcf3 | 2259 | cmove %rsp,%r12 # next block or stale data |
c4558efb AP |
2260 | mov $B,$SZ*1($ctx) |
2261 | mov $C,$SZ*2($ctx) | |
2262 | mov $D,$SZ*3($ctx) | |
2263 | mov $E,$SZ*4($ctx) | |
2264 | mov $F,$SZ*5($ctx) | |
2265 | mov $G,$SZ*6($ctx) | |
2266 | mov $H,$SZ*7($ctx) | |
2267 | ||
c4558efb AP |
2268 | jbe .Loop_avx2 |
2269 | lea (%rsp),$Tbl | |
9ce91035 BE |
2270 | # temporarily use $Tbl as index to $_rsp |
2271 | # this avoids the need to save a secondary frame pointer at -8(%rsp) | |
2272 | .cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8 | |
c4558efb AP |
2273 | |
2274 | .Ldone_avx2: | |
9ce91035 | 2275 | mov `16*$SZ+3*8`($Tbl),%rsi |
399976c7 | 2276 | .cfi_def_cfa %rsi,8 |
00678437 | 2277 | vzeroupper |
c4558efb AP |
2278 | ___ |
2279 | $code.=<<___ if ($win64); | |
9ce91035 BE |
2280 | movaps 16*$SZ+32($Tbl),%xmm6 |
2281 | movaps 16*$SZ+48($Tbl),%xmm7 | |
2282 | movaps 16*$SZ+64($Tbl),%xmm8 | |
2283 | movaps 16*$SZ+80($Tbl),%xmm9 | |
c4558efb AP |
2284 | ___ |
2285 | $code.=<<___ if ($win64 && $SZ>4); | |
9ce91035 BE |
2286 | movaps 16*$SZ+96($Tbl),%xmm10 |
2287 | movaps 16*$SZ+112($Tbl),%xmm11 | |
c4558efb AP |
2288 | ___ |
2289 | $code.=<<___; | |
384e6de4 | 2290 | mov -48(%rsi),%r15 |
399976c7 | 2291 | .cfi_restore %r15 |
384e6de4 | 2292 | mov -40(%rsi),%r14 |
399976c7 | 2293 | .cfi_restore %r14 |
384e6de4 | 2294 | mov -32(%rsi),%r13 |
399976c7 | 2295 | .cfi_restore %r13 |
384e6de4 | 2296 | mov -24(%rsi),%r12 |
399976c7 | 2297 | .cfi_restore %r12 |
384e6de4 | 2298 | mov -16(%rsi),%rbp |
399976c7 | 2299 | .cfi_restore %rbp |
384e6de4 | 2300 | mov -8(%rsi),%rbx |
399976c7 | 2301 | .cfi_restore %rbx |
384e6de4 | 2302 | lea (%rsi),%rsp |
399976c7 | 2303 | .cfi_def_cfa_register %rsp |
c4558efb AP |
2304 | .Lepilogue_avx2: |
2305 | ret | |
399976c7 | 2306 | .cfi_endproc |
c4558efb AP |
2307 | .size ${func}_avx2,.-${func}_avx2 |
2308 | ___ | |
2309 | }} | |
a8f3b8b5 AP |
2310 | }}}}} |
2311 | ||
be01f79d AP |
2312 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
2313 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
2314 | if ($win64) { | |
2315 | $rec="%rcx"; | |
2316 | $frame="%rdx"; | |
2317 | $context="%r8"; | |
2318 | $disp="%r9"; | |
2319 | ||
2320 | $code.=<<___; | |
2321 | .extern __imp_RtlVirtualUnwind | |
2322 | .type se_handler,\@abi-omnipotent | |
2323 | .align 16 | |
2324 | se_handler: | |
2325 | push %rsi | |
2326 | push %rdi | |
2327 | push %rbx | |
2328 | push %rbp | |
2329 | push %r12 | |
2330 | push %r13 | |
2331 | push %r14 | |
2332 | push %r15 | |
2333 | pushfq | |
2334 | sub \$64,%rsp | |
2335 | ||
2336 | mov 120($context),%rax # pull context->Rax | |
2337 | mov 248($context),%rbx # pull context->Rip | |
2338 | ||
a8f3b8b5 AP |
2339 | mov 8($disp),%rsi # disp->ImageBase |
2340 | mov 56($disp),%r11 # disp->HanderlData | |
2341 | ||
2342 | mov 0(%r11),%r10d # HandlerData[0] | |
2343 | lea (%rsi,%r10),%r10 # prologue label | |
2344 | cmp %r10,%rbx # context->Rip<prologue label | |
be01f79d AP |
2345 | jb .Lin_prologue |
2346 | ||
2347 | mov 152($context),%rax # pull context->Rsp | |
2348 | ||
a8f3b8b5 AP |
2349 | mov 4(%r11),%r10d # HandlerData[1] |
2350 | lea (%rsi,%r10),%r10 # epilogue label | |
2351 | cmp %r10,%rbx # context->Rip>=epilogue label | |
be01f79d | 2352 | jae .Lin_prologue |
c4558efb AP |
2353 | ___ |
2354 | $code.=<<___ if ($avx>1); | |
2355 | lea .Lavx2_shortcut(%rip),%r10 | |
2356 | cmp %r10,%rbx # context->Rip<avx2_shortcut | |
2357 | jb .Lnot_in_avx2 | |
2358 | ||
2359 | and \$-256*$SZ,%rax | |
2360 | add \$`2*$SZ*($rounds-8)`,%rax | |
2361 | .Lnot_in_avx2: | |
2362 | ___ | |
2363 | $code.=<<___; | |
a8f3b8b5 | 2364 | mov %rax,%rsi # put aside Rsp |
be01f79d | 2365 | mov 16*$SZ+3*8(%rax),%rax # pull $_rsp |
be01f79d AP |
2366 | |
2367 | mov -8(%rax),%rbx | |
2368 | mov -16(%rax),%rbp | |
2369 | mov -24(%rax),%r12 | |
2370 | mov -32(%rax),%r13 | |
2371 | mov -40(%rax),%r14 | |
2372 | mov -48(%rax),%r15 | |
2373 | mov %rbx,144($context) # restore context->Rbx | |
2374 | mov %rbp,160($context) # restore context->Rbp | |
2375 | mov %r12,216($context) # restore context->R12 | |
2376 | mov %r13,224($context) # restore context->R13 | |
2377 | mov %r14,232($context) # restore context->R14 | |
2378 | mov %r15,240($context) # restore context->R15 | |
2379 | ||
a8f3b8b5 AP |
2380 | lea .Lepilogue(%rip),%r10 |
2381 | cmp %r10,%rbx | |
2382 | jb .Lin_prologue # non-AVX code | |
2383 | ||
2384 | lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area | |
2385 | lea 512($context),%rdi # &context.Xmm6 | |
2386 | mov \$`$SZ==4?8:12`,%ecx | |
2387 | .long 0xa548f3fc # cld; rep movsq | |
2388 | ||
be01f79d AP |
2389 | .Lin_prologue: |
2390 | mov 8(%rax),%rdi | |
2391 | mov 16(%rax),%rsi | |
2392 | mov %rax,152($context) # restore context->Rsp | |
2393 | mov %rsi,168($context) # restore context->Rsi | |
2394 | mov %rdi,176($context) # restore context->Rdi | |
2395 | ||
2396 | mov 40($disp),%rdi # disp->ContextRecord | |
2397 | mov $context,%rsi # context | |
2398 | mov \$154,%ecx # sizeof(CONTEXT) | |
2399 | .long 0xa548f3fc # cld; rep movsq | |
2400 | ||
2401 | mov $disp,%rsi | |
2402 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
2403 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
2404 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
2405 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
2406 | mov 40(%rsi),%r10 # disp->ContextRecord | |
2407 | lea 56(%rsi),%r11 # &disp->HandlerData | |
2408 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
2409 | mov %r10,32(%rsp) # arg5 | |
2410 | mov %r11,40(%rsp) # arg6 | |
2411 | mov %r12,48(%rsp) # arg7 | |
2412 | mov %rcx,56(%rsp) # arg8, (NULL) | |
2413 | call *__imp_RtlVirtualUnwind(%rip) | |
2414 | ||
2415 | mov \$1,%eax # ExceptionContinueSearch | |
2416 | add \$64,%rsp | |
2417 | popfq | |
2418 | pop %r15 | |
2419 | pop %r14 | |
2420 | pop %r13 | |
2421 | pop %r12 | |
2422 | pop %rbp | |
2423 | pop %rbx | |
2424 | pop %rdi | |
2425 | pop %rsi | |
2426 | ret | |
2427 | .size se_handler,.-se_handler | |
29be3f64 | 2428 | ___ |
be01f79d | 2429 | |
29be3f64 | 2430 | $code.=<<___ if ($SZ==4 && $shaext); |
619b9466 AP |
2431 | .type shaext_handler,\@abi-omnipotent |
2432 | .align 16 | |
2433 | shaext_handler: | |
2434 | push %rsi | |
2435 | push %rdi | |
2436 | push %rbx | |
2437 | push %rbp | |
2438 | push %r12 | |
2439 | push %r13 | |
2440 | push %r14 | |
2441 | push %r15 | |
2442 | pushfq | |
2443 | sub \$64,%rsp | |
2444 | ||
2445 | mov 120($context),%rax # pull context->Rax | |
2446 | mov 248($context),%rbx # pull context->Rip | |
2447 | ||
2448 | lea .Lprologue_shaext(%rip),%r10 | |
2449 | cmp %r10,%rbx # context->Rip<.Lprologue | |
2450 | jb .Lin_prologue | |
2451 | ||
2452 | lea .Lepilogue_shaext(%rip),%r10 | |
2453 | cmp %r10,%rbx # context->Rip>=.Lepilogue | |
2454 | jae .Lin_prologue | |
2455 | ||
2456 | lea -8-5*16(%rax),%rsi | |
2457 | lea 512($context),%rdi # &context.Xmm6 | |
2458 | mov \$10,%ecx | |
2459 | .long 0xa548f3fc # cld; rep movsq | |
2460 | ||
2461 | jmp .Lin_prologue | |
2462 | .size shaext_handler,.-shaext_handler | |
29be3f64 | 2463 | ___ |
619b9466 | 2464 | |
29be3f64 | 2465 | $code.=<<___; |
be01f79d AP |
2466 | .section .pdata |
2467 | .align 4 | |
2468 | .rva .LSEH_begin_$func | |
2469 | .rva .LSEH_end_$func | |
2470 | .rva .LSEH_info_$func | |
a8f3b8b5 | 2471 | ___ |
7eb9680a | 2472 | $code.=<<___ if ($SZ==4 && $shaext); |
619b9466 AP |
2473 | .rva .LSEH_begin_${func}_shaext |
2474 | .rva .LSEH_end_${func}_shaext | |
2475 | .rva .LSEH_info_${func}_shaext | |
977f32e8 AP |
2476 | ___ |
2477 | $code.=<<___ if ($SZ==4); | |
a8f3b8b5 AP |
2478 | .rva .LSEH_begin_${func}_ssse3 |
2479 | .rva .LSEH_end_${func}_ssse3 | |
2480 | .rva .LSEH_info_${func}_ssse3 | |
2481 | ___ | |
2482 | $code.=<<___ if ($avx && $SZ==8); | |
2483 | .rva .LSEH_begin_${func}_xop | |
2484 | .rva .LSEH_end_${func}_xop | |
2485 | .rva .LSEH_info_${func}_xop | |
2486 | ___ | |
2487 | $code.=<<___ if ($avx); | |
2488 | .rva .LSEH_begin_${func}_avx | |
2489 | .rva .LSEH_end_${func}_avx | |
faee82c1 | 2490 | .rva .LSEH_info_${func}_avx |
a8f3b8b5 | 2491 | ___ |
c4558efb AP |
2492 | $code.=<<___ if ($avx>1); |
2493 | .rva .LSEH_begin_${func}_avx2 | |
2494 | .rva .LSEH_end_${func}_avx2 | |
2495 | .rva .LSEH_info_${func}_avx2 | |
2496 | ___ | |
a8f3b8b5 | 2497 | $code.=<<___; |
be01f79d AP |
2498 | .section .xdata |
2499 | .align 8 | |
2500 | .LSEH_info_$func: | |
2501 | .byte 9,0,0,0 | |
2502 | .rva se_handler | |
a8f3b8b5 AP |
2503 | .rva .Lprologue,.Lepilogue # HandlerData[] |
2504 | ___ | |
07b635cc | 2505 | $code.=<<___ if ($SZ==4 && $shaext); |
619b9466 AP |
2506 | .LSEH_info_${func}_shaext: |
2507 | .byte 9,0,0,0 | |
2508 | .rva shaext_handler | |
07b635cc AP |
2509 | ___ |
2510 | $code.=<<___ if ($SZ==4); | |
a8f3b8b5 AP |
2511 | .LSEH_info_${func}_ssse3: |
2512 | .byte 9,0,0,0 | |
2513 | .rva se_handler | |
2514 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | |
2515 | ___ | |
2516 | $code.=<<___ if ($avx && $SZ==8); | |
2517 | .LSEH_info_${func}_xop: | |
2518 | .byte 9,0,0,0 | |
2519 | .rva se_handler | |
2520 | .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] | |
2521 | ___ | |
2522 | $code.=<<___ if ($avx); | |
2523 | .LSEH_info_${func}_avx: | |
2524 | .byte 9,0,0,0 | |
2525 | .rva se_handler | |
2526 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | |
be01f79d | 2527 | ___ |
c4558efb AP |
2528 | $code.=<<___ if ($avx>1); |
2529 | .LSEH_info_${func}_avx2: | |
2530 | .byte 9,0,0,0 | |
2531 | .rva se_handler | |
2532 | .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] | |
2533 | ___ | |
be01f79d AP |
2534 | } |
2535 | ||
619b9466 AP |
2536 | sub sha256op38 { |
2537 | my $instr = shift; | |
2538 | my %opcodelet = ( | |
2539 | "sha256rnds2" => 0xcb, | |
2540 | "sha256msg1" => 0xcc, | |
2541 | "sha256msg2" => 0xcd ); | |
2542 | ||
2543 | if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { | |
2544 | my @opcode=(0x0f,0x38); | |
2545 | push @opcode,$opcodelet{$instr}; | |
2546 | push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M | |
2547 | return ".byte\t".join(',',@opcode); | |
2548 | } else { | |
2549 | return $instr."\t".@_[0]; | |
2550 | } | |
2551 | } | |
2552 | ||
2553 | foreach (split("\n",$code)) { | |
2554 | s/\`([^\`]*)\`/eval $1/geo; | |
2555 | ||
2556 | s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; | |
2557 | ||
2558 | print $_,"\n"; | |
2559 | } | |
a21314db | 2560 | close STDOUT or die "error closing STDOUT: $!"; |