]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
a598ed0d | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
b7838586 AP |
9 | |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # Multi-buffer SHA1 procedure processes n buffers in parallel by | |
18 | # placing buffer data to designated lane of SIMD register. n is | |
19 | # naturally limited to 4 on pre-AVX2 processors and to 8 on | |
20 | # AVX2-capable processors such as Haswell. | |
21 | # | |
61ba602a | 22 | # this +aesni(i) sha1 aesni-sha1 gain(iv) |
b7838586 | 23 | # ------------------------------------------------------------------- |
3847d15d | 24 | # Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% |
619b9466 | 25 | # Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51% |
b7838586 | 26 | # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% |
3847d15d | 27 | # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% |
61ba602a | 28 | # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% |
b7f5503f | 29 | # Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% |
3847d15d | 30 | # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% |
b7838586 | 31 | # |
61ba602a AP |
32 | # (i) multi-block CBC encrypt with 128-bit key; |
33 | # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, | |
b7838586 | 34 | # because of lower AES-NI instruction throughput; |
61ba602a | 35 | # (iii) "this" is for n=8, when we gather twice as much data, result |
3847d15d AP |
36 | # for n=4 is 8.00+4.44=12.4; |
37 | # (iv) presented improvement coefficients are asymptotic limits and | |
38 | # in real-life application are somewhat lower, e.g. for 2KB | |
39 | # fragments they range from 30% to 100% (on Haswell); | |
b7838586 | 40 | |
1aa89a7a RL |
# Command line is "[flavour] [output]".
# The trailing argument becomes $output when it ends in a file extension;
# the leading argument becomes $flavour when it contains no dot at all.
if (@ARGV && $ARGV[-1] =~ m|\.\w+$|) {
    $output = pop(@ARGV);
} else {
    $output = undef;
}
if (@ARGV && $ARGV[0] !~ m|\.|) {
    $flavour = shift(@ARGV);
} else {
    $flavour = undef;
}
b7838586 AP |
45 | |
# Win64 calling convention is selected by an nasm/masm/mingw64 flavour or
# by an output file named *.asm.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Directory part of this script's path (including the trailing separator).
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;

# Look for the translator next to this script first, then in perlasm.
{
    my $located = 0;
    foreach my $candidate ("${dir}x86_64-xlate.pl",
			   "${dir}../../perlasm/x86_64-xlate.pl") {
	$xlate = $candidate;
	if (-f $xlate) {
	    $located = 1;
	    last;
	}
    }
    die "can't locate x86_64-xlate.pl" unless $located;
}

# The support library is searched for in the same two places.
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86_64-support.pl";

# Pointer width for the selected flavour, as reported by the support library.
$ptr_size = &pointer_size($flavour);
57 | ||
b7838586 AP |
# $avx ends up as 0, 1 or 2 depending on the version reported by the first
# assembler that answers.  Each later probe runs only while $avx is still 0,
# so an external command is never started once a level has been chosen.
$avx = 0;

# GNU assembler, reached through the compiler driver.
{
    my $gas_banner = `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`;
    if ($gas_banner =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	my $gas_version = $1;
	$avx = ($gas_version>=2.19) + ($gas_version>=2.22);
    }
}

# NASM, Win64 builds only.
if (!$avx && $win64) {
    if ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) {
	if (`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	    my $nasm_version = $1;
	    $avx = ($nasm_version>=2.09) + ($nasm_version>=2.10);
	}
    }
}

# Microsoft ml64, Win64 builds only.
if (!$avx && $win64) {
    if ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) {
	if (`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	    my $ml64_major = $1;
	    $avx = ($ml64_major>=10) + ($ml64_major>=11);
	}
    }
}

# clang / LLVM based compilers.
if (!$avx) {
    if (`$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	my $llvm_version = $2;
	$avx = ($llvm_version>=3.0) + ($llvm_version>3.0);
    }
}
78 | ||
1aa89a7a RL |
# Everything printed to STDOUT from here on is piped through the translator,
# which receives the flavour and the output file name as its arguments.
{
    my $xlate_pipe = "| \"$^X\" \"$xlate\" $flavour \"$output\"";
    open(OUT, $xlate_pipe)
	or die "can't call $xlate: $!";
}
*STDOUT = *OUT;
82 | ||
83 | # void sha1_multi_block ( | |
84 | # struct { unsigned int A[8]; | |
85 | # unsigned int B[8]; | |
86 | # unsigned int C[8]; | |
87 | # unsigned int D[8]; | |
88 | # unsigned int E[8]; } *ctx, | |
89 | # struct { void *ptr; int blocks; } inp[8], | |
90 | # int num); /* 1 or 2 */ | |
91 | # | |
# Argument and scratch general-purpose registers.
($ctx, $inp, $num) = ("%rdi", "%rsi", "%edx");	# 1st, 2nd and 3rd argument
@ptr = ("%r8", "%r9", "%r10", "%r11");		# per-lane input pointers
$Tbl = "%rbp";					# constant table base
$inp_elm_size = 2*$ptr_size;			# size of one inp[] element

{
    my @xmm = map { "%xmm$_" } (0..15);

    # Baseline SIMD allocation.
    ($A,$B,$C,$D,$E) = @xmm[0..4];
    @V = ($A,$B,$C,$D,$E);
    ($t0,$t1,$t2,$t3,$tx) = @xmm[5..9];
    @Xi = @xmm[10..14];
    $K = $xmm[15];

    if (1) {
	# Atom-specific optimization aiming to eliminate pshufb with high
	# registers [and thus get rid of 48 cycles accumulated penalty]
	@Xi = @xmm[0..4];
	($tx,$t0,$t1,$t2,$t3) = @xmm[5..9];
	($A,$B,$C,$D,$E) = @xmm[10..14];
	@V = ($A,$B,$C,$D,$E);
    }
}

# Width in bytes of one SIMD register / one message-schedule slot.
$REG_SZ = 16;
113 | ||
# Xi_off(index) -> assembler memory operand for message-schedule slot
# "index".  The schedule is a ring of 16 slots of $REG_SZ bytes each.
# Byte offsets below 256 are addressed relative to %rax with a -128 bias;
# the remainder are addressed relative to %rbx with a -256-128 bias.
sub Xi_off {
    my ($slot) = @_;
    my $bytes = ($slot % 16) * $REG_SZ;

    return "$bytes-128(%rax)" if ($bytes < 256);
    return "$bytes-256-128(%rbx)";
}
120 | ||
# BODY_00_19(i, a, b, c, d, e) appends one SSE round of the 0..19 group
# (Ch function, constant taken from $K) to $code, working on the five
# state registers given as arguments.  The @Xi window is rotated by one
# position before returning.
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;
    my $k=$i+2;

    # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
    # of 4 words you would expect to be loaded per given iteration one is
    # spilled to next iteration. In other words indices in four input
    # streams are distributed as follows:
    #
    # $i==0:	0,0,0,0,1,1,1,1,2,2,2,
    # $i==1:	2,3,3,3,
    # $i==2:	3,4,4,4,
    # ...
    # $i==13:	14,15,15,15,
    # $i==14:	15
    #
    # Then at $i==15 Xupdate is applied one iteration in advance...

    if ($i==0) {
	# Very first round: fetch word 0 of all four streams and start on
	# word $j, borrowing schedule registers as temporaries.
	$code.=<<___;
	movd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]		# borrow @Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]		# borrow @Xi[3]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]		# borrow @Xi[4]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	 movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
    }

    if ($i<14) {
	# just load input
	$code.=<<___;
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	 movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	 movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
    } elsif ($i==14) {
	# just load input; nothing further to fetch, so the load slots are
	# used for prefetch hints instead
	$code.=<<___;
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	 prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	 prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	 prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
    }

    if ($i>=13 && $i<15) {
	$code.=<<___;
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# preload "X[2]"
___
    }

    if ($i>=15) {
	# apply Xupdate
	$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	movdqa	$a,$t2
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	 pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	 movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	 psrld	\$31,$tx
	 paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
    }

    # slide the schedule window
    push(@Xi,shift(@Xi));
}
259 | ||
# BODY_20_39(i, a, b, c, d, e) appends one SSE round using the Parity
# function to $code; the caller uses it for rounds 20..39 and 60..79.
# Rounds below 79 also compute the next schedule word, and rounds below 72
# additionally store the current word back to its stack slot.  Round 79
# only finishes the state update.  The @Xi window is rotated by one
# position before returning.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    if ($i<79) {
	$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
	if ($i<72) {
	    $code.=<<___;
	movdqa	@Xi[0],`&Xi_off($i)`
___
	}
	$code.=<<___;
	paddd	@Xi[0],$e			# e+=X[i]
	 pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	 movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	 paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
    }

    if ($i==79) {
	$code.=<<___;
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
    }

    # slide the schedule window
    push(@Xi,shift(@Xi));
}
322 | ||
# BODY_40_59(i, a, b, c, d, e) appends one SSE round of the 40..59 group
# (Maj function, constant taken from $K) to $code.  Every round computes
# the next schedule word and stores the current one to its stack slot.
# The @Xi window is rotated by one position before returning.
#
# Fix: the comment on the final rotate used to interpolate "@X[1]", a slice
# of an array that does not exist in this file, so the emitted assembly
# comment lost its register name.  It now names @Xi[1], as BODY_20_39 does.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	 movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	 paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
	# slide the schedule window
	push(@Xi,shift(@Xi));
}
364 | ||
# Public entry point.  The capability word is tested first: the SHA bit
# diverts to the SHA-extension code path, and, when the assembler in use
# can encode AVX, bit 28 diverts to the AVX code path.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
if ($avx) {
    $code.=<<___;
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
}
# Prologue: remember the incoming stack pointer in %rax and save the two
# callee-saved registers the function uses.  Each push is followed by the
# unwind annotation for the register that was just pushed.
#
# Fix: the annotation after "push %rbp" used to name %rbx a second time,
# so the unwind information never recorded where %rbp was saved and gave
# %rbx the wrong slot.  It now names %rbp, matching the SHA-extension
# prologue and the ".cfi_restore %rbp" in this function's epilogue.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: spill %xmm6-%xmm15 below the two pushed registers.
if ($win64) {
    $code.=<<___;
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
}

# Carve out a 256-byte aligned frame of 18 register-sized slots: 16 for
# the message schedule, one for the lane counters (addressed via %rbx) and
# one holding the original stack pointer and the original $num.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___

# For each of the four lanes: load the input pointer and block count,
# track the largest count in $num, record the count, and point lanes with
# nothing to do at the constant table.
for ($i=0; $i<4; $i++) {
    $ptr_reg = &pointer_register($flavour, $ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num		# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)	# initialize counters
	cmovle	$Tbl,@ptr[$i]		# cancel input
___
}
# Skip everything when no lane has work; otherwise load the five state
# vectors, the byte-swap mask and the first round constant, then fall into
# the block loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___

# Unroll all 80 rounds.  Each stage runs up to (but excluding) its round
# limit; every stage after the first begins by reloading $K from the given
# offset into the constant table.  The state register list is rotated
# after every round so that each generator sees a,b,c,d,e in order.
{
    my @stages = (
	[ undef,  20, \&BODY_00_19 ],
	[ "0x00", 40, \&BODY_20_39 ],	# K_20_39
	[ "0x20", 60, \&BODY_40_59 ],	# K_40_59
	[ "0x40", 80, \&BODY_20_39 ],	# K_60_79
    );

    $i = 0;
    foreach my $stage (@stages) {
	my ($k_off, $limit, $round) = @$stage;

	$code .= "	movdqa	${k_off}($Tbl),$K\n" if (defined($k_off));
	while ($i < $limit) {
	    $round->($i, @V);
	    unshift(@V, pop(@V));
	    $i++;
	}
    }
}
# End of one block: cancel the input of lanes whose counter is about to
# run out, mask the new state with the "lane still active" mask, add the
# previous context, store it, decrement the counters and loop.  Once the
# longest lane is done, advance to the next group of lanes.
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___

# Win64 only: reload %xmm6-%xmm15 from consecutive 16-byte slots, the
# first at -0xb8(%rax) and the last at -0x28(%rax).
if ($win64) {
    foreach my $n (6..15) {
	$code .= sprintf("	movaps	-0x%x(%%rax),%%xmm%d\n",
			 0xb8-0x10*($n-6), $n);
    }
}

# Epilogue: restore the saved registers and the caller's stack pointer.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_multi_block,.-sha1_multi_block
___
619b9466 AP |
# SHA-extension code path.  Two input streams are processed side by side
# (lane 0 in the *0 registers, lane 1 in the *1 registers); $num is doubled
# on entry because a pair is consumed per outer iteration.
{{{
my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(11..14));

# Prologue, same shape as the generic entry point.
$code.=<<___;
.type	sha1_multi_block_shaext,\@function,3
.align	32
sha1_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___

# Win64 only: spill %xmm6-%xmm15.
if ($win64) {
    $code.=<<___;
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
}

# Frame setup; $ctx is biased by 0x40 so later displacements stay short.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x40($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	movdqa	K_XX_XX+0x80(%rip),$BSWAP	# byte-n-word swap

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___

# Load pointer and block count of both lanes; idle lanes are pointed at
# the stack so their loads stay harmless.
for ($i=0; $i<2; $i++) {
    $ptr_reg = &pointer_register($flavour, $ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num		# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)	# initialize counters
	cmovle	%rsp,@ptr[$i]		# cancel input
___
}

# Gather the two lanes' state out of the interleaved context, then run
# rounds 0-7 explicitly.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x40($ctx),$ABCD0	# a1.a0
	movq		0x20-0x40($ctx),@MSG0[0]# b1.b0
	movq		0x40-0x40($ctx),@MSG0[1]# c1.c0
	movq		0x60-0x40($ctx),@MSG0[2]# d1.d0
	movq		0x80-0x40($ctx),@MSG0[3]# e1.e0

	punpckldq	@MSG0[0],$ABCD0		# b1.a1.b0.a0
	punpckldq	@MSG0[2],@MSG0[1]	# d1.c1.d0.c0

	movdqa		$ABCD0,$ABCD1
	punpcklqdq	@MSG0[1],$ABCD0		# d0.c0.b0.a0
	punpckhqdq	@MSG0[1],$ABCD1		# d1.c1.b1.a1

	pshufd		\$0b00111111,@MSG0[3],$E0
	pshufd		\$0b01111111,@MSG0[3],$E1
	pshufd		\$0b00011011,$ABCD0,$ABCD0
	pshufd		\$0b00011011,$ABCD1,$ABCD1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$BSWAP,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$BSWAP,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	pshufb		$BSWAP,@MSG0[1]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]
	 pshufb		$BSWAP,@MSG1[1]

	movdqa		$E0,0x50(%rsp)		# offload
	paddd		@MSG0[0],$E0
	 movdqa		$E1,0x70(%rsp)
	 paddd		@MSG1[0],$E1
	movdqa		$ABCD0,0x40(%rsp)	# offload
	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,0x60(%rsp)
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$0,$E0,$ABCD0		# 0-3
	sha1nexte	@MSG0[1],$E0_
	 sha1rnds4	\$0,$E1,$ABCD1		# 0-3
	 sha1nexte	@MSG1[1],$E1_
	pshufb		$BSWAP,@MSG0[2]
	prefetcht0	127(@ptr[0])
	sha1msg1	@MSG0[1],@MSG0[0]
	 pshufb		$BSWAP,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha1msg1	@MSG1[1],@MSG1[0]

	pshufb		$BSWAP,@MSG0[3]
	movdqa		$ABCD0,$E0
	 pshufb		$BSWAP,@MSG1[3]
	 movdqa		$ABCD1,$E1
	sha1rnds4	\$0,$E0_,$ABCD0		# 4-7
	sha1nexte	@MSG0[2],$E0
	 sha1rnds4	\$0,$E1_,$ABCD1		# 4-7
	 sha1nexte	@MSG1[2],$E1
	pxor		@MSG0[2],@MSG0[0]
	sha1msg1	@MSG0[2],@MSG0[1]
	 pxor		@MSG1[2],@MSG1[0]
	 sha1msg1	@MSG1[2],@MSG1[1]
___

# Middle of the block: one iteration per four rounds, with the round
# function selector int($i/5).  The E and message registers swap roles at
# the end of every iteration.
for ($i=2; $i<20-4; $i++) {
    $code.=<<___;
	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$`int($i/5)`,$E0,$ABCD0	# 8-11
	sha1nexte	@MSG0[3],$E0_
	 sha1rnds4	\$`int($i/5)`,$E1,$ABCD1	# 8-11
	 sha1nexte	@MSG1[3],$E1_
	sha1msg2	@MSG0[3],@MSG0[0]
	 sha1msg2	@MSG1[3],@MSG1[0]
	pxor		@MSG0[3],@MSG0[1]
	sha1msg1	@MSG0[3],@MSG0[2]
	 pxor		@MSG1[3],@MSG1[1]
	 sha1msg1	@MSG1[3],@MSG1[2]
___
    ($E0,$E0_) = ($E0_,$E0);
    ($E1,$E1_) = ($E1_,$E1);
    push(@MSG0, shift(@MSG0));
    push(@MSG1, shift(@MSG1));
}

# Rounds 64-79 interleaved with counter handling, then the masked state
# update, the scatter back into the interleaved context and the advance
# to the next pair of lanes.
$code.=<<___;
	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$3,$E0,$ABCD0		# 64-67
	sha1nexte	@MSG0[3],$E0_
	 sha1rnds4	\$3,$E1,$ABCD1		# 64-67
	 sha1nexte	@MSG1[3],$E1_
	sha1msg2	@MSG0[3],@MSG0[0]
	 sha1msg2	@MSG1[3],@MSG1[0]
	pxor		@MSG0[3],@MSG0[1]
	 pxor		@MSG1[3],@MSG1[1]

	mov		\$1,%ecx
	pxor		@MSG0[2],@MSG0[2]	# zero
	cmp		4*0(%rbx),%ecx		# examine counters
	cmovge		%rsp,@ptr[0]		# cancel input

	movdqa		$ABCD0,$E0
	 movdqa		$ABCD1,$E1
	sha1rnds4	\$3,$E0_,$ABCD0		# 68-71
	sha1nexte	@MSG0[0],$E0
	 sha1rnds4	\$3,$E1_,$ABCD1		# 68-71
	 sha1nexte	@MSG1[0],$E1
	sha1msg2	@MSG0[0],@MSG0[1]
	 sha1msg2	@MSG1[0],@MSG1[1]

	cmp		4*1(%rbx),%ecx
	cmovge		%rsp,@ptr[1]
	movq		(%rbx),@MSG0[0]		# pull counters

	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$3,$E0,$ABCD0		# 72-75
	sha1nexte	@MSG0[1],$E0_
	 sha1rnds4	\$3,$E1,$ABCD1		# 72-75
	 sha1nexte	@MSG1[1],$E1_

	pshufd		\$0x00,@MSG0[0],@MSG1[2]
	pshufd		\$0x55,@MSG0[0],@MSG1[3]
	movdqa		@MSG0[0],@MSG0[1]
	pcmpgtd		@MSG0[2],@MSG1[2]
	pcmpgtd		@MSG0[2],@MSG1[3]

	movdqa		$ABCD0,$E0
	 movdqa		$ABCD1,$E1
	sha1rnds4	\$3,$E0_,$ABCD0		# 76-79
	sha1nexte	$MSG0[2],$E0
	 sha1rnds4	\$3,$E1_,$ABCD1		# 76-79
	 sha1nexte	$MSG0[2],$E1

	pcmpgtd		@MSG0[2],@MSG0[1]	# counter mask
	pand		@MSG1[2],$ABCD0
	pand		@MSG1[2],$E0
	 pand		@MSG1[3],$ABCD1
	 pand		@MSG1[3],$E1
	paddd		@MSG0[1],@MSG0[0]	# counters--

	paddd		0x40(%rsp),$ABCD0
	paddd		0x50(%rsp),$E0
	 paddd		0x60(%rsp),$ABCD1
	 paddd		0x70(%rsp),$E1

	movq		@MSG0[0],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABCD0,$ABCD0
	pshufd		\$0b00011011,$ABCD1,$ABCD1

	movdqa		$ABCD0,@MSG0[0]
	punpckldq	$ABCD1,$ABCD0		# b1.b0.a1.a0
	punpckhdq	$ABCD1,@MSG0[0]		# d1.d0.c1.c0
	punpckhdq	$E1,$E0			# e1.e0.xx.xx
	movq		$ABCD0,0x00-0x40($ctx)	# a1.a0
	psrldq		\$8,$ABCD0
	movq		@MSG0[0],0x40-0x40($ctx)# c1.c0
	psrldq		\$8,@MSG0[0]
	movq		$ABCD0,0x20-0x40($ctx)	# b1.b0
	psrldq		\$8,$E0
	movq		@MSG0[0],0x60-0x40($ctx)# d1.d0
	movq		$E0,0x80-0x40($ctx)	# e1.e0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`$inp_elm_size*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___

# Win64 only: reload %xmm6-%xmm15 from consecutive 16-byte slots, the
# first at -0xb8(%rax) and the last at -0x28(%rax).
if ($win64) {
    foreach my $n (6..15) {
	$code .= sprintf("	movaps	-0x%x(%%rax),%%xmm%d\n",
			 0xb8-0x10*($n-6), $n);
    }
}

# Epilogue.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
}}}
b7838586 AP |
798 | |
799 | if ($avx) {{{ | |
# BODY_00_19_avx(i, a, b, c, d, e) appends one AVX round of the 0..19
# group to $code.  The same generator serves both register widths:
# with $REG_SZ==16 it gathers from four input pointers, with $REG_SZ==32
# from eight.  The @Xi window is rotated by one position before returning.
sub BODY_00_19_avx {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;
    my $k=$i+2;
    # instruction that merges the two gathered halves, and the pointer the
    # second half is gathered from
    my $vpack = ($REG_SZ==16) ? "vpunpckldq" : "vinserti128";
    my $ptr_n = ($REG_SZ==16) ? $ptr[1] : $ptr[4];

    if ($REG_SZ==16) {
	if ($i==0) {
	    $code.=<<___;
	vmovd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		(@ptr[1]),@Xi[2]	# borrow Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,(@ptr[3]),@Xi[2],@Xi[2]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	vpunpckldq	@Xi[2],@Xi[0],@Xi[0]
	 vmovd		`4*$j-16*4`($ptr_n),$t3
	vpshufb		$tx,@Xi[0],@Xi[0]
___
	}
	if ($i<15) {
	    # just load input
	    $code.=<<___;
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
___
	}
    }

    if ($REG_SZ==32) {
	if ($i==0) {
	    $code.=<<___;
	vmovd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		(@ptr[4]),@Xi[2]	# borrow Xi[2]
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		(@ptr[5]),$t1
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,(@ptr[6]),@Xi[2],@Xi[2]
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,@Xi[0],@Xi[0]
	vpinsrd		\$1,(@ptr[7]),$t1,$t1
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t1,@Xi[2],@Xi[2]
	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	vinserti128	@Xi[2],@Xi[0],@Xi[0]
	 vmovd		`4*$j-16*4`($ptr_n),$t3
	vpshufb		$tx,@Xi[0],@Xi[0]
___
	}
	if ($i<15) {
	    # just load input
	    $code.=<<___;
	 vmovd		`4*$j-16*4`(@ptr[1]),$t2
	 vmovd		`4*$j-16*4`(@ptr[5]),$t1
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
	 vpunpckldq	$t2,@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
	 vpunpckldq	$t1,$t3,$t3
___
	}
    }

    if ($i<14) {
	$code.=<<___;
	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 $vpack		$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	 vmovd		`4*$k-16*4`(@ptr[0]),@Xi[2]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	 vmovd		`4*$k-16*4`($ptr_n),$t3
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
    } elsif ($i==14) {
	$code.=<<___;
	vpaddd	$K,$e,$e			# e+=K_00_19
	 prefetcht0	63(@ptr[0])
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 $vpack		$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	 prefetcht0	63(@ptr[1])
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	 prefetcht0	63(@ptr[2])
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 prefetcht0	63(@ptr[3])
	 vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
    }

    if ($i>=13 && $i<15) {
	$code.=<<___;
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# preload "X[2]"
___
    }

    if ($i>=15) {
	# apply Xupdate
	$code.=<<___;
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	 vpxor	@Xi[3],@Xi[1],@Xi[1]
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	 vpsrld	\$31,@Xi[1],$tx
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol	\$1,@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
    }

    # slide the schedule window
    push(@Xi,shift(@Xi));
}
943 | ||
# BODY_20_39_avx($i,$a,$b,$c,$d,$e): append one SHA-1 round that uses
# Parity(b,c,d)=b^c^d to $code.  The drivers call it for rounds 20..39 and
# again for rounds 60..79, each time after loading the matching constant
# into $K.  $a..$e name the registers holding the working variables for
# this round.  The @Xi register ring is rotated by one on return.
944 | sub BODY_20_39_avx { | |
945 | my ($i,$a,$b,$c,$d,$e)=@_; | |
946 | my $j=$i+1; | |
947 | ||
# Every round but the last also advances the message schedule: @Xi[1] is
# xor-ed with @Xi[-2], the slot at Xi_off($j+8) and the value loaded from
# Xi_off($j+2), and is rotated left by one further down.
948 | $code.=<<___ if ($i<79); | |
949 | vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" | |
950 | vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" | |
951 | ||
952 | vpslld \$5,$a,$t2 | |
953 | vpaddd $K,$e,$e # e+=K_20_39 | |
954 | vpxor $b,$d,$t0 | |
955 | ___ | |
# The current word is written back to its slot only while $i<72.
# NOTE(review): presumably no later round reloads the slots skipped here --
# confirm against Xi_off.
956 | $code.=<<___ if ($i<72); | |
957 | vmovdqa @Xi[0],`&Xi_off($i)` | |
958 | ___ | |
959 | $code.=<<___ if ($i<79); | |
960 | vpaddd @Xi[0],$e,$e # e+=X[i] | |
961 | vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] | |
962 | vpsrld \$27,$a,$t3 | |
963 | vpxor $c,$t0,$t0 # Parity(b,c,d) | |
964 | vpxor @Xi[3],@Xi[1],@Xi[1] | |
965 | ||
966 | vpslld \$30,$b,$t1 | |
967 | vpor $t3,$t2,$t2 # rol(a,5) | |
968 | vpaddd $t0,$e,$e # e+=Parity(b,c,d) | |
969 | vpsrld \$31,@Xi[1],$tx | |
970 | vpaddd @Xi[1],@Xi[1],@Xi[1] | |
971 | ||
972 | vpsrld \$2,$b,$b | |
973 | vpaddd $t2,$e,$e # e+=rol(a,5) | |
974 | vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) | |
975 | vpor $t1,$b,$b # b=rol(b,30) | |
976 | ___ | |
# The final round (79) emits only the round arithmetic, without any
# schedule update.
977 | $code.=<<___ if ($i==79); | |
978 | vpslld \$5,$a,$t2 | |
979 | vpaddd $K,$e,$e # e+=K_20_39 | |
980 | vpxor $b,$d,$t0 | |
981 | ||
982 | vpsrld \$27,$a,$t3 | |
983 | vpaddd @Xi[0],$e,$e # e+=X[i] | |
984 | vpxor $c,$t0,$t0 # Parity(b,c,d) | |
985 | ||
986 | vpslld \$30,$b,$t1 | |
987 | vpor $t3,$t2,$t2 # rol(a,5) | |
988 | vpaddd $t0,$e,$e # e+=Parity(b,c,d) | |
989 | ||
990 | vpsrld \$2,$b,$b | |
991 | vpaddd $t2,$e,$e # e+=rol(a,5) | |
992 | vpor $t1,$b,$b # b=rol(b,30) | |
993 | ___ | |
994 | push(@Xi,shift(@Xi)); | |
995 | } | |
996 | ||
# BODY_40_59_avx($i,$a,$b,$c,$d,$e): append one SHA-1 round for rounds
# 40..59 to $code.  $a..$e name the registers holding the working variables
# for this round; the round constant is whatever the caller loaded into $K.
#
# The majority function is accumulated into e in two steps, (c&d) first and
# ((c^d)&b) second.  The message-schedule update of @Xi[1] (xor of
# @Xi[-2], the slot at Xi_off($j+8) and the value loaded from Xi_off($j+2),
# then a rotate left by one) is interleaved with the round arithmetic.
# The @Xi register ring is rotated by one on return.
sub BODY_40_59_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_40_59
	vpslld	\$5,$a,$t2
	vpand	$c,$d,$t1
	vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]

	vpaddd	$t1,$e,$e
	vpsrld	\$27,$a,$t3
	vpxor	$c,$d,$t0
	vpxor	@Xi[3],@Xi[1],@Xi[1]

	vmovdqu	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpsrld	\$31,@Xi[1],$tx
	vpand	$b,$t0,$t0
	vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpaddd	$t0,$e,$e			# e+=Maj(b,d,c)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}
1032 | ||
# sha1_multi_block_avx: AVX code path.  The statements below emit the
# function piece by piece: entry and dispatch, prologue, per-lane input
# setup, the 80 unrolled rounds, the per-block state update and the
# epilogue.
1033 | $code.=<<___; | |
1034 | .type sha1_multi_block_avx,\@function,3 | |
1035 | .align 32 | |
1036 | sha1_multi_block_avx: | |
399976c7 | 1037 | .cfi_startproc |
b7838586 AP |
1038 | _avx_shortcut: |
1039 | ___ | |
# Emitted only when an AVX2 path is generated as well ($avx>1): requests
# with $num below 2 stay on this path, otherwise bit 5 of the upper half of
# %rcx selects _avx2_shortcut.
# NOTE(review): %rcx presumably holds a CPU capability word set up by the
# code that jumps to _avx_shortcut -- confirm against the caller.
1040 | $code.=<<___ if ($avx>1); | |
1041 | shr \$32,%rcx | |
1042 | cmp \$2,$num | |
1043 | jb .Lavx | |
1044 | test \$`1<<5`,%ecx | |
1045 | jnz _avx2_shortcut | |
1046 | jmp .Lavx | |
1047 | .align 32 | |
1048 | .Lavx: | |
1049 | ___ | |
# Prologue: keep the incoming %rsp in %rax and save %rbx and %rbp.
1050 | $code.=<<___; | |
1051 | mov %rsp,%rax | |
399976c7 | 1052 | .cfi_def_cfa_register %rax |
b7838586 | 1053 | push %rbx |
399976c7 | 1054 | .cfi_push %rbx |
b7838586 | 1055 | push %rbp |
399976c7 | 1056 | .cfi_push %rbp |
b7838586 AP |
1057 | ___ |
# Win64 only: save %xmm6-%xmm15 in the 0xa8 bytes reserved right below the
# two pushed registers.
1058 | $code.=<<___ if ($win64); | |
1059 | lea -0xa8(%rsp),%rsp | |
1060 | movaps %xmm6,(%rsp) | |
1061 | movaps %xmm7,0x10(%rsp) | |
1062 | movaps %xmm8,0x20(%rsp) | |
1063 | movaps %xmm9,0x30(%rsp) | |
1064 | movaps %xmm10,-0x78(%rax) | |
1065 | movaps %xmm11,-0x68(%rax) | |
1066 | movaps %xmm12,-0x58(%rax) | |
1067 | movaps %xmm13,-0x48(%rax) | |
1068 | movaps %xmm14,-0x38(%rax) | |
1069 | movaps %xmm15,-0x28(%rax) | |
1070 | ___ | |
# Frame: at least $REG_SZ*18 bytes, aligned down to 256.  The original %rsp
# is kept at $REG_SZ*17(%rsp), the caller's $num at $REG_SZ*17+8(%rsp), and
# %rbx points at $REG_SZ*16(%rsp) where the per-lane counters live.
1071 | $code.=<<___; | |
1072 | sub \$`$REG_SZ*18`, %rsp | |
1073 | and \$-256,%rsp | |
1074 | mov %rax,`$REG_SZ*17`(%rsp) # original %rsp | |
399976c7 | 1075 | .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 |
619b9466 | 1076 | .Lbody_avx: |
b7838586 AP |
1077 | lea K_XX_XX(%rip),$Tbl |
1078 | lea `$REG_SZ*16`(%rsp),%rbx | |
1079 | ||
1080 | vzeroupper | |
1081 | .Loop_grande_avx: | |
1082 | mov $num,`$REG_SZ*17+8`(%rsp) # original $num | |
1083 | xor $num,$num | |
1084 | ___ | |
# Per-lane setup, emitted four times: load the lane's input pointer and
# block count, keep the largest count in $num, store the count as the
# lane's counter, and redirect lanes whose count is <= 0 to $Tbl (what the
# inline comment calls "cancel input").
1085 | for($i=0;$i<4;$i++) { | |
0d51cf3c | 1086 | $ptr_reg=&pointer_register($flavour,@ptr[$i]); |
b7838586 | 1087 | $code.=<<___; |
0d51cf3c L |
1088 | # input pointer |
1089 | mov `$inp_elm_size*$i+0`($inp),$ptr_reg | |
1090 | # number of blocks | |
1091 | mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx | |
b7838586 AP |
1092 | cmp $num,%ecx |
1093 | cmovg %ecx,$num # find maximum | |
1094 | test %ecx,%ecx | |
1095 | mov %ecx,`4*$i`(%rbx) # initialize counters | |
1096 | cmovle $Tbl,@ptr[$i] # cancel input | |
1097 | ___ | |
1098 | } | |
# Leave at once when no lane has work ($num is zero); otherwise load the
# five state vectors from $ctx (0x20 bytes apart) and the byte-swap mask
# from 0x60($Tbl).
1099 | $code.=<<___; | |
1100 | test $num,$num | |
1101 | jz .Ldone_avx | |
1102 | ||
1103 | vmovdqu 0x00($ctx),$A # load context | |
1104 | lea 128(%rsp),%rax | |
1105 | vmovdqu 0x20($ctx),$B | |
1106 | vmovdqu 0x40($ctx),$C | |
1107 | vmovdqu 0x60($ctx),$D | |
1108 | vmovdqu 0x80($ctx),$E | |
1109 | vmovdqu 0x60($Tbl),$tx # pbswap_mask | |
1110 | jmp .Loop_avx | |
1111 | ||
1112 | .align 32 | |
1113 | .Loop_avx: | |
1114 | ___ | |
# 80 rounds, unrolled at generation time.  $K is reloaded from the constant
# table before each group of 20 rounds, and @V is rotated after every round
# so the next round body receives the working variables in (a,b,c,d,e)
# order.
1115 | $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 | |
1116 | for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } | |
1117 | $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 | |
1118 | for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } | |
1119 | $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 | |
1120 | for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } | |
1121 | $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 | |
1122 | for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } | |
# After the rounds: with %ecx set to 1, every lane whose counter is <= 1
# gets its input pointer redirected to $Tbl.
1123 | $code.=<<___; | |
1124 | mov \$1,%ecx | |
1125 | ___ | |
1126 | for($i=0;$i<4;$i++) { | |
1127 | $code.=<<___; | |
1128 | cmp `4*$i`(%rbx),%ecx # examine counters | |
1129 | cmovge $Tbl,@ptr[$i] # cancel input | |
1130 | ___ | |
1131 | } | |
# Build a mask of the lanes whose counter is still positive and add it to
# the counters (the all-ones mask subtracts one).  The working state is
# and-ed with the mask before being added to the context, so lanes without
# remaining work leave their context words unchanged.  The inner loop
# repeats while $num is non-zero; afterwards $ctx and $inp are advanced and
# control returns to .Loop_grande_avx until the caller's $num is used up.
1132 | $code.=<<___; | |
1133 | vmovdqu (%rbx),$t0 # pull counters | |
1134 | vpxor $t2,$t2,$t2 | |
1135 | vmovdqa $t0,$t1 | |
1136 | vpcmpgtd $t2,$t1,$t1 # mask value | |
1137 | vpaddd $t1,$t0,$t0 # counters-- | |
1138 | ||
1139 | vpand $t1,$A,$A | |
1140 | vpand $t1,$B,$B | |
1141 | vpaddd 0x00($ctx),$A,$A | |
1142 | vpand $t1,$C,$C | |
1143 | vpaddd 0x20($ctx),$B,$B | |
1144 | vpand $t1,$D,$D | |
1145 | vpaddd 0x40($ctx),$C,$C | |
1146 | vpand $t1,$E,$E | |
1147 | vpaddd 0x60($ctx),$D,$D | |
1148 | vpaddd 0x80($ctx),$E,$E | |
1149 | vmovdqu $A,0x00($ctx) | |
1150 | vmovdqu $B,0x20($ctx) | |
1151 | vmovdqu $C,0x40($ctx) | |
1152 | vmovdqu $D,0x60($ctx) | |
1153 | vmovdqu $E,0x80($ctx) | |
1154 | ||
1155 | vmovdqu $t0,(%rbx) # save counters | |
1156 | vmovdqu 0x60($Tbl),$tx # pbswap_mask | |
1157 | dec $num | |
1158 | jnz .Loop_avx | |
1159 | ||
1160 | mov `$REG_SZ*17+8`(%rsp),$num | |
1161 | lea $REG_SZ($ctx),$ctx | |
0d51cf3c | 1162 | lea `$inp_elm_size*$REG_SZ/4`($inp),$inp |
b7838586 AP |
1163 | dec $num |
1164 | jnz .Loop_grande_avx | |
1165 | ||
1166 | .Ldone_avx: | |
0d4fb843 | 1167 | mov `$REG_SZ*17`(%rsp),%rax # original %rsp |
399976c7 | 1168 | .cfi_def_cfa %rax,8 |
b7838586 AP |
1169 | vzeroupper |
1170 | ___ | |
# Win64 only: reload %xmm6-%xmm15 from the save area below the original
# %rsp, which is in %rax again.
1171 | $code.=<<___ if ($win64); | |
1172 | movaps -0xb8(%rax),%xmm6 | |
1173 | movaps -0xa8(%rax),%xmm7 | |
1174 | movaps -0x98(%rax),%xmm8 | |
1175 | movaps -0x88(%rax),%xmm9 | |
1176 | movaps -0x78(%rax),%xmm10 | |
1177 | movaps -0x68(%rax),%xmm11 | |
1178 | movaps -0x58(%rax),%xmm12 | |
1179 | movaps -0x48(%rax),%xmm13 | |
1180 | movaps -0x38(%rax),%xmm14 | |
1181 | movaps -0x28(%rax),%xmm15 | |
1182 | ___ | |
# Epilogue: restore %rbp and %rbx from below the original %rsp, switch back
# to the original stack pointer and return.
1183 | $code.=<<___; | |
1184 | mov -16(%rax),%rbp | |
399976c7 | 1185 | .cfi_restore %rbp |
b7838586 | 1186 | mov -8(%rax),%rbx |
399976c7 | 1187 | .cfi_restore %rbx |
b7838586 | 1188 | lea (%rax),%rsp |
399976c7 | 1189 | .cfi_def_cfa_register %rsp |
619b9466 | 1190 | .Lepilogue_avx: |
b7838586 | 1191 | ret |
399976c7 | 1192 | .cfi_endproc |
b7838586 AP |
1193 | .size sha1_multi_block_avx,.-sha1_multi_block_avx |
1194 | ___ | |
1195 | ||
# AVX2 code path, generated only when $avx>1.  The round generators used
# for the AVX path are reused here with %ymm registers, $REG_SZ=32 and
# eight input pointers.
1196 | if ($avx>1) { | |
# Expand every `...` expression emitted so far right now, i.e. before
# $REG_SZ and the register names are reassigned below.
1197 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | |
1198 | ||
1199 | $REG_SZ=32; | |
1200 | ||
1201 | @ptr=map("%r$_",(12..15,8..11)); | |
1202 | ||
1203 | @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); | |
1204 | ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); | |
1205 | @Xi=map("%ymm$_",(10..14)); | |
1206 | $K="%ymm15"; | |
1207 | ||
# Prologue: keep the incoming %rsp in %rax and save %rbx, %rbp and
# %r12-%r15.
1208 | $code.=<<___; | |
1209 | .type sha1_multi_block_avx2,\@function,3 | |
1210 | .align 32 | |
1211 | sha1_multi_block_avx2: | |
399976c7 | 1212 | .cfi_startproc |
b7838586 AP |
1213 | _avx2_shortcut: |
1214 | mov %rsp,%rax | |
399976c7 | 1215 | .cfi_def_cfa_register %rax |
b7838586 | 1216 | push %rbx |
399976c7 | 1217 | .cfi_push %rbx |
b7838586 | 1218 | push %rbp |
399976c7 | 1219 | .cfi_push %rbp |
b7838586 | 1220 | push %r12 |
399976c7 | 1221 | .cfi_push %r12 |
b7838586 | 1222 | push %r13 |
399976c7 | 1223 | .cfi_push %r13 |
b7838586 | 1224 | push %r14 |
399976c7 | 1225 | .cfi_push %r14 |
b7838586 | 1226 | push %r15 |
399976c7 | 1227 | .cfi_push %r15 |
b7838586 AP |
1228 | ___ |
# Win64 only: save %xmm6-%xmm15 in the 0xa8 bytes reserved right below the
# six pushed registers.
1229 | $code.=<<___ if ($win64); | |
1230 | lea -0xa8(%rsp),%rsp | |
1231 | movaps %xmm6,(%rsp) | |
1232 | movaps %xmm7,0x10(%rsp) | |
1233 | movaps %xmm8,0x20(%rsp) | |
1234 | movaps %xmm9,0x30(%rsp) | |
1235 | movaps %xmm10,0x40(%rsp) | |
1236 | movaps %xmm11,0x50(%rsp) | |
1237 | movaps %xmm12,-0x78(%rax) | |
1238 | movaps %xmm13,-0x68(%rax) | |
1239 | movaps %xmm14,-0x58(%rax) | |
1240 | movaps %xmm15,-0x48(%rax) | |
1241 | ___ | |
# Frame: at least $REG_SZ*18 bytes, aligned down to 256, with the original
# %rsp kept at $REG_SZ*17(%rsp).  $num is halved before the loop.
# NOTE(review): the halving presumably reflects twice as many lanes per
# pass as the AVX path -- confirm against the caller's contract.
1242 | $code.=<<___; | |
1243 | sub \$`$REG_SZ*18`, %rsp | |
1244 | and \$-256,%rsp | |
1245 | mov %rax,`$REG_SZ*17`(%rsp) # original %rsp | |
399976c7 | 1246 | .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 |
619b9466 | 1247 | .Lbody_avx2: |
b7838586 AP |
1248 | lea K_XX_XX(%rip),$Tbl |
1249 | shr \$1,$num | |
1250 | ||
1251 | vzeroupper | |
1252 | .Loop_grande_avx2: | |
1253 | mov $num,`$REG_SZ*17+8`(%rsp) # original $num | |
1254 | xor $num,$num | |
1255 | lea `$REG_SZ*16`(%rsp),%rbx | |
1256 | ___ | |
# Per-lane setup, emitted eight times: load the lane's input pointer and
# block count, keep the largest count in $num, store the count as the
# lane's counter, and redirect lanes whose count is <= 0 to $Tbl.
1257 | for($i=0;$i<8;$i++) { | |
0d51cf3c | 1258 | $ptr_reg=&pointer_register($flavour,@ptr[$i]); |
b7838586 | 1259 | $code.=<<___; |
0d51cf3c L |
1260 | # input pointer |
1261 | mov `$inp_elm_size*$i+0`($inp),$ptr_reg | |
1262 | # number of blocks | |
1263 | mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx | |
b7838586 AP |
1264 | cmp $num,%ecx |
1265 | cmovg %ecx,$num # find maximum | |
1266 | test %ecx,%ecx | |
1267 | mov %ecx,`4*$i`(%rbx) # initialize counters | |
1268 | cmovle $Tbl,@ptr[$i] # cancel input | |
1269 | ___ | |
1270 | } | |
# Load the five state vectors and the byte-swap mask.  Unlike the AVX path
# there is no early exit for $num==0 here.  %rax and %rbx are pointed at
# 128(%rsp) and 256+128(%rsp) for the duration of the rounds.
# NOTE(review): these are presumably the base registers Xi_off() addresses
# the schedule slots through -- confirm against Xi_off.
1271 | $code.=<<___; | |
1272 | vmovdqu 0x00($ctx),$A # load context | |
1273 | lea 128(%rsp),%rax | |
1274 | vmovdqu 0x20($ctx),$B | |
1275 | lea 256+128(%rsp),%rbx | |
1276 | vmovdqu 0x40($ctx),$C | |
1277 | vmovdqu 0x60($ctx),$D | |
1278 | vmovdqu 0x80($ctx),$E | |
1279 | vmovdqu 0x60($Tbl),$tx # pbswap_mask | |
1280 | jmp .Loop_avx2 | |
1281 | ||
1282 | .align 32 | |
1283 | .Loop_avx2: | |
1284 | ___ | |
# 80 rounds, unrolled at generation time; $K is reloaded before each group
# of 20 rounds and @V is rotated after every round.
1285 | $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 | |
1286 | for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } | |
1287 | $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 | |
1288 | for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } | |
1289 | $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 | |
1290 | for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } | |
1291 | $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 | |
1292 | for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } | |
# After the rounds %rbx is pointed back at the counters and, with %ecx set
# to 1, every lane whose counter is <= 1 gets its input pointer redirected
# to $Tbl.
1293 | $code.=<<___; | |
1294 | mov \$1,%ecx | |
1295 | lea `$REG_SZ*16`(%rsp),%rbx | |
1296 | ___ | |
1297 | for($i=0;$i<8;$i++) { | |
1298 | $code.=<<___; | |
1299 | cmp `4*$i`(%rbx),%ecx # examine counters | |
1300 | cmovge $Tbl,@ptr[$i] # cancel input | |
1301 | ___ | |
1302 | } | |
# Same masked counter decrement and context update as in the AVX path.
# The outer-loop tail after .Loop_avx2 is emitted as assembler comments
# only, so control falls through to .Ldone_avx2 once the inner loop ends.
1303 | $code.=<<___; | |
1304 | vmovdqu (%rbx),$t0 # pull counters | |
1305 | vpxor $t2,$t2,$t2 | |
1306 | vmovdqa $t0,$t1 | |
1307 | vpcmpgtd $t2,$t1,$t1 # mask value | |
1308 | vpaddd $t1,$t0,$t0 # counters-- | |
1309 | ||
1310 | vpand $t1,$A,$A | |
1311 | vpand $t1,$B,$B | |
1312 | vpaddd 0x00($ctx),$A,$A | |
1313 | vpand $t1,$C,$C | |
1314 | vpaddd 0x20($ctx),$B,$B | |
1315 | vpand $t1,$D,$D | |
1316 | vpaddd 0x40($ctx),$C,$C | |
1317 | vpand $t1,$E,$E | |
1318 | vpaddd 0x60($ctx),$D,$D | |
1319 | vpaddd 0x80($ctx),$E,$E | |
1320 | vmovdqu $A,0x00($ctx) | |
1321 | vmovdqu $B,0x20($ctx) | |
1322 | vmovdqu $C,0x40($ctx) | |
1323 | vmovdqu $D,0x60($ctx) | |
1324 | vmovdqu $E,0x80($ctx) | |
1325 | ||
1326 | vmovdqu $t0,(%rbx) # save counters | |
1327 | lea 256+128(%rsp),%rbx | |
1328 | vmovdqu 0x60($Tbl),$tx # pbswap_mask | |
1329 | dec $num | |
1330 | jnz .Loop_avx2 | |
1331 | ||
1332 | #mov `$REG_SZ*17+8`(%rsp),$num | |
1333 | #lea $REG_SZ($ctx),$ctx | |
0d51cf3c | 1334 | #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp |
b7838586 AP |
1335 | #dec $num |
1336 | #jnz .Loop_grande_avx2 | |
1337 | ||
1338 | .Ldone_avx2: | |
0d4fb843 | 1339 | mov `$REG_SZ*17`(%rsp),%rax # original %rsp |
399976c7 | 1340 | .cfi_def_cfa %rax,8 |
b7838586 AP |
1341 | vzeroupper |
1342 | ___ | |
# Win64 only: reload %xmm6-%xmm15 from the save area below the original
# %rsp, which is in %rax again.
1343 | $code.=<<___ if ($win64); | |
1344 | movaps -0xd8(%rax),%xmm6 | |
1345 | movaps -0xc8(%rax),%xmm7 | |
1346 | movaps -0xb8(%rax),%xmm8 | |
1347 | movaps -0xa8(%rax),%xmm9 | |
1348 | movaps -0x98(%rax),%xmm10 | |
1349 | movaps -0x88(%rax),%xmm11 | |
1350 | movaps -0x78(%rax),%xmm12 | |
1351 | movaps -0x68(%rax),%xmm13 | |
1352 | movaps -0x58(%rax),%xmm14 | |
1353 | movaps -0x48(%rax),%xmm15 | |
1354 | ___ | |
# Epilogue: restore the six saved registers from below the original %rsp,
# switch back to the original stack pointer and return.
1355 | $code.=<<___; | |
1356 | mov -48(%rax),%r15 | |
399976c7 | 1357 | .cfi_restore %r15 |
b7838586 | 1358 | mov -40(%rax),%r14 |
399976c7 | 1359 | .cfi_restore %r14 |
b7838586 | 1360 | mov -32(%rax),%r13 |
399976c7 | 1361 | .cfi_restore %r13 |
b7838586 | 1362 | mov -24(%rax),%r12 |
399976c7 | 1363 | .cfi_restore %r12 |
b7838586 | 1364 | mov -16(%rax),%rbp |
399976c7 | 1365 | .cfi_restore %rbp |
b7838586 | 1366 | mov -8(%rax),%rbx |
399976c7 | 1367 | .cfi_restore %rbx |
b7838586 | 1368 | lea (%rax),%rsp |
399976c7 | 1369 | .cfi_def_cfa_register %rsp |
619b9466 | 1370 | .Lepilogue_avx2: |
b7838586 | 1371 | ret |
399976c7 | 1372 | .cfi_endproc |
b7838586 AP |
1373 | .size sha1_multi_block_avx2,.-sha1_multi_block_avx2 |
1374 | ___ | |
1375 | } }}} | |
# Constant table.  K_XX_XX labels the K_20_39 row, so the round code
# addresses K_00_19 at -0x20(K_XX_XX), K_20_39 at 0x00, K_40_59 at 0x20,
# K_60_79 at 0x40 and the byte-swap mask at 0x60.  Each 16-byte row appears
# twice, giving 32 bytes per constant.
# NOTE(review): the .byte row after pbswap is not referenced by the AVX and
# AVX2 code in this part of the file; presumably it serves the SHA
# extension path -- confirm.
1376 | $code.=<<___; | |
1377 | ||
1378 | .align 256 | |
1379 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | |
1380 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | |
1381 | K_XX_XX: | |
1382 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | |
1383 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | |
1384 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | |
1385 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | |
1386 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | |
1387 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | |
1388 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap | |
1389 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap | |
619b9466 AP |
1390 | .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 |
1391 | .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | |
1392 | ___ | |
1393 | ||
# Win64 structured exception handling support: the unwind handlers, and
# further down the .pdata/.xdata tables that attach them to the functions
# of this module.
1394 | if ($win64) { | |
1395 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
1396 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
# Registers carrying the four handler arguments named in the prototype
# above.
1397 | $rec="%rcx"; | |
1398 | $frame="%rdx"; | |
1399 | $context="%r8"; | |
1400 | $disp="%r9"; | |
1401 | ||
# se_handler: handler for the functions that save only %rbx and %rbp.
# HandlerData[0] and HandlerData[1] are the function's body and epilogue
# labels.  When the faulting Rip lies before the body label or at/after the
# epilogue label, nothing is recovered from the frame and control goes
# straight to .Lin_prologue.  Otherwise the original stack pointer is read
# from offset 16*17 of the function's frame, %rbx/%rbp are copied from
# below it into CONTEXT, and 20 quadwords (%xmm6-%xmm15) are copied to
# context.Xmm6.
# .Lin_prologue, also jumped to by avx2_handler, stores Rsp/Rsi/Rdi into
# CONTEXT, copies the context to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.
1402 | $code.=<<___; | |
1403 | .extern __imp_RtlVirtualUnwind | |
1404 | .type se_handler,\@abi-omnipotent | |
1405 | .align 16 | |
1406 | se_handler: | |
1407 | push %rsi | |
1408 | push %rdi | |
1409 | push %rbx | |
1410 | push %rbp | |
1411 | push %r12 | |
1412 | push %r13 | |
1413 | push %r14 | |
1414 | push %r15 | |
1415 | pushfq | |
1416 | sub \$64,%rsp | |
1417 | ||
1418 | mov 120($context),%rax # pull context->Rax | |
1419 | mov 248($context),%rbx # pull context->Rip | |
1420 | ||
1421 | mov 8($disp),%rsi # disp->ImageBase | |
1422 | mov 56($disp),%r11 # disp->HandlerData | |
1423 | ||
1424 | mov 0(%r11),%r10d # HandlerData[0] | |
1425 | lea (%rsi,%r10),%r10 # end of prologue label | |
1426 | cmp %r10,%rbx # context->Rip<.Lbody | |
1427 | jb .Lin_prologue | |
1428 | ||
1429 | mov 152($context),%rax # pull context->Rsp | |
1430 | ||
1431 | mov 4(%r11),%r10d # HandlerData[1] | |
1432 | lea (%rsi,%r10),%r10 # epilogue label | |
1433 | cmp %r10,%rbx # context->Rip>=.Lepilogue | |
1434 | jae .Lin_prologue | |
1435 | ||
1436 | mov `16*17`(%rax),%rax # pull saved stack pointer | |
1437 | ||
1438 | mov -8(%rax),%rbx | |
1439 | mov -16(%rax),%rbp | |
1440 | mov %rbx,144($context) # restore context->Rbx | |
1441 | mov %rbp,160($context) # restore context->Rbp | |
1442 | ||
1443 | lea -24-10*16(%rax),%rsi | |
1444 | lea 512($context),%rdi # &context.Xmm6 | |
1445 | mov \$20,%ecx | |
1446 | .long 0xa548f3fc # cld; rep movsq | |
1447 | ||
1448 | .Lin_prologue: | |
1449 | mov 8(%rax),%rdi | |
1450 | mov 16(%rax),%rsi | |
1451 | mov %rax,152($context) # restore context->Rsp | |
1452 | mov %rsi,168($context) # restore context->Rsi | |
1453 | mov %rdi,176($context) # restore context->Rdi | |
1454 | ||
1455 | mov 40($disp),%rdi # disp->ContextRecord | |
1456 | mov $context,%rsi # context | |
1457 | mov \$154,%ecx # sizeof(CONTEXT) | |
1458 | .long 0xa548f3fc # cld; rep movsq | |
1459 | ||
1460 | mov $disp,%rsi | |
1461 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
1462 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
1463 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
1464 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
1465 | mov 40(%rsi),%r10 # disp->ContextRecord | |
1466 | lea 56(%rsi),%r11 # &disp->HandlerData | |
1467 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
1468 | mov %r10,32(%rsp) # arg5 | |
1469 | mov %r11,40(%rsp) # arg6 | |
1470 | mov %r12,48(%rsp) # arg7 | |
1471 | mov %rcx,56(%rsp) # arg8, (NULL) | |
1472 | call *__imp_RtlVirtualUnwind(%rip) | |
1473 | ||
1474 | mov \$1,%eax # ExceptionContinueSearch | |
1475 | add \$64,%rsp | |
1476 | popfq | |
1477 | pop %r15 | |
1478 | pop %r14 | |
1479 | pop %r13 | |
1480 | pop %r12 | |
1481 | pop %rbp | |
1482 | pop %rbx | |
1483 | pop %rdi | |
1484 | pop %rsi | |
1485 | ret | |
1486 | .size se_handler,.-se_handler | |
1487 | ___ | |
# avx2_handler: Win64 unwind handler for sha1_multi_block_avx2, emitted
# only when the AVX2 path is generated ($avx>1).  It follows se_handler but
# accounts for the larger register save area of the AVX2 prologue: six
# general-purpose registers (%rbx, %rbp, %r12-%r15) are recovered from
# below the original stack pointer and the XMM save area is found at
# -56-10*16 off it.
#
# Once Rip is known to lie inside the function body, %rax holds
# context->Rsp, i.e. the function's aligned frame, and the AVX2 prologue
# stored the original %rsp at 32*17 bytes into that frame.  The saved stack
# pointer is therefore loaded relative to %rax, as se_handler does with
# 16*17; loading it relative to $context would pick up bytes of the CONTEXT
# record instead.
#
# The handler finishes in se_handler's shared .Lin_prologue tail.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) triple of image-relative addresses
# per function.  The AVX and AVX2 entries are emitted only when those code
# paths are generated.
1544 | $code.=<<___; | |
1545 | .section .pdata | |
1546 | .align 4 | |
1547 | .rva .LSEH_begin_sha1_multi_block | |
1548 | .rva .LSEH_end_sha1_multi_block | |
1549 | .rva .LSEH_info_sha1_multi_block | |
1550 | .rva .LSEH_begin_sha1_multi_block_shaext | |
1551 | .rva .LSEH_end_sha1_multi_block_shaext | |
1552 | .rva .LSEH_info_sha1_multi_block_shaext | |
1553 | ___ | |
1554 | $code.=<<___ if ($avx); | |
1555 | .rva .LSEH_begin_sha1_multi_block_avx | |
1556 | .rva .LSEH_end_sha1_multi_block_avx | |
1557 | .rva .LSEH_info_sha1_multi_block_avx | |
1558 | ___ | |
1559 | $code.=<<___ if ($avx>1); | |
1560 | .rva .LSEH_begin_sha1_multi_block_avx2 | |
1561 | .rva .LSEH_end_sha1_multi_block_avx2 | |
1562 | .rva .LSEH_info_sha1_multi_block_avx2 | |
1563 | ___ | |
# .xdata: one unwind-info record per function.  Each record names the
# handler to run and passes the function's body and epilogue labels as
# HandlerData[], which the handlers compare against the faulting Rip.
# Only the AVX2 function uses avx2_handler; the others use se_handler.
1564 | $code.=<<___; | |
1565 | .section .xdata | |
1566 | .align 8 | |
1567 | .LSEH_info_sha1_multi_block: | |
1568 | .byte 9,0,0,0 | |
1569 | .rva se_handler | |
1570 | .rva .Lbody,.Lepilogue # HandlerData[] | |
1571 | .LSEH_info_sha1_multi_block_shaext: | |
1572 | .byte 9,0,0,0 | |
1573 | .rva se_handler | |
1574 | .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] | |
1575 | ___ | |
1576 | $code.=<<___ if ($avx); | |
1577 | .LSEH_info_sha1_multi_block_avx: | |
1578 | .byte 9,0,0,0 | |
1579 | .rva se_handler | |
1580 | .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] | |
b7838586 | 1581 | ___ |
619b9466 AP |
1582 | $code.=<<___ if ($avx>1); |
1583 | .LSEH_info_sha1_multi_block_avx2: | |
1584 | .byte 9,0,0,0 | |
1585 | .rva avx2_handler | |
1586 | .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] | |
1587 | ___ | |
1588 | } | |
1589 | #################################################################### | |
1590 | ||
# rex(\@opcode,$dst,$src)
#
# Prepend a REX prefix byte to the opcode array, in place, when either
# register number is 8 or above: bit 2 is set for $dst and bit 0 for $src,
# on top of the fixed 0x40 pattern.  The array is left untouched when both
# registers are below 8.  The array is reached through an ordinary
# reference, so no package-level @opcode is involved.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if ($dst>=8);
    $rex|=0x01 if ($src>=8);
    unshift @$opcode,$rex|0x40 if ($rex);
    return;
}
1600 | ||
# sha1rnds4($operands)
#
# Turn the operand string of a sha1rnds4 instruction into a ".byte"
# directive so the output assembles without SHA extension support in the
# assembler.  Only the "$imm,%xmmN,%xmmM" form is encoded: opcode bytes
# 0x0f,0x3a,0xcc (with a REX prefix from rex() when a register number is 8
# or above), a register-direct ModR/M byte and the immediate.  An immediate
# starting with "0" goes through oct(), which also accepts the 0x form.
# Any other operand form is returned as the plain mnemonic plus operands.
sub sha1rnds4 {
    my ($operands)=@_;

    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# copy the captures before calling out to rex()
	my ($imm,$src,$dst)=($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}
1613 | ||
# sha1op38($instr,$operands)
#
# Turn sha1nexte, sha1msg1 or sha1msg2 with two %xmm register operands into
# a ".byte" directive: opcode bytes 0x0f,0x38 (with a REX prefix from rex()
# when a register number is 8 or above), the per-instruction opcode byte
# from the table below and a register-direct ModR/M byte.  An unknown
# mnemonic or any other operand form is returned as mnemonic plus operands.
sub sha1op38 {
    my ($instr,$operands)=@_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# copy the captures before calling out to rex()
	my ($src,$dst)=($1,$2);
	my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}
b7838586 AP |
1631 | |
# Final pass over the generated text, one line at a time.  First every
# `...` expression on the line is replaced by the result of evaluating it.
# Then a chain of substitutions joined by "or" is tried, so only the first
# one that matches is applied: SHA instructions are handed to sha1rnds4()
# or sha1op38(), and the %ymm operands of the listed move, insert, extract
# and broadcast instructions are renamed to %xmm.  The vinserti128 rule
# also inserts a leading "$1," immediate.
# NOTE(review): the "%x%ymm" pattern presumably matches operands written
# with a "%x" marker in front of a register variable that expanded to
# %ymmN -- confirm against the templates that use vmovdqu.
1632 | foreach (split("\n",$code)) { | |
1633 | s/\`([^\`]*)\`/eval($1)/ge; | |
1634 | ||
619b9466 AP |
1635 | s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or |
1636 | s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or | |
1637 | ||
b7838586 AP |
1638 | s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or |
1639 | s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or | |
1640 | s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or | |
1641 | s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or | |
1642 | s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or | |
1643 | s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; | |
619b9466 | 1644 |
b7838586 AP |
1645 | print $_,"\n"; |
1646 | } | |
1647 | ||
# Buffered write errors surface when STDOUT is closed, so a failed close is
# fatal.
a21314db | 1648 | close STDOUT or die "error closing STDOUT: $!"; |