-Subproject commit e09ea622bba106e13ab85173c205f354b0f1d481
+Subproject commit 34ccd24d0e609ae26a0b0e0085462f35edc5bcce
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
- xorps %xmm11,%xmm3
.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
movups 80(%rdx),%xmm7
xorps %xmm12,%xmm4
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm3
.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
leaq 64(%r8),%r8
xorps %xmm3,%xmm0
movdqu 32(%r8),%xmm3
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rdx),%xmm7
-.byte 102,68,15,58,68,218,0
xorps %xmm4,%xmm8
- movdqa %xmm3,%xmm5
+.byte 102,68,15,58,68,218,0
pshufd $78,%xmm3,%xmm4
pxor %xmm0,%xmm8
- pxor %xmm3,%xmm4
+ movdqa %xmm3,%xmm5
pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
movdqa %xmm8,%xmm9
- pslldq $8,%xmm8
.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
psrldq $8,%xmm9
pxor %xmm8,%xmm0
movdqa .L7_mask(%rip),%xmm8
pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
-.byte 102,68,15,58,68,231,0
pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
psllq $57,%xmm9
movdqa %xmm9,%xmm8
pslldq $8,%xmm9
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pxor %xmm1,%xmm0
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-
leaq 64(%r8),%r8
subq $64,%r9
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm4
.byte 102,65,15,58,68,207,17
- xorps %xmm3,%xmm0
.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
pxor %xmm4,%xmm8
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
- movdqu (%r8),%xmm5
+ movdqu (%r8),%xmm9
pxor %xmm0,%xmm8
-.byte 102,65,15,56,0,234
+.byte 102,69,15,56,0,202
movdqu 16(%r8),%xmm3
pxor %xmm1,%xmm8
- pxor %xmm5,%xmm1
+ pxor %xmm9,%xmm1
pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
movdqa %xmm4,%xmm8
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
-.byte 102,15,58,68,234,17
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
psrlq $1,%xmm0
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
-.byte 0x66,0x90
subq $32,%r9
ja .Lmod_loop
movl _gnutls_x86_cpuid_s+8(%rip),%r10d
testl $512,%r8d
jz .Lialu
+ testl $536870912,%r10d
+ jnz _shaext_shortcut
jmp _ssse3_shortcut
.p2align 4
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
.LSEH_end_sha1_block_data_order:
+.def sha1_block_data_order_shaext; .scl 3; .type 32; .endef
+.p2align 5
+sha1_block_data_order_shaext:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_sha1_block_data_order_shaext:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+_shaext_shortcut:
+ leaq -72(%rsp),%rsp
+ movaps %xmm6,-8-64(%rax)
+ movaps %xmm7,-8-48(%rax)
+ movaps %xmm8,-8-32(%rax)
+ movaps %xmm9,-8-16(%rax)
+.Lprologue_shaext:
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp .Loop_shaext
+
+.p2align 4
+.Loop_shaext:
+ decq %rdx
+ leaq 64(%rsi),%rax
+ paddd %xmm4,%xmm1
+ cmovneq %rax,%rsi
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz .Loop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+ movaps -8-64(%rax),%xmm6
+ movaps -8-48(%rax),%xmm7
+ movaps -8-32(%rax),%xmm8
+ movaps -8-16(%rax),%xmm9
+ movq %rax,%rsp
+.Lepilogue_shaext:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_sha1_block_data_order_shaext:
.def sha1_block_data_order_ssse3; .scl 3; .type 32; .endef
.p2align 4
sha1_block_data_order_ssse3:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
jmp .Lcommon_seh_tail
+.def shaext_handler; .scl 3; .type 32; .endef
+.p2align 4
+shaext_handler:
+ pushq %rsi
+ pushq %rdi
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+ subq $64,%rsp
+
+ movq 120(%r8),%rax
+ movq 248(%r8),%rbx
+
+ leaq .Lprologue_shaext(%rip),%r10
+ cmpq %r10,%rbx
+ jb .Lcommon_seh_tail
+
+ leaq .Lepilogue_shaext(%rip),%r10
+ cmpq %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ leaq -8-64(%rax),%rsi
+ leaq 512(%r8),%rdi
+ movl $8,%ecx
+.long 0xa548f3fc
+
+ jmp .Lcommon_seh_tail
.def ssse3_handler; .scl 3; .type 32; .endef
.p2align 4
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
+.rva .LSEH_begin_sha1_block_data_order_shaext
+.rva .LSEH_end_sha1_block_data_order_shaext
+.rva .LSEH_info_sha1_block_data_order_shaext
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
+.LSEH_info_sha1_block_data_order_shaext:
+.byte 9,0,0,0
+.rva shaext_handler
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ testl $536870912,%r11d
+ jnz _shaext_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
pushq %rbx
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.def sha256_block_data_order_shaext; .scl 3; .type 32; .endef
+.p2align 6
+sha256_block_data_order_shaext:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_sha256_block_data_order_shaext:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+_shaext_shortcut:
+ leaq -88(%rsp),%rsp
+ movaps %xmm6,-8-80(%rax)
+ movaps %xmm7,-8-64(%rax)
+ movaps %xmm8,-8-48(%rax)
+ movaps %xmm9,-8-32(%rax)
+ movaps %xmm10,-8-16(%rax)
+.Lprologue_shaext:
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .Loop_shaext
+
+.p2align 4
+.Loop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz .Loop_shaext
+
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ movaps -8-80(%rax),%xmm6
+ movaps -8-64(%rax),%xmm7
+ movaps -8-48(%rax),%xmm8
+ movaps -8-32(%rax),%xmm9
+ movaps -8-16(%rax),%xmm10
+ movq %rax,%rsp
+.Lepilogue_shaext:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_sha256_block_data_order_shaext:
.def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef
.p2align 6
sha256_block_data_order_ssse3:
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
leaq K256(%rip),%rbp
.byte 102,15,56,0,207
movdqa 0(%rbp),%xmm4
-.byte 102,15,56,0,215
movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
paddd %xmm0,%xmm4
movdqa 64(%rbp),%xmm6
.byte 102,15,56,0,223
popq %rsi
.byte 0xf3,0xc3
+.def shaext_handler; .scl 3; .type 32; .endef
+.p2align 4
+shaext_handler:
+ pushq %rsi
+ pushq %rdi
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+ subq $64,%rsp
+
+ movq 120(%r8),%rax
+ movq 248(%r8),%rbx
+
+ leaq .Lprologue_shaext(%rip),%r10
+ cmpq %r10,%rbx
+ jb .Lin_prologue
+
+ leaq .Lepilogue_shaext(%rip),%r10
+ cmpq %r10,%rbx
+ jae .Lin_prologue
+
+ leaq -8-80(%rax),%rsi
+ leaq 512(%r8),%rdi
+ movl $10,%ecx
+.long 0xa548f3fc
+
+ jmp .Lin_prologue
.section .pdata
.p2align 2
.rva .LSEH_begin_sha256_block_data_order
.rva .LSEH_end_sha256_block_data_order
.rva .LSEH_info_sha256_block_data_order
+.rva .LSEH_begin_sha256_block_data_order_shaext
+.rva .LSEH_end_sha256_block_data_order_shaext
+.rva .LSEH_info_sha256_block_data_order_shaext
.rva .LSEH_begin_sha256_block_data_order_ssse3
.rva .LSEH_end_sha256_block_data_order_ssse3
.rva .LSEH_info_sha256_block_data_order_ssse3
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue,.Lepilogue
+.LSEH_info_sha256_block_data_order_shaext:
+.byte 9,0,0,0
+.rva shaext_handler
.LSEH_info_sha256_block_data_order_ssse3:
.byte 9,0,0,0
.rva se_handler
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
- xorps %xmm11,%xmm3
.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
xorps %xmm12,%xmm4
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm3
.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
xorps %xmm3,%xmm0
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
-.byte 102,68,15,58,68,218,0
xorps %xmm4,%xmm8
- movdqa %xmm3,%xmm5
+.byte 102,68,15,58,68,218,0
pshufd $78,%xmm3,%xmm4
pxor %xmm0,%xmm8
- pxor %xmm3,%xmm4
+ movdqa %xmm3,%xmm5
pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
movdqa %xmm8,%xmm9
- pslldq $8,%xmm8
.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
psrldq $8,%xmm9
pxor %xmm8,%xmm0
movdqa .L7_mask(%rip),%xmm8
pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
-.byte 102,68,15,58,68,231,0
pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
psllq $57,%xmm9
movdqa %xmm9,%xmm8
pslldq $8,%xmm9
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pxor %xmm1,%xmm0
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-
leaq 64(%rdx),%rdx
subq $64,%rcx
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm4
.byte 102,65,15,58,68,207,17
- xorps %xmm3,%xmm0
.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
pxor %xmm4,%xmm8
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
- movdqu (%rdx),%xmm5
+ movdqu (%rdx),%xmm9
pxor %xmm0,%xmm8
-.byte 102,65,15,56,0,234
+.byte 102,69,15,56,0,202
movdqu 16(%rdx),%xmm3
pxor %xmm1,%xmm8
- pxor %xmm5,%xmm1
+ pxor %xmm9,%xmm1
pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
movdqa %xmm4,%xmm8
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
-.byte 102,15,58,68,234,17
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
psrlq $1,%xmm0
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
-.byte 0x66,0x90
subq $32,%rcx
ja .Lmod_loop
movl _gnutls_x86_cpuid_s+8(%rip),%r10d
testl $512,%r8d
jz .Lialu
+ testl $536870912,%r10d
+ jnz _shaext_shortcut
jmp _ssse3_shortcut
.align 16
.Lepilogue:
.byte 0xf3,0xc3
.size sha1_block_data_order,.-sha1_block_data_order
+.type sha1_block_data_order_shaext,@function
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ decq %rdx
+ leaq 64(%rsi),%rax
+ paddd %xmm4,%xmm1
+ cmovneq %rax,%rsi
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz .Loop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+ .byte 0xf3,0xc3
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
.type sha1_block_data_order_ssse3,@function
.align 16
sha1_block_data_order_ssse3:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ testl $536870912,%r11d
+ jnz _shaext_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
pushq %rbx
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha256_block_data_order_shaext,@function
+.align 64
+sha256_block_data_order_shaext:
+_shaext_shortcut:
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz .Loop_shaext
+
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ .byte 0xf3,0xc3
+.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
.type sha256_block_data_order_ssse3,@function
.align 64
sha256_block_data_order_ssse3:
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
leaq K256(%rip),%rbp
.byte 102,15,56,0,207
movdqa 0(%rbp),%xmm4
-.byte 102,15,56,0,215
movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
paddd %xmm0,%xmm4
movdqa 64(%rbp),%xmm6
.byte 102,15,56,0,223
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
- xorps %xmm11,%xmm3
.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
xorps %xmm12,%xmm4
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm3
.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
xorps %xmm3,%xmm0
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
-.byte 102,68,15,58,68,218,0
xorps %xmm4,%xmm8
- movdqa %xmm3,%xmm5
+.byte 102,68,15,58,68,218,0
pshufd $78,%xmm3,%xmm4
pxor %xmm0,%xmm8
- pxor %xmm3,%xmm4
+ movdqa %xmm3,%xmm5
pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
movdqa %xmm8,%xmm9
- pslldq $8,%xmm8
.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
psrldq $8,%xmm9
pxor %xmm8,%xmm0
movdqa L$7_mask(%rip),%xmm8
pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
-.byte 102,68,15,58,68,231,0
pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
psllq $57,%xmm9
movdqa %xmm9,%xmm8
pslldq $8,%xmm9
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pxor %xmm1,%xmm0
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-
leaq 64(%rdx),%rdx
subq $64,%rcx
jnc L$mod4_loop
L$tail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm4
.byte 102,65,15,58,68,207,17
- xorps %xmm3,%xmm0
.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
pxor %xmm4,%xmm8
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
- movdqu (%rdx),%xmm5
+ movdqu (%rdx),%xmm9
pxor %xmm0,%xmm8
-.byte 102,65,15,56,0,234
+.byte 102,69,15,56,0,202
movdqu 16(%rdx),%xmm3
pxor %xmm1,%xmm8
- pxor %xmm5,%xmm1
+ pxor %xmm9,%xmm1
pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
movdqa %xmm4,%xmm8
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
-.byte 102,15,58,68,234,17
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
psrlq $1,%xmm0
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
-.byte 0x66,0x90
subq $32,%rcx
ja L$mod_loop
movl __gnutls_x86_cpuid_s+8(%rip),%r10d
testl $512,%r8d
jz L$ialu
+ testl $536870912,%r10d
+ jnz _shaext_shortcut
jmp _ssse3_shortcut
.p2align 4
.byte 0xf3,0xc3
+.p2align 5
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp L$oop_shaext
+
+.p2align 4
+L$oop_shaext:
+ decq %rdx
+ leaq 64(%rsi),%rax
+ paddd %xmm4,%xmm1
+ cmovneq %rax,%rsi
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz L$oop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+ .byte 0xf3,0xc3
+
+
.p2align 4
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ testl $536870912,%r11d
+ jnz _shaext_shortcut
testl $512,%r10d
jnz L$ssse3_shortcut
pushq %rbx
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+sha256_block_data_order_shaext:
+_shaext_shortcut:
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp L$oop_shaext
+
+.p2align 4
+L$oop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz L$oop_shaext
+
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ .byte 0xf3,0xc3
+
+
.p2align 6
sha256_block_data_order_ssse3:
L$ssse3_shortcut:
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
leaq K256(%rip),%rbp
.byte 102,15,56,0,207
movdqa 0(%rbp),%xmm4
-.byte 102,15,56,0,215
movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
paddd %xmm0,%xmm4
movdqa 64(%rbp),%xmm6
.byte 102,15,56,0,223