git.ipfire.org Git - thirdparty/gnutls.git/commitdiff
Make asm-sources
author Zoltan Fridrich <zfridric@redhat.com>
Tue, 2 Jul 2024 15:13:21 +0000 (17:13 +0200)
committer Zoltan Fridrich <zfridric@redhat.com>
Tue, 2 Jul 2024 15:14:48 +0000 (17:14 +0200)
Signed-off-by: Zoltan Fridrich <zfridric@redhat.com>
lib/accelerated/aarch64/elf/aes-aarch64.s
lib/accelerated/aarch64/elf/ghash-aarch64.s
lib/accelerated/aarch64/elf/sha1-armv8.s
lib/accelerated/aarch64/elf/sha256-armv8.s
lib/accelerated/aarch64/elf/sha512-armv8.s
lib/accelerated/aarch64/macosx/aes-aarch64.s
lib/accelerated/aarch64/macosx/ghash-aarch64.s
lib/accelerated/aarch64/macosx/sha1-armv8.s
lib/accelerated/aarch64/macosx/sha256-armv8.s
lib/accelerated/aarch64/macosx/sha512-armv8.s

index b9b4b4b6e4cd94e8fb260ddfdca34a55753b92a2..cb3bfcfc7e7c95c1a6d4ad7a9b432709ac33414d 100644 (file)
 #
 # *** This file is auto-generated ***
 #
-# 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
+# 0 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S"
+# 1 "/home/zfridric/upstream/gnutls//"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/elf/aes-aarch64.s.tmp.S" 2
 
 
-.text
 .arch armv8-a+crypto
+.text
 .align 5
 .Lrcon:
 .long 0x01,0x01,0x01,0x01
@@ -58,6 +59,8 @@
 .align 5
 aes_v8_set_encrypt_key:
 .Lenc_key:
+
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
  mov x3,#-1
@@ -229,7 +232,7 @@ aes_v8_set_encrypt_key:
 .type aes_v8_set_decrypt_key,%function
 .align 5
 aes_v8_set_decrypt_key:
-.inst 0xd503233f
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
  bl .Lenc_key
@@ -263,13 +266,14 @@ aes_v8_set_decrypt_key:
  eor x0,x0,x0
 .Ldec_key_abort:
  ldp x29,x30,[sp],#16
-.inst 0xd50323bf
+
  ret
 .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
 .globl aes_v8_encrypt
 .type aes_v8_encrypt,%function
 .align 5
 aes_v8_encrypt:
+
  ldr w3,[x2,#240]
  ld1 {v0.4s},[x2],#16
  ld1 {v2.16b},[x0]
@@ -299,6 +303,7 @@ aes_v8_encrypt:
 .type aes_v8_decrypt,%function
 .align 5
 aes_v8_decrypt:
+
  ldr w3,[x2,#240]
  ld1 {v0.4s},[x2],#16
  ld1 {v2.16b},[x0]
@@ -324,21 +329,109 @@ aes_v8_decrypt:
  st1 {v2.16b},[x1]
  ret
 .size aes_v8_decrypt,.-aes_v8_decrypt
-.globl aes_v8_cbc_encrypt
-.type aes_v8_cbc_encrypt,%function
+.globl aes_v8_ecb_encrypt
+.type aes_v8_ecb_encrypt,%function
 .align 5
-aes_v8_cbc_encrypt:
+aes_v8_ecb_encrypt:
+
+ subs x2,x2,#16
+
+ b.ne .Lecb_big_size
+ ld1 {v0.16b},[x0]
+ cmp w4,#0
+ ldr w5,[x3,#240]
+ ld1 {v5.4s,v6.4s},[x3],#32
+
+ b.eq .Lecb_small_dec
+ aese v0.16b,v5.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aese v0.16b,v6.16b
+ aesmc v0.16b,v0.16b
+ subs w5,w5,#10
+ b.eq .Lecb_128_enc
+.Lecb_round_loop:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w5,w5,#2
+ b.gt .Lecb_round_loop
+.Lecb_128_enc:
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ st1 {v0.16b},[x1]
+ b .Lecb_Final_abort
+.Lecb_small_dec:
+ aesd v0.16b,v5.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aesd v0.16b,v6.16b
+ aesimc v0.16b,v0.16b
+ subs w5,w5,#10
+ b.eq .Lecb_128_dec
+.Lecb_dec_round_loop:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w5,w5,#2
+ b.gt .Lecb_dec_round_loop
+.Lecb_128_dec:
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ st1 {v0.16b},[x1]
+ b .Lecb_Final_abort
+.Lecb_big_size:
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
- subs x2,x2,#16
  mov x8,#16
- b.lo .Lcbc_abort
+ b.lo .Lecb_done
  csel x8,xzr,x8,eq
 
- cmp w5,#0
+ cmp w4,#0
  ldr w5,[x3,#240]
  and x2,x2,#-16
- ld1 {v6.16b},[x4]
  ld1 {v0.16b},[x0],x8
 
  ld1 {v16.4s,v17.4s},[x3]
@@ -352,450 +445,2792 @@ aes_v8_cbc_encrypt:
 
  add x7,x3,#32
  mov w6,w5
- b.eq .Lcbc_dec
+ b.eq .Lecb_dec
 
- cmp w5,#2
- eor v0.16b,v0.16b,v6.16b
- eor v5.16b,v16.16b,v7.16b
- b.eq .Lcbc_enc128
+ ld1 {v1.16b},[x0],#16
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v1.16b,v1.16b
+ orr v24.16b,v1.16b,v1.16b
+ orr v1.16b,v0.16b,v0.16b
+ b.lo .Lecb_enc_tail
 
- ld1 {v2.4s,v3.4s},[x7]
- add x7,x3,#16
- add x6,x3,#16*4
- add x12,x3,#16*5
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- add x14,x3,#16*6
- add x3,x3,#16*7
- b .Lenter_cbc_enc
+ orr v1.16b,v3.16b,v3.16b
+ ld1 {v24.16b},[x0],#16
+ cmp x2,#32
+ b.lo .Loop3x_ecb_enc
 
-.align 4
-.Loop_cbc_enc:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-.Lenter_cbc_enc:
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.4s},[x6]
- cmp w5,#4
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x12]
- b.eq .Lcbc_enc192
+ ld1 {v25.16b},[x0],#16
+ ld1 {v26.16b},[x0],#16
+ sub x2,x2,#32
+ mov w6,w5
 
+.Loop5x_ecb_enc:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- ld1 {v16.4s},[x14]
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x3]
- nop
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop5x_ecb_enc
 
-.Lcbc_enc192:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- subs x2,x2,#16
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ cmp x2,#0x40
+ sub x2,x2,#0x50
+
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
  aese v0.16b,v18.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v18.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v18.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v18.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v18.16b
+ aesmc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
  aese v0.16b,v19.16b
  aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ aese v1.16b,v19.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v19.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v19.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v19.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v20.16b
  aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v20.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v20.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v21.16b
  aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7]
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v21.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v21.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v22.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v22.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v22.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs .Loop_cbc_enc
+ ld1 {v2.16b},[x0],#16
+ aese v1.16b,v23.16b
+ ld1 {v3.16b},[x0],#16
+ aese v24.16b,v23.16b
+ ld1 {v27.16b},[x0],#16
+ aese v25.16b,v23.16b
+ ld1 {v28.16b},[x0],#16
+ aese v26.16b,v23.16b
+ ld1 {v29.16b},[x0],#16
+ cbz x6,.Lecb_enc_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v7.16b,v0.16b
+ orr v0.16b,v2.16b,v2.16b
+ eor v5.16b,v7.16b,v1.16b
+ orr v1.16b,v3.16b,v3.16b
+ eor v17.16b,v7.16b,v24.16b
+ orr v24.16b,v27.16b,v27.16b
+ eor v30.16b,v7.16b,v25.16b
+ orr v25.16b,v28.16b,v28.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ orr v26.16b,v29.16b,v29.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs .Loop5x_ecb_enc
 
- st1 {v6.16b},[x1],#16
- b .Lcbc_done
+ add x2,x2,#0x50
+ cbz x2,.Lecb_done
 
-.align 5
-.Lcbc_enc128:
- ld1 {v2.4s,v3.4s},[x7]
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- b .Lenter_cbc_enc128
-.Loop_cbc_enc128:
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ orr v0.16b,v27.16b,v27.16b
+ orr v1.16b,v28.16b,v28.16b
+ orr v24.16b,v29.16b,v29.16b
+ b.lo .Lecb_enc_tail
+
+ b .Loop3x_ecb_enc
+
+.align 4
+.Lecb_enc_tail4x:
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
+ eor v30.16b,v7.16b,v25.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+
+ b .Lecb_done
+.align 4
+.Loop3x_ecb_enc:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-.Lenter_cbc_enc128:
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- subs x2,x2,#16
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v18.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ecb_enc
+
+ aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v0.16b,v19.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ subs x2,x2,#0x30
+ csel x6,x2,x6,lo
+ aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ add x0,x0,x6
+
+
+ mov x7,x3
  aese v0.16b,v20.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v2.16b},[x0],#16
  aese v0.16b,v21.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v3.16b},[x0],#16
  aese v0.16b,v22.16b
  aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
  aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs .Loop_cbc_enc128
-
- st1 {v6.16b},[x1],#16
- b .Lcbc_done
-.align 5
-.Lcbc_dec:
- ld1 {v18.16b},[x0],#16
- subs x2,x2,#32
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ ld1 {v16.4s},[x7],#16
  add w6,w5,#2
- orr v3.16b,v0.16b,v0.16b
- orr v1.16b,v0.16b,v0.16b
- orr v19.16b,v18.16b,v18.16b
- b.lo .Lcbc_dec_tail
+ eor v4.16b,v7.16b,v0.16b
+ eor v5.16b,v7.16b,v1.16b
+ eor v24.16b,v24.16b,v7.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v24.16b},[x1],#16
+ orr v24.16b,v27.16b,v27.16b
+ b.hs .Loop3x_ecb_enc
 
- orr v1.16b,v18.16b,v18.16b
- ld1 {v18.16b},[x0],#16
- orr v2.16b,v0.16b,v0.16b
- orr v3.16b,v1.16b,v1.16b
- orr v19.16b,v18.16b,v18.16b
+ cmn x2,#0x30
+ b.eq .Lecb_done
+ nop
 
-.Loop3x_cbc_dec:
- aesd v0.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v16.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
+.Lecb_enc_tail:
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
  ld1 {v16.4s},[x7],#16
  subs w6,w6,#2
- aesd v0.16b,v17.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v17.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
  ld1 {v17.4s},[x7],#16
- b.gt .Loop3x_cbc_dec
+ b.gt .Lecb_enc_tail
 
- aesd v0.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v16.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
- eor v4.16b,v6.16b,v7.16b
- subs x2,x2,#0x30
- eor v5.16b,v2.16b,v7.16b
- csel x6,x2,x6,lo
- aesd v0.16b,v17.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v17.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ cmn x2,#0x20
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ b.eq .Lecb_enc_one
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b .Lecb_done
+
+.Lecb_enc_one:
+ eor v5.16b,v7.16b,v24.16b
+ st1 {v5.16b},[x1],#16
+ b .Lecb_done
+.align 5
+.Lecb_dec:
+ ld1 {v1.16b},[x0],#16
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v1.16b,v1.16b
+ orr v24.16b,v1.16b,v1.16b
+ orr v1.16b,v0.16b,v0.16b
+ b.lo .Lecb_dec_tail
+
+ orr v1.16b,v3.16b,v3.16b
+ ld1 {v24.16b},[x0],#16
+ cmp x2,#32
+ b.lo .Loop3x_ecb_dec
+
+ ld1 {v25.16b},[x0],#16
+ ld1 {v26.16b},[x0],#16
+ sub x2,x2,#32
+ mov w6,w5
+
+.Loop5x_ecb_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop5x_ecb_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ cmp x2,#0x40
+ sub x2,x2,#0x50
+
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v18.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v18.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v18.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v18.16b
+ aesimc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v19.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v19.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v19.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v19.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v20.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v20.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v21.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v21.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v22.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v22.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v23.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v1.16b,v23.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v24.16b,v23.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v25.16b,v23.16b
+ ld1 {v28.16b},[x0],#16
+ aesd v26.16b,v23.16b
+ ld1 {v29.16b},[x0],#16
+ cbz x6,.Lecb_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v7.16b,v0.16b
+ orr v0.16b,v2.16b,v2.16b
+ eor v5.16b,v7.16b,v1.16b
+ orr v1.16b,v3.16b,v3.16b
+ eor v17.16b,v7.16b,v24.16b
+ orr v24.16b,v27.16b,v27.16b
+ eor v30.16b,v7.16b,v25.16b
+ orr v25.16b,v28.16b,v28.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ orr v26.16b,v29.16b,v29.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs .Loop5x_ecb_dec
+
+ add x2,x2,#0x50
+ cbz x2,.Lecb_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ orr v0.16b,v27.16b,v27.16b
+ orr v1.16b,v28.16b,v28.16b
+ orr v24.16b,v29.16b,v29.16b
+ b.lo .Lecb_dec_tail
+
+ b .Loop3x_ecb_dec
+
+.align 4
+.Lecb_tail4x:
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
+ eor v30.16b,v7.16b,v25.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+
+ b .Lecb_done
+.align 4
+.Loop3x_ecb_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ecb_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ subs x2,x2,#0x30
+ csel x6,x2,x6,lo
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
  add x0,x0,x6
 
 
- orr v6.16b,v19.16b,v19.16b
  mov x7,x3
  aesd v0.16b,v20.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v20.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v20.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
  ld1 {v2.16b},[x0],#16
  aesd v0.16b,v21.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v21.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v21.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
  ld1 {v3.16b},[x0],#16
  aesd v0.16b,v22.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v22.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v22.16b
- aesimc v18.16b,v18.16b
- ld1 {v19.16b},[x0],#16
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
  aesd v0.16b,v23.16b
  aesd v1.16b,v23.16b
- aesd v18.16b,v23.16b
+ aesd v24.16b,v23.16b
  ld1 {v16.4s},[x7],#16
  add w6,w5,#2
- eor v4.16b,v4.16b,v0.16b
- eor v5.16b,v5.16b,v1.16b
- eor v18.16b,v18.16b,v17.16b
+ eor v4.16b,v7.16b,v0.16b
+ eor v5.16b,v7.16b,v1.16b
+ eor v24.16b,v24.16b,v7.16b
  ld1 {v17.4s},[x7],#16
  st1 {v4.16b},[x1],#16
  orr v0.16b,v2.16b,v2.16b
  st1 {v5.16b},[x1],#16
  orr v1.16b,v3.16b,v3.16b
- st1 {v18.16b},[x1],#16
- orr v18.16b,v19.16b,v19.16b
- b.hs .Loop3x_cbc_dec
+ st1 {v24.16b},[x1],#16
+ orr v24.16b,v27.16b,v27.16b
+ b.hs .Loop3x_ecb_dec
 
  cmn x2,#0x30
- b.eq .Lcbc_done
+ b.eq .Lecb_done
  nop
 
-.Lcbc_dec_tail:
+.Lecb_dec_tail:
  aesd v1.16b,v16.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
  ld1 {v16.4s},[x7],#16
  subs w6,w6,#2
  aesd v1.16b,v17.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
  ld1 {v17.4s},[x7],#16
- b.gt .Lcbc_dec_tail
+ b.gt .Lecb_dec_tail
 
  aesd v1.16b,v16.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v17.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v20.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v20.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
  cmn x2,#0x20
  aesd v1.16b,v21.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v21.16b
- aesimc v18.16b,v18.16b
- eor v5.16b,v6.16b,v7.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v22.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v22.16b
- aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v23.16b
- aesd v18.16b,v23.16b
- b.eq .Lcbc_dec_one
- eor v5.16b,v5.16b,v1.16b
- eor v17.16b,v17.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+ aesd v24.16b,v23.16b
+ b.eq .Lecb_dec_one
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
  st1 {v5.16b},[x1],#16
  st1 {v17.16b},[x1],#16
- b .Lcbc_done
+ b .Lecb_done
 
-.Lcbc_dec_one:
- eor v5.16b,v5.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+.Lecb_dec_one:
+ eor v5.16b,v7.16b,v24.16b
  st1 {v5.16b},[x1],#16
 
-.Lcbc_done:
- st1 {v6.16b},[x4]
-.Lcbc_abort:
+.Lecb_done:
  ldr x29,[sp],#16
+.Lecb_Final_abort:
  ret
-.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
-.globl aes_v8_ctr32_encrypt_blocks
-.type aes_v8_ctr32_encrypt_blocks,%function
+.size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
+.globl aes_v8_cbc_encrypt
+.type aes_v8_cbc_encrypt,%function
 .align 5
-aes_v8_ctr32_encrypt_blocks:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- ldr w5,[x3,#240]
-
- ldr w8, [x4, #12]
+aes_v8_cbc_encrypt:
 
 
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo .Lcbc_abort
+ csel x8,xzr,x8,eq
 
- ld1 {v0.4s},[x4]
+ cmp w5,#0
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
 
  ld1 {v16.4s,v17.4s},[x3]
- sub w5,w5,#4
- mov x12,#16
- cmp x2,#2
+ sub w5,w5,#6
  add x7,x3,x5,lsl#4
  sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
  ld1 {v20.4s,v21.4s},[x7],#32
  ld1 {v22.4s,v23.4s},[x7],#32
  ld1 {v7.4s},[x7]
+
  add x7,x3,#32
  mov w6,w5
- csel x12,xzr,x12,lo
+ b.eq .Lcbc_dec
 
- rev w8, w8
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq .Lcbc_enc128
 
- orr v1.16b,v0.16b,v0.16b
- add w10, w8, #1
- orr v18.16b,v0.16b,v0.16b
- add w8, w8, #2
- orr v6.16b,v0.16b,v0.16b
- rev w10, w10
- mov v1.s[3],w10
- b.ls .Lctr32_tail
- rev w12, w8
- sub x2,x2,#3
- mov v18.s[3],w12
- b .Loop3x_ctr32
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b .Lenter_cbc_enc
 
 .align 4
-.Loop3x_ctr32:
+.Loop_cbc_enc:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc:
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v17.16b
- aesmc v18.16b,v18.16b
- ld1 {v17.4s},[x7],#16
- b.gt .Loop3x_ctr32
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq .Lcbc_enc192
 
  aese v0.16b,v16.16b
- aesmc v4.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v5.16b,v1.16b
- ld1 {v2.16b},[x0],#16
- orr v0.16b,v6.16b,v6.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
- orr v1.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v17.16b
- aesmc v5.16b,v5.16b
- ld1 {v19.16b},[x0],#16
- mov x7,x3
- aese v18.16b,v17.16b
- aesmc v17.16b,v18.16b
- orr v18.16b,v6.16b,v6.16b
- add w9,w8,#1
- aese v4.16b,v20.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v20.16b
- aesmc v5.16b,v5.16b
- eor v2.16b,v2.16b,v7.16b
- add w10,w8,#2
- aese v17.16b,v20.16b
- aesmc v17.16b,v17.16b
- eor v3.16b,v3.16b,v7.16b
- add w8,w8,#3
- aese v4.16b,v21.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v21.16b
- aesmc v5.16b,v5.16b
- eor v19.16b,v19.16b,v7.16b
- rev w9,w9
- aese v17.16b,v21.16b
- aesmc v17.16b,v17.16b
- mov v0.s[3], w9
- rev w10,w10
- aese v4.16b,v22.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v22.16b
- aesmc v5.16b,v5.16b
- mov v1.s[3], w10
- rev w12,w8
- aese v17.16b,v22.16b
- aesmc v17.16b,v17.16b
- mov v18.s[3], w12
- subs x2,x2,#3
- aese v4.16b,v23.16b
- aese v5.16b,v23.16b
- aese v17.16b,v23.16b
-
- eor v2.16b,v2.16b,v4.16b
- ld1 {v16.4s},[x7],#16
- st1 {v2.16b},[x1],#16
- eor v3.16b,v3.16b,v5.16b
- mov w6,w5
- st1 {v3.16b},[x1],#16
- eor v19.16b,v19.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- st1 {v19.16b},[x1],#16
- b.hs .Loop3x_ctr32
-
- adds x2,x2,#3
- b.eq .Lctr32_done
- cmp x2,#1
- mov x12,#16
- csel x12,xzr,x12,eq
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
 
-.Lctr32_tail:
+.Lcbc_enc192:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
+ subs x2,x2,#16
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v17.4s},[x7],#16
- b.gt .Lctr32_tail
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc
+
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
 
+.align 5
+.Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc128:
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v2.16b},[x0],x12
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
  aese v0.16b,v20.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v20.16b
- aesmc v1.16b,v1.16b
- ld1 {v3.16b},[x0]
  aese v0.16b,v21.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v21.16b
- aesmc v1.16b,v1.16b
- eor v2.16b,v2.16b,v7.16b
  aese v0.16b,v22.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v22.16b
- aesmc v1.16b,v1.16b
- eor v3.16b,v3.16b,v7.16b
+ eor v16.16b,v16.16b,v5.16b
  aese v0.16b,v23.16b
- aese v1.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc128
 
- cmp x2,#1
- eor v2.16b,v2.16b,v0.16b
- eor v3.16b,v3.16b,v1.16b
- st1 {v2.16b},[x1],#16
- b.eq .Lctr32_done
- st1 {v3.16b},[x1]
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
+.align 5
+.Lcbc_dec:
+ ld1 {v24.16b},[x0],#16
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v27.16b,v24.16b,v24.16b
+ b.lo .Lcbc_dec_tail
 
-.Lctr32_done:
- ldr x29,[sp],#16
- ret
-.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+ orr v1.16b,v24.16b,v24.16b
+ ld1 {v24.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v27.16b,v24.16b,v24.16b
+ cmp x2,#32
+ b.lo .Loop3x_cbc_dec
+
+ ld1 {v25.16b},[x0],#16
+ ld1 {v26.16b},[x0],#16
+ sub x2,x2,#32
+ mov w6,w5
+ orr v28.16b,v25.16b,v25.16b
+ orr v29.16b,v26.16b,v26.16b
+
+.Loop5x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop5x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ cmp x2,#0x40
+ sub x2,x2,#0x50
+
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v18.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v18.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v18.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v18.16b
+ aesimc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v19.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v19.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v19.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v19.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v20.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v20.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v21.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v21.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v22.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v22.16b
+ aesimc v26.16b,v26.16b
+
+ eor v4.16b,v6.16b,v7.16b
+ aesd v0.16b,v23.16b
+ eor v5.16b,v2.16b,v7.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v1.16b,v23.16b
+ eor v17.16b,v3.16b,v7.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v24.16b,v23.16b
+ eor v30.16b,v27.16b,v7.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v25.16b,v23.16b
+ eor v31.16b,v28.16b,v7.16b
+ ld1 {v28.16b},[x0],#16
+ aesd v26.16b,v23.16b
+ orr v6.16b,v29.16b,v29.16b
+ ld1 {v29.16b},[x0],#16
+ cbz x6,.Lcbc_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b
+ orr v0.16b,v2.16b,v2.16b
+ eor v5.16b,v5.16b,v1.16b
+ orr v1.16b,v3.16b,v3.16b
+ eor v17.16b,v17.16b,v24.16b
+ orr v24.16b,v27.16b,v27.16b
+ eor v30.16b,v30.16b,v25.16b
+ orr v25.16b,v28.16b,v28.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ orr v26.16b,v29.16b,v29.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs .Loop5x_cbc_dec
+
+ add x2,x2,#0x50
+ cbz x2,.Lcbc_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ orr v0.16b,v27.16b,v27.16b
+ orr v2.16b,v27.16b,v27.16b
+ orr v1.16b,v28.16b,v28.16b
+ orr v3.16b,v28.16b,v28.16b
+ orr v24.16b,v29.16b,v29.16b
+ orr v27.16b,v29.16b,v29.16b
+ b.lo .Lcbc_dec_tail
+
+ b .Loop3x_cbc_dec
+
+.align 4
+.Lcbc_tail4x:
+ eor v5.16b,v4.16b,v1.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v30.16b,v30.16b,v25.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+
+ b .Lcbc_done
+.align 4
+.Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6
+
+
+ orr v6.16b,v27.16b,v27.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ ld1 {v16.4s},[x7],#16
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v24.16b,v24.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v24.16b},[x1],#16
+ orr v24.16b,v27.16b,v27.16b
+ b.hs .Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ b.eq .Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v24.16b
+ orr v6.16b,v27.16b,v27.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ eor v5.16b,v5.16b,v24.16b
+ orr v6.16b,v27.16b,v27.16b
+ st1 {v5.16b},[x1],#16
+
+.Lcbc_done:
+ st1 {v6.16b},[x4]
+.Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl aes_v8_ctr32_encrypt_blocks // counter-mode encryption of whole 16-byte blocks; only the last 32-bit word of the counter block is incremented
+.type aes_v8_ctr32_encrypt_blocks,%function // in: x0=input, x1=output, x2=number of 16-byte blocks, x3=key schedule, x4=16-byte counter block
+.align 5
+aes_v8_ctr32_encrypt_blocks:
+ // roles: w8=counter after byte swap, v6=counter-block template, v7=last round key, v20-v23=the four round keys before it, w5/w6=round-loop counts
+ // the block at x4 is only read here; the advanced counter is never stored back
+ stp x29,x30,[sp,#-16]! // push x29/x30 (16 bytes)
+ add x29,sp,#0 // x29 = sp
+ ldr w5,[x3,#240] // w5 = round count kept at offset 240 of the key schedule
+
+ ldr w8, [x4, #12] // w8 = last 32-bit word of the counter block
+
+
+
+ ld1 {v0.4s},[x4] // v0 = whole counter block (first keystream input)
+
+ ld1 {v16.4s,v17.4s},[x3] // v16,v17 = first two round keys
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2 // these flags stay live down to the csel and b.ls below (nothing in between sets flags)
+ add x7,x3,x5,lsl#4 // x7 = address of round key number (rounds-4)
+ sub w5,w5,#2 // w5 = rounds-6, the count for the two-rounds-per-iteration loops
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7] // v7 = final round key
+ add x7,x3,#32 // x7 = address of the third round key
+ mov w6,w5
+ csel x12,xzr,x12,lo // x12 = 0 when x2 is below 2, else 16 (post-increment for the first input load in the tail)
+
+ rev w8, w8 // byte-reverse the counter word so it can be incremented with plain adds
+
+ orr v1.16b,v0.16b,v0.16b // v1 = copy of counter block
+ add w10, w8, #1
+ orr v18.16b,v0.16b,v0.16b // v18 = copy of counter block
+ add w8, w8, #2
+ orr v6.16b,v0.16b,v0.16b // v6 = untouched copy, reused as template for every later counter block
+ rev w10, w10
+ mov v1.s[3],w10 // v1 = counter block with counter+1
+ b.ls .Lctr32_tail // x2 at most 2: handled entirely by the tail
+ rev w12, w8
+ sub x2,x2,#3 // pre-subtract the three blocks held in v0,v1,v18
+ mov v18.s[3],w12 // v18 = counter block with counter+2
+ cmp x2,#32
+ b.lo .Loop3x_ctr32 // remaining count (already reduced by 3) below 32: use the 3-block loop
+ // set up two more counter blocks for the 5-block loop
+ add w13,w8,#1
+ add w14,w8,#2
+ orr v24.16b,v0.16b,v0.16b
+ rev w13,w13
+ orr v25.16b,v0.16b,v0.16b
+ rev w14,w14
+ mov v24.s[3],w13
+ sub x2,x2,#2 // x2 is now pre-decremented by 5 in total
+ mov v25.s[3],w14
+ add w8,w8,#2
+ b .Loop5x_ctr32
+
+.align 4
+.Loop5x_ctr32: // two AES rounds per iteration on five blocks: v0,v1,v18,v24,v25
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v16.4s},[x7],#16 // fetch next round key
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v17.4s},[x7],#16 // fetch next round key
+ b.gt .Loop5x_ctr32
+ // round loop finished: run the remaining rounds and reload v16/v17 from the start of the schedule for the next pass
+ mov x7,x3
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v16.4s},[x7],#16 // v16 = first round key again
+
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v17.4s},[x7],#16 // v17 = second round key again
+ // counter values for the next five blocks are computed in w9,w10,w12,w13,w14 while the rounds continue
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ add w9,w8,#1
+ add w10,w8,#2
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ add w12,w8,#3
+ add w13,w8,#4
+ aese v18.16b,v20.16b
+ aesmc v18.16b,v18.16b
+ add w14,w8,#5
+ rev w9,w9
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ rev w10,w10
+ rev w12,w12
+ aese v25.16b,v20.16b
+ aesmc v25.16b,v25.16b
+ rev w13,w13
+ rev w14,w14
+
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v21.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v21.16b
+ aesmc v25.16b,v25.16b
+ // five input blocks are loaded into v2,v3,v19,v26,v27 in between the round instructions
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v2.16b},[x0],#16
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0],#16
+ aese v18.16b,v22.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v26.16b},[x0],#16
+ aese v25.16b,v22.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v27.16b},[x0],#16
+ // last round: no aesmc; the final round key v7 is XORed into the input blocks instead of the cipher state
+ aese v0.16b,v23.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v1.16b,v23.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v18.16b,v23.16b
+ eor v19.16b,v19.16b,v7.16b
+ aese v24.16b,v23.16b
+ eor v26.16b,v26.16b,v7.16b
+ aese v25.16b,v23.16b
+ eor v27.16b,v27.16b,v7.16b
+ // output = input ^ keystream; each state register is then reset from the template v6
+ eor v2.16b,v2.16b,v0.16b
+ orr v0.16b,v6.16b,v6.16b
+ eor v3.16b,v3.16b,v1.16b
+ orr v1.16b,v6.16b,v6.16b
+ eor v19.16b,v19.16b,v18.16b
+ orr v18.16b,v6.16b,v6.16b
+ eor v26.16b,v26.16b,v24.16b
+ orr v24.16b,v6.16b,v6.16b
+ eor v27.16b,v27.16b,v25.16b
+ orr v25.16b,v6.16b,v6.16b
+ // store the five output blocks and insert the new counter words into lane 3 of each state register
+ st1 {v2.16b},[x1],#16
+ mov v0.s[3],w9
+ st1 {v3.16b},[x1],#16
+ mov v1.s[3],w10
+ st1 {v19.16b},[x1],#16
+ mov v18.s[3],w12
+ st1 {v26.16b},[x1],#16
+ mov v24.s[3],w13
+ st1 {v27.16b},[x1],#16
+ mov v25.s[3],w14
+
+ mov w6,w5 // rearm the round-loop counter
+ cbz x2,.Lctr32_done // x2 was pre-decremented by 5, so zero means those five were the last blocks
+
+ add w8,w8,#5
+ subs x2,x2,#5
+ b.hs .Loop5x_ctr32 // at least five more blocks
+ // fewer than five blocks left: undo the last subtraction and counter advance
+ add x2,x2,#5
+ sub w8,w8,#5
+
+ cmp x2,#2
+ mov x12,#16
+ csel x12,xzr,x12,lo // x12 = 0 if exactly one block is left (x2 is non-zero here), else 16
+ b.ls .Lctr32_tail // one or two blocks left
+
+ sub x2,x2,#3 // three or four left: run the 3-block loop once
+ add w8,w8,#3
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32: // two AES rounds per iteration on three blocks: v0,v1,v18
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ctr32
+ // remaining rounds: the states move to v4,v5,v17 so v0,v1,v18 can be re-seeded from v6 for the next pass
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ orr v0.16b,v6.16b,v6.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3 // rewind the round-key pointer to the start of the schedule
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ orr v18.16b,v6.16b,v6.16b
+ add w9,w8,#1
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b // fold the final round key into the input block
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+ eor v19.16b,v19.16b,v7.16b
+ rev w9,w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ mov v0.s[3], w9
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ mov v1.s[3], w10
+ rev w12,w8
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ mov v18.s[3], w12
+ subs x2,x2,#3 // flags consumed by the b.hs at the bottom of this block
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+ // output = (input ^ last round key) ^ state; v16/v17 are reloaded with the first two round keys
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v19.16b},[x1],#16
+ b.hs .Loop3x_ctr32 // at least three more blocks
+ // fewer than three left: restore the true remaining count
+ adds x2,x2,#3
+ b.eq .Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq // one block left: stride 0, so the second load below re-reads the same block
+
+.Lctr32_tail: // one or two remaining blocks, always computed as a pair in v0,v1
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12 // first input block; x0 advances by 16 or by 0
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0] // second input block (same address as the first when x12 is 0)
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq .Lctr32_done // x2 == 1: the second result is discarded
+ st1 {v3.16b},[x1]
+
+.Lctr32_done:
+ ldr x29,[sp],#16 // restore x29 and pop the 16-byte frame; x30 is not reloaded (no instruction in this function writes it)
+ ret
+.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+.globl aes_v8_xts_encrypt // XTS-style encryption: each block is XORed with a tweak, AES-encrypted, and XORed with the same tweak again
+.type aes_v8_xts_encrypt,%function // in: x0=input, x1=output, x2=length in bytes, x3=key schedule for data, x4=key schedule for the tweak, x5=16-byte IV
+.align 5
+aes_v8_xts_encrypt:
+ // a length of exactly 16 bytes takes the stack-free path below; every other length goes to .Lxts_enc_big_size
+ cmp x2,#16
+
+ b.ne .Lxts_enc_big_size
+ // initial tweak: v6 = AES encryption of the IV under the key schedule at x4
+ ldr w6,[x4,#240] // w6 = round count kept at offset 240 of the schedule
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+.Loop_enc_iv_enc: // two rounds per iteration
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt .Loop_enc_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4] // last round key of the tweak schedule
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b // v6 = tweak
+
+ ld1 {v0.16b},[x0] // the single input block
+ eor v0.16b,v6.16b,v0.16b // pre-whiten with the tweak
+
+ ldr w6,[x3,#240]
+ ld1 {v28.4s,v29.4s},[x3],#32
+
+ aese v0.16b,v28.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aese v0.16b,v29.16b
+ aesmc v0.16b,v0.16b
+ subs w6,w6,#10
+ b.eq .Lxts_128_enc // round count of 10: no extra rounds needed
+.Lxts_enc_round_loop: // extra rounds for round counts above 10, two per iteration
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w6,w6,#2
+ b.gt .Lxts_enc_round_loop
+.Lxts_128_enc: // final eight rounds using v16..v23, then last round key v7
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b // apply last round key
+ eor v0.16b,v0.16b,v6.16b // post-whiten with the tweak
+ st1 {v0.16b},[x1]
+ b .Lxts_enc_final_abort // nothing was pushed on this path, so skip the register restore
+
+.align 4
+.Lxts_enc_big_size:
+ stp x19,x20,[sp,#-64]! // allocate 64 bytes and save x19-x22, d8-d11 (all are written below)
+ stp x21,x22,[sp,#48]
+ stp d8,d9,[sp,#32]
+ stp d10,d11,[sp,#16]
+
+ // x21 = trailing bytes beyond a multiple of 16; x2 = whole-block byte count minus 16
+ and x21,x2,#0xf
+ and x2,x2,#-16
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo .Lxts_abort // length below 16 bytes: nothing is written
+ csel x8,xzr,x8,eq // x8 = 0 when exactly one whole block exists, else 16 (stride of the first input load)
+
+ // initial tweak: v6 = AES encryption of the IV under the key schedule at x4
+ ldr w6,[x4,#240]
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+.Loop_iv_enc:
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt .Loop_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b // v6 = first tweak
+
+ // tweak update, repeated throughout: treat x10:x9 as a 128-bit value, shift left by one bit,
+ // and XOR 0x87 into the low word when the bit shifted out was set (doubling in GF(2^128))
+ // x9 = low half, x10 = high half, w19 = 0x87 constant, x22/x11 = scratch
+ fmov x9,d6
+ fmov x10,v6.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32 // rotate the high half by 32 so its top word lands in w22
+ extr x10,x10,x9,#63 // high = (high shifted left 1) | (low shifted right 63)
+ and w11,w19,w22,asr#31 // w11 = 0x87 if the old top bit was set, else 0
+ eor x9,x11,x9,lsl#1 // low = (low shifted left 1) ^ w11
+ fmov d8,x9
+ fmov v8.d[1],x10 // v8 = second tweak
+
+ ldr w5,[x3,#240] // w5 = round count of the data key
+ ld1 {v0.16b},[x0],x8 // first input block
+
+ ld1 {v16.4s,v17.4s},[x3] // v16,v17 = first two round keys
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // x7 = address of round key number (rounds-6)
+ sub w5,w5,#2 // w5 = rounds-8
+ ld1 {v18.4s,v19.4s},[x7],#32 // v18..v23 = the six round keys before the last
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7] // v7 = last round key
+
+ add x7,x3,#32 // x7 = address of the third round key
+ mov w6,w5
+
+
+.Lxts_enc:
+ ld1 {v24.16b},[x0],#16 // second input block
+ subs x2,x2,#32
+ add w6,w5,#2 // round count for the tail loops, which use only v20..v23 after the loop
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v28.16b,v0.16b,v0.16b
+ orr v27.16b,v24.16b,v24.16b
+ orr v29.16b,v24.16b,v24.16b
+ b.lo .Lxts_inner_enc_tail // at most two whole blocks: go straight to the 1/2-block tail
+ eor v0.16b,v0.16b,v6.16b // block 0 ^ tweak v6
+ eor v24.16b,v24.16b,v8.16b // block 1 ^ tweak v8
+
+ // next tweak into v9
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d9,x9
+ fmov v9.d[1],x10
+
+
+ orr v1.16b,v24.16b,v24.16b
+ ld1 {v24.16b},[x0],#16 // third input block
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ eor v27.16b,v24.16b,v9.16b
+ eor v24.16b,v24.16b,v9.16b // block 2 ^ tweak v9
+ cmp x2,#32
+ b.lo .Lxts_outer_enc_tail // not enough data for a 5-block pass
+
+ // next tweak into v10
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d10,x9
+ fmov v10.d[1],x10
+
+ ld1 {v25.16b},[x0],#16 // fourth input block
+ // next tweak into v11
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v26.16b},[x0],#16 // fifth input block
+ eor v25.16b,v25.16b,v10.16b
+ eor v26.16b,v26.16b,v11.16b
+ sub x2,x2,#32
+ mov w6,w5
+ b .Loop5x_xts_enc
+
+.align 4
+.Loop5x_xts_enc: // two AES rounds per iteration on five pre-whitened blocks: v0,v1,v24,v25,v26
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop5x_xts_enc
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ subs x2,x2,#0x50 // account for five blocks; these flags feed the csel below and the b.hs at the end of the pass
+
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ csel x6,xzr,x2,gt // x6 = 0 if x2 is positive, else x2 (zero or negative)
+ mov x7,x3 // rewind the round-key pointer
+
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v18.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v18.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v18.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v18.16b
+ aesmc v26.16b,v26.16b
+ add x0,x0,x6 // step the input pointer back when x6 is negative; NOTE(review): presumably keeps the five loads below inside the input buffer - confirm
+
+
+ add x6,x2,#0x60 // x6 == 0 exactly when x2 == -0x60; tested by the cbz before the stores
+
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v19.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v19.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v19.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v19.16b
+ aesmc v26.16b,v26.16b
+
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v20.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v20.16b
+ aesmc v26.16b,v26.16b
+
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v21.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v21.16b
+ aesmc v26.16b,v26.16b
+
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v22.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v22.16b
+ aesmc v26.16b,v26.16b
+ // last round: for each block, combine (last round key ^ its tweak) into v4,v5,v17,v30,v31, then advance that tweak register by five steps overall
+ eor v4.16b,v7.16b,v6.16b
+ aese v0.16b,v23.16b
+ // next tweak into v6
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v7.16b,v8.16b
+ ld1 {v2.16b},[x0],#16
+ aese v1.16b,v23.16b
+ // next tweak into v8
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d8,x9
+ fmov v8.d[1],x10
+ eor v17.16b,v7.16b,v9.16b
+ ld1 {v3.16b},[x0],#16
+ aese v24.16b,v23.16b
+ // next tweak into v9
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d9,x9
+ fmov v9.d[1],x10
+ eor v30.16b,v7.16b,v10.16b
+ ld1 {v27.16b},[x0],#16
+ aese v25.16b,v23.16b
+ // next tweak into v10
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d10,x9
+ fmov v10.d[1],x10
+ eor v31.16b,v7.16b,v11.16b
+ ld1 {v28.16b},[x0],#16
+ aese v26.16b,v23.16b
+
+ // next tweak into v11
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v29.16b},[x0],#16
+ cbz x6,.Lxts_enc_tail4x // x2 == -0x60: finish through the four-store path
+ ld1 {v16.4s},[x7],#16 // first round key for the next pass
+ eor v4.16b,v4.16b,v0.16b // finished output blocks are v4,v5,v17,v30,v31
+ eor v0.16b,v2.16b,v6.16b // next five inputs pre-whitened with the new tweaks
+ eor v5.16b,v5.16b,v1.16b
+ eor v1.16b,v3.16b,v8.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v30.16b,v30.16b,v25.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ eor v26.16b,v29.16b,v11.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16 // second round key for the next pass
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs .Loop5x_xts_enc // flags still from the subs x2,x2,#0x50 above: no borrow means another full pass
+
+
+ // special case x2 == -0x10: rotate the tweak registers down one slot and run one more 5-block pass
+ cmn x2,#0x10
+ b.ne .Loop5x_enc_after
+ orr v11.16b,v10.16b,v10.16b
+ orr v10.16b,v9.16b,v9.16b
+ orr v9.16b,v8.16b,v8.16b
+ orr v8.16b,v6.16b,v6.16b
+ fmov x9,d11 // resynchronise x10:x9 with the tweak now held in v11
+ fmov x10,v11.d[1]
+ eor v0.16b,v6.16b,v2.16b
+ eor v1.16b,v8.16b,v3.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v26.16b,v29.16b,v11.16b
+ b.eq .Loop5x_xts_enc // flags are still those of the cmn above, so this branch is always taken here
+
+.Loop5x_enc_after:
+ add x2,x2,#0x50 // undo the speculative 5-block subtraction
+ cbz x2,.Lxts_enc_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ b.lo .Lxts_inner_enc_tail // fewer than three blocks left
+
+ eor v0.16b,v6.16b,v27.16b
+ eor v1.16b,v8.16b,v28.16b
+ eor v24.16b,v29.16b,v9.16b
+ b .Lxts_outer_enc_tail
+
+.align 4
+.Lxts_enc_tail4x: // stores four result blocks, taken from states v1,v24,v25,v26
+ add x0,x0,#16
+ eor v5.16b,v1.16b,v5.16b
+ st1 {v5.16b},[x1],#16
+ eor v17.16b,v24.16b,v17.16b
+ st1 {v17.16b},[x1],#16
+ eor v30.16b,v25.16b,v30.16b
+ eor v31.16b,v26.16b,v31.16b
+ st1 {v30.16b,v31.16b},[x1],#32
+
+ b .Lxts_enc_done
+.align 4
+.Lxts_outer_enc_tail: // three-block path: two rounds per iteration on v0,v1,v24
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lxts_outer_enc_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ eor v4.16b,v6.16b,v7.16b // last round key ^ tweak for block 0
+ subs x2,x2,#0x30 // flags consumed by the csel below
+ // restart the scalar tweak from v9 and derive the following tweak into v6
+ fmov x9,d9
+ fmov x10,v9.d[1]
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v8.16b,v7.16b // last round key ^ tweak for block 1
+ csel x6,x2,x6,lo // on borrow x6 = x2 (negative); otherwise x6 keeps the round counter left over from the loop above
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ eor v17.16b,v9.16b,v7.16b // last round key ^ tweak for block 2
+
+ add x6,x6,#0x20
+ add x0,x0,x6 // adjust the input pointer for the load of v27 below; NOTE(review): exact offset reasoning not verified here
+ mov x7,x3
+
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ ld1 {v27.16b},[x0],#16
+ add w6,w5,#2
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v24.16b,v24.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ st1 {v5.16b},[x1],#16
+ st1 {v24.16b},[x1],#16
+ cmn x2,#0x30
+ b.eq .Lxts_enc_done // x2 == -0x30: those three were the last whole blocks
+.Lxts_encxor_one:
+ orr v28.16b,v3.16b,v3.16b
+ orr v29.16b,v27.16b,v27.16b
+ nop
+
+.Lxts_inner_enc_tail: // one or two remaining blocks, taken from v28/v29 and processed in v1/v24
+ cmn x2,#0x10
+ eor v1.16b,v28.16b,v6.16b
+ eor v24.16b,v29.16b,v8.16b
+ b.eq .Lxts_enc_tail_loop // x2 == -0x10: keep v28^v6 and v29^v8
+ eor v24.16b,v29.16b,v6.16b // otherwise v24 is whitened with v6 instead
+.Lxts_enc_tail_loop:
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lxts_enc_tail_loop
+
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ cmn x2,#0x20 // flags consumed by the b.eq .Lxts_enc_one below (vector ops in between do not set flags)
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ eor v5.16b,v6.16b,v7.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ eor v17.16b,v8.16b,v7.16b
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ b.eq .Lxts_enc_one // x2 == -0x20: only one block to emit
+ eor v5.16b,v5.16b,v1.16b
+ st1 {v5.16b},[x1],#16
+ eor v17.16b,v17.16b,v24.16b
+ orr v6.16b,v8.16b,v8.16b
+ st1 {v17.16b},[x1],#16
+ fmov x9,d8 // two blocks written: v6 becomes the tweak that follows v8
+ fmov x10,v8.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ b .Lxts_enc_done
+
+.Lxts_enc_one:
+ eor v5.16b,v5.16b,v24.16b
+ orr v6.16b,v6.16b,v6.16b // copies v6 onto itself (no change)
+ st1 {v5.16b},[x1],#16
+ fmov x9,d6 // one block written: v6 becomes the tweak that follows the old v6
+ fmov x10,v6.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ b .Lxts_enc_done
+.align 5
+.Lxts_enc_done:
+ // partial final block: swap bytes between the last written block and the trailing input, then re-encrypt that block
+ tst x21,#0xf
+ b.eq .Lxts_abort // length was a multiple of 16: finished
+
+ mov x20,x0 // x20 = input pointer; NOTE(review): expected to address the trailing partial bytes - depends on the x0 bookkeeping above
+ mov x13,x1 // x13 = where the partial output bytes go
+ sub x1,x1,#16 // x1 = last full block already written
+.composite_enc_loop: // for i = x21-1 down to 0
+ subs x21,x21,#1
+ ldrb w15,[x1,x21] // byte i of the last written block
+ ldrb w14,[x20,x21] // byte i of the trailing input
+ strb w15,[x13,x21] // becomes byte i of the partial output
+ strb w14,[x1,x21] // trailing input byte replaces it inside the last block
+ b.gt .composite_enc_loop
+.Lxts_enc_load_done:
+ ld1 {v26.16b},[x1] // reload the patched block
+ eor v26.16b,v26.16b,v6.16b // pre-whiten with tweak v6
+
+ // full AES encryption of v26 with the data key; x3 is advanced through the schedule here
+ ldr w6,[x3,#240]
+ ld1 {v0.4s},[x3],#16
+ sub w6,w6,#2
+ ld1 {v1.4s},[x3],#16
+.Loop_final_enc:
+ aese v26.16b,v0.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v0.4s},[x3],#16
+ subs w6,w6,#2
+ aese v26.16b,v1.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v1.4s},[x3],#16
+ b.gt .Loop_final_enc
+
+ aese v26.16b,v0.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v0.4s},[x3]
+ aese v26.16b,v1.16b
+ eor v26.16b,v26.16b,v0.16b // last round key
+ eor v26.16b,v26.16b,v6.16b // post-whiten with tweak v6
+ st1 {v26.16b},[x1] // overwrite the last full output block
+
+.Lxts_abort: // common exit of the big-size path: restore saved registers and release the 64-byte frame
+ ldp x21,x22,[sp,#48]
+ ldp d8,d9,[sp,#32]
+ ldp d10,d11,[sp,#16]
+ ldp x19,x20,[sp],#64
+.Lxts_enc_final_abort: // exit used by the 16-byte fast path (nothing to restore)
+ ret
+.size aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
+.globl aes_v8_xts_decrypt
+.type aes_v8_xts_decrypt,%function
+.align 5
+aes_v8_xts_decrypt:
+
+ cmp x2,#16
+
+ b.ne .Lxts_dec_big_size
+
+ ldr w6,[x4,#240]
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+.Loop_dec_small_iv_enc:
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt .Loop_dec_small_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b
+
+ ld1 {v0.16b},[x0]
+ eor v0.16b,v6.16b,v0.16b
+
+ ldr w6,[x3,#240]
+ ld1 {v28.4s,v29.4s},[x3],#32
+
+ aesd v0.16b,v28.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aesd v0.16b,v29.16b
+ aesimc v0.16b,v0.16b
+ subs w6,w6,#10
+ b.eq .Lxts_128_dec
+.Lxts_dec_round_loop:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w6,w6,#2
+ b.gt .Lxts_dec_round_loop
+.Lxts_128_dec:
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ eor v0.16b,v6.16b,v0.16b
+ st1 {v0.16b},[x1]
+ b .Lxts_dec_final_abort
+.Lxts_dec_big_size:
+ stp x19,x20,[sp,#-64]!
+ stp x21,x22,[sp,#48]
+ stp d8,d9,[sp,#32]
+ stp d10,d11,[sp,#16]
+
+ and x21,x2,#0xf
+ and x2,x2,#-16
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo .Lxts_dec_abort
+
+
+ ldr w6,[x4,#240]
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+.Loop_dec_iv_enc:
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt .Loop_dec_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b
+
+
+
+
+ fmov x9,d6
+ fmov x10,v6.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d8,x9
+ fmov v8.d[1],x10
+
+ ldr w5,[x3,#240]
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d9,x9
+ fmov v9.d[1],x10
+
+ ld1 {v16.4s,v17.4s},[x3]
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d10,x9
+ fmov v10.d[1],x10
+
+ add x7,x3,#32
+ mov w6,w5
+ b .Lxts_dec
+
+
+.align 5
+.Lxts_dec:
+ tst x21,#0xf
+ b.eq .Lxts_dec_begin
+ subs x2,x2,#16
+ csel x8,xzr,x8,eq
+ ld1 {v0.16b},[x0],#16
+ b.lo .Lxts_done
+ sub x0,x0,#16
+.Lxts_dec_begin:
+ ld1 {v0.16b},[x0],x8
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v28.16b,v0.16b,v0.16b
+ ld1 {v24.16b},[x0],#16
+ orr v27.16b,v24.16b,v24.16b
+ orr v29.16b,v24.16b,v24.16b
+ b.lo .Lxts_inner_dec_tail
+ eor v0.16b,v0.16b,v6.16b
+ eor v24.16b,v24.16b,v8.16b
+
+ orr v1.16b,v24.16b,v24.16b
+ ld1 {v24.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ eor v27.16b,v24.16b,v9.16b
+ eor v24.16b,v24.16b,v9.16b
+ cmp x2,#32
+ b.lo .Lxts_outer_dec_tail
+
+ ld1 {v25.16b},[x0],#16
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v26.16b},[x0],#16
+ eor v25.16b,v25.16b,v10.16b
+ eor v26.16b,v26.16b,v11.16b
+ sub x2,x2,#32
+ mov w6,w5
+ b .Loop5x_xts_dec
+
+.align 4
+.Loop5x_xts_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop5x_xts_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ subs x2,x2,#0x50
+
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v18.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v18.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v18.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v18.16b
+ aesimc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v19.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v19.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v19.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v19.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v20.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v20.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v21.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v21.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v22.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v22.16b
+ aesimc v26.16b,v26.16b
+
+ eor v4.16b,v7.16b,v6.16b
+ aesd v0.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v7.16b,v8.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v1.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d8,x9
+ fmov v8.d[1],x10
+ eor v17.16b,v7.16b,v9.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v24.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d9,x9
+ fmov v9.d[1],x10
+ eor v30.16b,v7.16b,v10.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v25.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d10,x9
+ fmov v10.d[1],x10
+ eor v31.16b,v7.16b,v11.16b
+ ld1 {v28.16b},[x0],#16
+ aesd v26.16b,v23.16b
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v29.16b},[x0],#16
+ cbz x6,.Lxts_dec_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b
+ eor v0.16b,v2.16b,v6.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v1.16b,v3.16b,v8.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v30.16b,v30.16b,v25.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ eor v26.16b,v29.16b,v11.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs .Loop5x_xts_dec
+
+ cmn x2,#0x10
+ b.ne .Loop5x_dec_after
+
+
+
+ orr v11.16b,v10.16b,v10.16b
+ orr v10.16b,v9.16b,v9.16b
+ orr v9.16b,v8.16b,v8.16b
+ orr v8.16b,v6.16b,v6.16b
+ fmov x9,d11
+ fmov x10,v11.d[1]
+ eor v0.16b,v6.16b,v2.16b
+ eor v1.16b,v8.16b,v3.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v26.16b,v29.16b,v11.16b
+ b.eq .Loop5x_xts_dec
+
+.Loop5x_dec_after:
+ add x2,x2,#0x50
+ cbz x2,.Lxts_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ b.lo .Lxts_inner_dec_tail
+
+ eor v0.16b,v6.16b,v27.16b
+ eor v1.16b,v8.16b,v28.16b
+ eor v24.16b,v29.16b,v9.16b
+ b .Lxts_outer_dec_tail
+
+.align 4
+.Lxts_dec_tail4x:
+ add x0,x0,#16
+ tst x21,#0xf
+ eor v5.16b,v1.16b,v4.16b
+ st1 {v5.16b},[x1],#16
+ eor v17.16b,v24.16b,v17.16b
+ st1 {v17.16b},[x1],#16
+ eor v30.16b,v25.16b,v30.16b
+ eor v31.16b,v26.16b,v31.16b
+ st1 {v30.16b,v31.16b},[x1],#32
+
+ b.eq .Lxts_dec_abort
+ ld1 {v0.16b},[x0],#16
+ b .Lxts_done
+.align 4
+.Lxts_outer_dec_tail:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lxts_outer_dec_tail
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+
+ fmov x9,d9
+ fmov x10,v9.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v8.16b,v7.16b
+ csel x6,x2,x6,lo
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v9.16b,v7.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d8,x9
+ fmov v8.d[1],x10
+
+ add x6,x6,#0x20
+ add x0,x0,x6
+
+ mov x7,x3
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d9,x9
+ fmov v9.d[1],x10
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ ld1 {v16.4s},[x7],#16
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v24.16b,v24.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ st1 {v5.16b},[x1],#16
+ st1 {v24.16b},[x1],#16
+
+ cmn x2,#0x30
+ add x2,x2,#0x30
+ b.eq .Lxts_done
+ sub x2,x2,#0x30
+ orr v28.16b,v3.16b,v3.16b
+ orr v29.16b,v27.16b,v27.16b
+ nop
+
+.Lxts_inner_dec_tail:
+
+ cmn x2,#0x10
+ eor v1.16b,v28.16b,v6.16b
+ eor v24.16b,v29.16b,v8.16b
+ b.eq .Lxts_dec_tail_loop
+ eor v24.16b,v29.16b,v6.16b
+.Lxts_dec_tail_loop:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lxts_dec_tail_loop
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v8.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ b.eq .Lxts_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v24.16b
+ orr v6.16b,v9.16b,v9.16b
+ orr v8.16b,v10.16b,v10.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ add x2,x2,#16
+ b .Lxts_done
+
+.Lxts_dec_one:
+ eor v5.16b,v5.16b,v24.16b
+ orr v6.16b,v8.16b,v8.16b
+ orr v8.16b,v9.16b,v9.16b
+ st1 {v5.16b},[x1],#16
+ add x2,x2,#32
+
+.Lxts_done:
+ tst x21,#0xf
+ b.eq .Lxts_dec_abort
+
+ mov x7,x3
+ cbnz x2,.Lxts_dec_1st_done
+ ld1 {v0.16b},[x0],#16
+
+
+.Lxts_dec_1st_done:
+ eor v26.16b,v0.16b,v8.16b
+ ldr w6,[x3,#240]
+ ld1 {v0.4s},[x3],#16
+ sub w6,w6,#2
+ ld1 {v1.4s},[x3],#16
+.Loop_final_2nd_dec:
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x3],#16
+ subs w6,w6,#2
+ aesd v26.16b,v1.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v1.4s},[x3],#16
+ b.gt .Loop_final_2nd_dec
+
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x3]
+ aesd v26.16b,v1.16b
+ eor v26.16b,v26.16b,v0.16b
+ eor v26.16b,v26.16b,v8.16b
+ st1 {v26.16b},[x1]
+
+ mov x20,x0
+ add x13,x1,#16
+
+
+
+.composite_dec_loop:
+ subs x21,x21,#1
+ ldrb w15,[x1,x21]
+ ldrb w14,[x20,x21]
+ strb w15,[x13,x21]
+ strb w14,[x1,x21]
+ b.gt .composite_dec_loop
+.Lxts_dec_load_done:
+ ld1 {v26.16b},[x1]
+ eor v26.16b,v26.16b,v6.16b
+
+
+ ldr w6,[x7,#240]
+ ld1 {v0.4s},[x7],#16
+ sub w6,w6,#2
+ ld1 {v1.4s},[x7],#16
+.Loop_final_dec:
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v26.16b,v1.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v1.4s},[x7],#16
+ b.gt .Loop_final_dec
+
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x7]
+ aesd v26.16b,v1.16b
+ eor v26.16b,v26.16b,v0.16b
+ eor v26.16b,v26.16b,v6.16b
+ st1 {v26.16b},[x1]
+
+.Lxts_dec_abort:
+ ldp x21,x22,[sp,#48]
+ ldp d8,d9,[sp,#32]
+ ldp d10,d11,[sp,#16]
+ ldp x19,x20,[sp],#64
+
+.Lxts_dec_final_abort:
+ ret
+.size aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
 .section .note.GNU-stack,"",%progbits
index c30139985bb8549ce43254a599feb456f09bf7be..b30a6eeedfba647aa83dd96c018b3d6c20b5dcbd 100644 (file)
 #
 # *** This file is auto-generated ***
 #
-# 1 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
+# 0 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
+# 1 "/home/zfridric/upstream/gnutls//"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S" 2
 
 
-.text
 .arch armv8-a+crypto
+.text
 .globl gcm_init_v8
 .type gcm_init_v8,%function
 .align 4
 gcm_init_v8:
+
  ld1 {v17.2d},[x1]
  movi v19.16b,#0xe1
  shl v19.2d,v19.2d,#57
@@ -126,21 +128,110 @@ gcm_init_v8:
  pmull v5.1q,v5.1d,v19.1d
  eor v18.16b,v18.16b,v2.16b
  eor v4.16b,v4.16b,v7.16b
- eor v20.16b, v0.16b,v18.16b
- eor v22.16b,v5.16b,v4.16b
+ eor v23.16b, v0.16b,v18.16b
+ eor v25.16b,v5.16b,v4.16b
+
+ ext v16.16b,v23.16b, v23.16b,#8
+ ext v17.16b,v25.16b,v25.16b,#8
+ ext v18.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v23.16b
+ eor v17.16b,v17.16b,v25.16b
+ eor v18.16b,v18.16b,v22.16b
+ ext v24.16b,v16.16b,v17.16b,#8
+ st1 {v23.2d,v24.2d,v25.2d},[x0],#48
+
+
+ pmull v0.1q,v22.1d, v23.1d
+ pmull v5.1q,v23.1d,v23.1d
+ pmull2 v2.1q,v22.2d, v23.2d
+ pmull2 v7.1q,v23.2d,v23.2d
+ pmull v1.1q,v16.1d,v18.1d
+ pmull v6.1q,v16.1d,v16.1d
 
- ext v16.16b,v20.16b, v20.16b,#8
- ext v17.16b,v22.16b,v22.16b,#8
- eor v16.16b,v16.16b,v20.16b
- eor v17.16b,v17.16b,v22.16b
- ext v21.16b,v16.16b,v17.16b,#8
- st1 {v20.2d,v21.2d,v22.2d},[x0]
+ ext v16.16b,v0.16b,v2.16b,#8
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v26.16b,v0.16b,v18.16b
+ eor v28.16b,v5.16b,v4.16b
+
+ ext v16.16b,v26.16b, v26.16b,#8
+ ext v17.16b,v28.16b,v28.16b,#8
+ ext v18.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v26.16b
+ eor v17.16b,v17.16b,v28.16b
+ eor v18.16b,v18.16b,v22.16b
+ ext v27.16b,v16.16b,v17.16b,#8
+ st1 {v26.2d,v27.2d,v28.2d},[x0],#48
+
+
+ pmull v0.1q,v22.1d,v26.1d
+ pmull v5.1q,v22.1d,v28.1d
+ pmull2 v2.1q,v22.2d,v26.2d
+ pmull2 v7.1q,v22.2d,v28.2d
+ pmull v1.1q,v16.1d,v18.1d
+ pmull v6.1q,v17.1d,v18.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v29.16b,v0.16b,v18.16b
+ eor v31.16b,v5.16b,v4.16b
+
+ ext v16.16b,v29.16b,v29.16b,#8
+ ext v17.16b,v31.16b,v31.16b,#8
+ eor v16.16b,v16.16b,v29.16b
+ eor v17.16b,v17.16b,v31.16b
+ ext v30.16b,v16.16b,v17.16b,#8
+ st1 {v29.2d,v30.2d,v31.2d},[x0]
  ret
 .size gcm_init_v8,.-gcm_init_v8
 .globl gcm_gmult_v8
 .type gcm_gmult_v8,%function
 .align 4
 gcm_gmult_v8:
+
  ld1 {v17.2d},[x0]
  movi v19.16b,#0xe1
  ld1 {v20.2d,v21.2d},[x1]
@@ -182,6 +273,7 @@ gcm_gmult_v8:
 .type gcm_ghash_v8,%function
 .align 4
 gcm_ghash_v8:
+
  cmp x3,#64
  b.hs .Lgcm_ghash_v8_4x
  ld1 {v0.2d},[x0]
@@ -192,7 +284,7 @@ gcm_ghash_v8:
 
  subs x3,x3,#32
  mov x12,#16
-# 159 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
+# 250 "lib/accelerated/aarch64/elf/ghash-aarch64.s.tmp.S"
  ld1 {v20.2d,v21.2d},[x1],#32
  movi v19.16b,#0xe1
  ld1 {v22.2d},[x1]
index 0de5de02f25af2ccd1cbe059dfbff3aacf2c4506..95e58427adc76b50f759d05696c26b0c808efa62 100644 (file)
 #
 # *** This file is auto-generated ***
 #
-# 1 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
+# 0 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S"
+# 1 "/home/zfridric/upstream/gnutls//"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/elf/sha1-armv8.s.tmp.S" 2
 
-.text
-
 
 .hidden _gnutls_arm_cpuid_s
+
+
+.text
+
 .globl sha1_block_data_order
 .type sha1_block_data_order,%function
 .align 6
 sha1_block_data_order:
 
-
-
- ldr x16,.L_gnutls_arm_cpuid_s
-
- adr x17,.L_gnutls_arm_cpuid_s
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#(1<<3)
+ adrp x16,_gnutls_arm_cpuid_s
+ ldr w16,[x16,#:lo12:_gnutls_arm_cpuid_s]
+ tst w16,#(1 << 3)
  b.ne .Lv8_entry
 
+
  stp x29,x30,[sp,#-96]!
  add x29,sp,#0
  stp x19,x20,[sp,#16]
@@ -88,7 +87,7 @@ sha1_block_data_order:
  add w24,w24,w28
  add w24,w24,w3
  lsr x4,x3,#32
- ldr x5,[x1,#-56]
+ ldur x5,[x1,#-56]
  bic w25,w23,w21
  and w26,w22,w21
  ror w27,w20,#27
@@ -113,7 +112,7 @@ sha1_block_data_order:
  add w22,w22,w5
  add w23,w23,w25
  lsr x6,x5,#32
- ldr x7,[x1,#-48]
+ ldur x7,[x1,#-48]
  bic w25,w21,w24
  and w26,w20,w24
  ror w27,w23,#27
@@ -138,7 +137,7 @@ sha1_block_data_order:
  add w20,w20,w7
  add w21,w21,w25
  lsr x8,x7,#32
- ldr x9,[x1,#-40]
+ ldur x9,[x1,#-40]
  bic w25,w24,w22
  and w26,w23,w22
  ror w27,w21,#27
@@ -163,7 +162,7 @@ sha1_block_data_order:
  add w23,w23,w9
  add w24,w24,w25
  lsr x10,x9,#32
- ldr x11,[x1,#-32]
+ ldur x11,[x1,#-32]
  bic w25,w22,w20
  and w26,w21,w20
  ror w27,w24,#27
@@ -188,7 +187,7 @@ sha1_block_data_order:
  add w21,w21,w11
  add w22,w22,w25
  lsr x12,x11,#32
- ldr x13,[x1,#-24]
+ ldur x13,[x1,#-24]
  bic w25,w20,w23
  and w26,w24,w23
  ror w27,w22,#27
@@ -213,7 +212,7 @@ sha1_block_data_order:
  add w24,w24,w13
  add w20,w20,w25
  lsr x14,x13,#32
- ldr x15,[x1,#-16]
+ ldur x15,[x1,#-16]
  bic w25,w23,w21
  and w26,w22,w21
  ror w27,w20,#27
@@ -238,7 +237,7 @@ sha1_block_data_order:
  add w22,w22,w15
  add w23,w23,w25
  lsr x16,x15,#32
- ldr x17,[x1,#-8]
+ ldur x17,[x1,#-8]
  bic w25,w21,w24
  and w26,w20,w24
  ror w27,w23,#27
@@ -1122,6 +1121,7 @@ sha1_block_data_order:
 .align 6
 sha1_block_armv8:
 .Lv8_entry:
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
 
@@ -1254,12 +1254,6 @@ sha1_block_armv8:
 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.L_gnutls_arm_cpuid_s:
-
-
-
-.quad _gnutls_arm_cpuid_s-.
-
 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 2
 .align 2
index 2d13b5e4276b61884825085321c3882ae2a88286..2aab5d14ce3e9d7a1d62654cf1dd7b61f6d05335 100644 (file)
 #
 # *** This file is auto-generated ***
 #
+# 0 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
+# 1 "/home/zfridric/upstream/gnutls//"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
-# 56 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
+# 58 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
-# 57 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S" 2
+# 59 "lib/accelerated/aarch64/elf/sha256-armv8.s.tmp.S" 2
 
 
-.text
+.hidden _gnutls_arm_cpuid_s
 
 
-.hidden _gnutls_arm_cpuid_s
+.text
+
 .globl sha256_block_data_order
 .type sha256_block_data_order,%function
 .align 6
 sha256_block_data_order:
 
 
-
-
- ldr x16,.L_gnutls_arm_cpuid_s
-
- adr x17,.L_gnutls_arm_cpuid_s
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#(1<<4)
+ adrp x16,_gnutls_arm_cpuid_s
+ ldr w16,[x16,#:lo12:_gnutls_arm_cpuid_s]
+ tst w16,#(1 << 4)
  b.ne .Lv8_entry
- tst w16,#(1<<0)
+ tst w16,#(1 << 0)
  b.ne .Lneon_entry
 
-.inst 0xd503233f
+
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1028,7 +1025,7 @@ sha256_block_data_order:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
-.inst 0xd50323bf
+
  ret
 .size sha256_block_data_order,.-sha256_block_data_order
 
@@ -1053,15 +1050,6 @@ sha256_block_data_order:
 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .long 0
 .size .LK256,.-.LK256
-
-.align 3
-.L_gnutls_arm_cpuid_s:
-
-
-
-.quad _gnutls_arm_cpuid_s-.
-
-
 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 2
 .align 2
@@ -1070,6 +1058,7 @@ sha256_block_data_order:
 .align 6
 sha256_block_armv8:
 .Lv8_entry:
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
 
@@ -1211,7 +1200,9 @@ sha256_block_armv8:
 .type sha256_block_neon,%function
 .align 4
 sha256_block_neon:
+
 .Lneon_entry:
+
  stp x29, x30, [sp, #-16]!
  mov x29, sp
  sub sp,sp,#16*4
index 13384fc827488ef69e32e265f254198e51694a56..201492465b6047c7bfff5c25f84f70263b49af2c 100644 (file)
 #
 # *** This file is auto-generated ***
 #
+# 0 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
+# 1 "/home/zfridric/upstream/gnutls//"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
-# 56 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
+# 58 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
-# 57 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S" 2
+# 59 "lib/accelerated/aarch64/elf/sha512-armv8.s.tmp.S" 2
 
 
-.text
+.hidden _gnutls_arm_cpuid_s
 
 
-.hidden _gnutls_arm_cpuid_s
+.text
+
 .globl sha512_block_data_order
 .type sha512_block_data_order,%function
 .align 6
 sha512_block_data_order:
 
 
-
-
- ldr x16,.L_gnutls_arm_cpuid_s
-
- adr x17,.L_gnutls_arm_cpuid_s
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#(1<<6)
+ adrp x16,_gnutls_arm_cpuid_s
+ ldr w16,[x16,#:lo12:_gnutls_arm_cpuid_s]
+ tst w16,#(1 << 6)
  b.ne .Lv8_entry
 
-.inst 0xd503233f
+
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1026,7 +1023,7 @@ sha512_block_data_order:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
-.inst 0xd50323bf
+
  ret
 .size sha512_block_data_order,.-sha512_block_data_order
 
@@ -1075,15 +1072,6 @@ sha512_block_data_order:
 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
 .quad 0
 .size .LK512,.-.LK512
-
-.align 3
-.L_gnutls_arm_cpuid_s:
-
-
-
-.quad _gnutls_arm_cpuid_s-.
-
-
 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 2
 .align 2
@@ -1092,6 +1080,7 @@ sha512_block_data_order:
 .align 6
 sha512_block_armv8:
 .Lv8_entry:
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
 
index 4b55f88071acc38b1dcba49baf846073349945a6..a906a6946faaf44461661c7a5e5e5ec5f3b91525 100644 (file)
 #
 # *** This file is auto-generated ***
 #
-# 1 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
+# 0 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/macosx/aes-aarch64.s.tmp.S" 2
 
 
-.text
 
+.text
 .align 5
 Lrcon:
 .long 0x01,0x01,0x01,0x01
@@ -58,6 +58,8 @@ Lrcon:
 .align 5
 _aes_v8_set_encrypt_key:
 Lenc_key:
+
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
  mov x3,#-1
@@ -229,7 +231,7 @@ Lenc_key_abort:
 
 .align 5
 _aes_v8_set_decrypt_key:
-.long 0xd503233f
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
  bl Lenc_key
@@ -263,13 +265,14 @@ Loop_imc:
  eor x0,x0,x0
 Ldec_key_abort:
  ldp x29,x30,[sp],#16
-.long 0xd50323bf
+
  ret
 
 .globl _aes_v8_encrypt
 
 .align 5
 _aes_v8_encrypt:
+
  ldr w3,[x2,#240]
  ld1 {v0.4s},[x2],#16
  ld1 {v2.16b},[x0]
@@ -299,6 +302,7 @@ Loop_enc:
 
 .align 5
 _aes_v8_decrypt:
+
  ldr w3,[x2,#240]
  ld1 {v0.4s},[x2],#16
  ld1 {v2.16b},[x0]
@@ -324,21 +328,109 @@ Loop_dec:
  st1 {v2.16b},[x1]
  ret
 
-.globl _aes_v8_cbc_encrypt
+.globl _aes_v8_ecb_encrypt
 
 .align 5
-_aes_v8_cbc_encrypt:
+_aes_v8_ecb_encrypt:
+
+ subs x2,x2,#16
+
+ b.ne Lecb_big_size
+ ld1 {v0.16b},[x0]
+ cmp w4,#0
+ ldr w5,[x3,#240]
+ ld1 {v5.4s,v6.4s},[x3],#32
+
+ b.eq Lecb_small_dec
+ aese v0.16b,v5.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aese v0.16b,v6.16b
+ aesmc v0.16b,v0.16b
+ subs w5,w5,#10
+ b.eq Lecb_128_enc
+Lecb_round_loop:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w5,w5,#2
+ b.gt Lecb_round_loop
+Lecb_128_enc:
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ st1 {v0.16b},[x1]
+ b Lecb_Final_abort
+Lecb_small_dec:
+ aesd v0.16b,v5.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aesd v0.16b,v6.16b
+ aesimc v0.16b,v0.16b
+ subs w5,w5,#10
+ b.eq Lecb_128_dec
+Lecb_dec_round_loop:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w5,w5,#2
+ b.gt Lecb_dec_round_loop
+Lecb_128_dec:
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ st1 {v0.16b},[x1]
+ b Lecb_Final_abort
+Lecb_big_size:
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
- subs x2,x2,#16
  mov x8,#16
- b.lo Lcbc_abort
+ b.lo Lecb_done
  csel x8,xzr,x8,eq
 
- cmp w5,#0
+ cmp w4,#0
  ldr w5,[x3,#240]
  and x2,x2,#-16
- ld1 {v6.16b},[x4]
  ld1 {v0.16b},[x0],x8
 
  ld1 {v16.4s,v17.4s},[x3]
@@ -352,448 +444,2790 @@ _aes_v8_cbc_encrypt:
 
  add x7,x3,#32
  mov w6,w5
- b.eq Lcbc_dec
+ b.eq Lecb_dec
 
- cmp w5,#2
- eor v0.16b,v0.16b,v6.16b
- eor v5.16b,v16.16b,v7.16b
- b.eq Lcbc_enc128
+ ld1 {v1.16b},[x0],#16
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v1.16b,v1.16b
+ orr v24.16b,v1.16b,v1.16b
+ orr v1.16b,v0.16b,v0.16b
+ b.lo Lecb_enc_tail
 
- ld1 {v2.4s,v3.4s},[x7]
- add x7,x3,#16
- add x6,x3,#16*4
- add x12,x3,#16*5
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- add x14,x3,#16*6
- add x3,x3,#16*7
- b Lenter_cbc_enc
+ orr v1.16b,v3.16b,v3.16b
+ ld1 {v24.16b},[x0],#16
+ cmp x2,#32
+ b.lo Loop3x_ecb_enc
 
-.align 4
-Loop_cbc_enc:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-Lenter_cbc_enc:
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.4s},[x6]
- cmp w5,#4
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x12]
- b.eq Lcbc_enc192
+ ld1 {v25.16b},[x0],#16
+ ld1 {v26.16b},[x0],#16
+ sub x2,x2,#32
+ mov w6,w5
 
+Loop5x_ecb_enc:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- ld1 {v16.4s},[x14]
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x3]
- nop
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop5x_ecb_enc
 
-Lcbc_enc192:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- subs x2,x2,#16
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ cmp x2,#0x40
+ sub x2,x2,#0x50
+
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
  aese v0.16b,v18.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v18.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v18.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v18.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v18.16b
+ aesmc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
  aese v0.16b,v19.16b
  aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ aese v1.16b,v19.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v19.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v19.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v19.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v20.16b
  aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v20.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v20.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v21.16b
  aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7]
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v21.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v21.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v22.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v22.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v22.16b
+ aesmc v26.16b,v26.16b
+
  aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs Loop_cbc_enc
+ ld1 {v2.16b},[x0],#16
+ aese v1.16b,v23.16b
+ ld1 {v3.16b},[x0],#16
+ aese v24.16b,v23.16b
+ ld1 {v27.16b},[x0],#16
+ aese v25.16b,v23.16b
+ ld1 {v28.16b},[x0],#16
+ aese v26.16b,v23.16b
+ ld1 {v29.16b},[x0],#16
+ cbz x6,Lecb_enc_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v7.16b,v0.16b
+ orr v0.16b,v2.16b,v2.16b
+ eor v5.16b,v7.16b,v1.16b
+ orr v1.16b,v3.16b,v3.16b
+ eor v17.16b,v7.16b,v24.16b
+ orr v24.16b,v27.16b,v27.16b
+ eor v30.16b,v7.16b,v25.16b
+ orr v25.16b,v28.16b,v28.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ orr v26.16b,v29.16b,v29.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs Loop5x_ecb_enc
 
- st1 {v6.16b},[x1],#16
- b Lcbc_done
+ add x2,x2,#0x50
+ cbz x2,Lecb_done
 
-.align 5
-Lcbc_enc128:
- ld1 {v2.4s,v3.4s},[x7]
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- b Lenter_cbc_enc128
-Loop_cbc_enc128:
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ orr v0.16b,v27.16b,v27.16b
+ orr v1.16b,v28.16b,v28.16b
+ orr v24.16b,v29.16b,v29.16b
+ b.lo Lecb_enc_tail
+
+ b Loop3x_ecb_enc
+
+.align 4
+Lecb_enc_tail4x:
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
+ eor v30.16b,v7.16b,v25.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+
+ b Lecb_done
+.align 4
+Loop3x_ecb_enc:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-Lenter_cbc_enc128:
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- subs x2,x2,#16
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v18.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_ecb_enc
+
+ aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v0.16b,v19.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ subs x2,x2,#0x30
+ csel x6,x2,x6,lo
+ aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ add x0,x0,x6
+
+
+ mov x7,x3
  aese v0.16b,v20.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v2.16b},[x0],#16
  aese v0.16b,v21.16b
  aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v3.16b},[x0],#16
  aese v0.16b,v22.16b
  aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
  aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs Loop_cbc_enc128
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ ld1 {v16.4s},[x7],#16
+ add w6,w5,#2
+ eor v4.16b,v7.16b,v0.16b
+ eor v5.16b,v7.16b,v1.16b
+ eor v24.16b,v24.16b,v7.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v24.16b},[x1],#16
+ orr v24.16b,v27.16b,v27.16b
+ b.hs Loop3x_ecb_enc
 
- st1 {v6.16b},[x1],#16
- b Lcbc_done
+ cmn x2,#0x30
+ b.eq Lecb_done
+ nop
+
+Lecb_enc_tail:
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lecb_enc_tail
+
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ cmn x2,#0x20
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ b.eq Lecb_enc_one
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b Lecb_done
+
+Lecb_enc_one:
+ eor v5.16b,v7.16b,v24.16b
+ st1 {v5.16b},[x1],#16
+ b Lecb_done
 .align 5
-Lcbc_dec:
- ld1 {v18.16b},[x0],#16
+Lecb_dec:
+ ld1 {v1.16b},[x0],#16
  subs x2,x2,#32
  add w6,w5,#2
- orr v3.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v24.16b,v1.16b,v1.16b
  orr v1.16b,v0.16b,v0.16b
- orr v19.16b,v18.16b,v18.16b
- b.lo Lcbc_dec_tail
+ b.lo Lecb_dec_tail
 
- orr v1.16b,v18.16b,v18.16b
- ld1 {v18.16b},[x0],#16
- orr v2.16b,v0.16b,v0.16b
- orr v3.16b,v1.16b,v1.16b
- orr v19.16b,v18.16b,v18.16b
+ orr v1.16b,v3.16b,v3.16b
+ ld1 {v24.16b},[x0],#16
+ cmp x2,#32
+ b.lo Loop3x_ecb_dec
 
-Loop3x_cbc_dec:
+ ld1 {v25.16b},[x0],#16
+ ld1 {v26.16b},[x0],#16
+ sub x2,x2,#32
+ mov w6,w5
+
+Loop5x_ecb_dec:
  aesd v0.16b,v16.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v16.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
  ld1 {v16.4s},[x7],#16
  subs w6,w6,#2
  aesd v0.16b,v17.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v17.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
  ld1 {v17.4s},[x7],#16
- b.gt Loop3x_cbc_dec
+ b.gt Loop5x_ecb_dec
 
  aesd v0.16b,v16.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v16.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
- eor v4.16b,v6.16b,v7.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ cmp x2,#0x40
+ sub x2,x2,#0x50
+
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v18.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v18.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v18.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v18.16b
+ aesimc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v19.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v19.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v19.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v19.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v20.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v20.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v21.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v21.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v22.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v22.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v23.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v1.16b,v23.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v24.16b,v23.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v25.16b,v23.16b
+ ld1 {v28.16b},[x0],#16
+ aesd v26.16b,v23.16b
+ ld1 {v29.16b},[x0],#16
+ cbz x6,Lecb_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v7.16b,v0.16b
+ orr v0.16b,v2.16b,v2.16b
+ eor v5.16b,v7.16b,v1.16b
+ orr v1.16b,v3.16b,v3.16b
+ eor v17.16b,v7.16b,v24.16b
+ orr v24.16b,v27.16b,v27.16b
+ eor v30.16b,v7.16b,v25.16b
+ orr v25.16b,v28.16b,v28.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ orr v26.16b,v29.16b,v29.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs Loop5x_ecb_dec
+
+ add x2,x2,#0x50
+ cbz x2,Lecb_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ orr v0.16b,v27.16b,v27.16b
+ orr v1.16b,v28.16b,v28.16b
+ orr v24.16b,v29.16b,v29.16b
+ b.lo Lecb_dec_tail
+
+ b Loop3x_ecb_dec
+
+.align 4
+Lecb_tail4x:
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
+ eor v30.16b,v7.16b,v25.16b
+ eor v31.16b,v7.16b,v26.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+
+ b Lecb_done
+.align 4
+Loop3x_ecb_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_ecb_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
  subs x2,x2,#0x30
- eor v5.16b,v2.16b,v7.16b
  csel x6,x2,x6,lo
  aesd v0.16b,v17.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v17.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
  add x0,x0,x6
 
 
- orr v6.16b,v19.16b,v19.16b
  mov x7,x3
  aesd v0.16b,v20.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v20.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v20.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
  ld1 {v2.16b},[x0],#16
  aesd v0.16b,v21.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v21.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v21.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
  ld1 {v3.16b},[x0],#16
  aesd v0.16b,v22.16b
  aesimc v0.16b,v0.16b
  aesd v1.16b,v22.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v22.16b
- aesimc v18.16b,v18.16b
- ld1 {v19.16b},[x0],#16
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
  aesd v0.16b,v23.16b
  aesd v1.16b,v23.16b
- aesd v18.16b,v23.16b
+ aesd v24.16b,v23.16b
  ld1 {v16.4s},[x7],#16
  add w6,w5,#2
- eor v4.16b,v4.16b,v0.16b
- eor v5.16b,v5.16b,v1.16b
- eor v18.16b,v18.16b,v17.16b
+ eor v4.16b,v7.16b,v0.16b
+ eor v5.16b,v7.16b,v1.16b
+ eor v24.16b,v24.16b,v7.16b
  ld1 {v17.4s},[x7],#16
  st1 {v4.16b},[x1],#16
  orr v0.16b,v2.16b,v2.16b
  st1 {v5.16b},[x1],#16
  orr v1.16b,v3.16b,v3.16b
- st1 {v18.16b},[x1],#16
- orr v18.16b,v19.16b,v19.16b
- b.hs Loop3x_cbc_dec
+ st1 {v24.16b},[x1],#16
+ orr v24.16b,v27.16b,v27.16b
+ b.hs Loop3x_ecb_dec
 
  cmn x2,#0x30
- b.eq Lcbc_done
+ b.eq Lecb_done
  nop
 
-Lcbc_dec_tail:
+Lecb_dec_tail:
  aesd v1.16b,v16.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
  ld1 {v16.4s},[x7],#16
  subs w6,w6,#2
  aesd v1.16b,v17.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
  ld1 {v17.4s},[x7],#16
- b.gt Lcbc_dec_tail
+ b.gt Lecb_dec_tail
 
  aesd v1.16b,v16.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v17.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v20.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v20.16b
- aesimc v18.16b,v18.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
  cmn x2,#0x20
  aesd v1.16b,v21.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v21.16b
- aesimc v18.16b,v18.16b
- eor v5.16b,v6.16b,v7.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v22.16b
  aesimc v1.16b,v1.16b
- aesd v18.16b,v22.16b
- aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
  aesd v1.16b,v23.16b
- aesd v18.16b,v23.16b
- b.eq Lcbc_dec_one
- eor v5.16b,v5.16b,v1.16b
- eor v17.16b,v17.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+ aesd v24.16b,v23.16b
+ b.eq Lecb_dec_one
+ eor v5.16b,v7.16b,v1.16b
+ eor v17.16b,v7.16b,v24.16b
  st1 {v5.16b},[x1],#16
  st1 {v17.16b},[x1],#16
- b Lcbc_done
+ b Lecb_done
 
-Lcbc_dec_one:
- eor v5.16b,v5.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+Lecb_dec_one:
+ eor v5.16b,v7.16b,v24.16b
  st1 {v5.16b},[x1],#16
 
-Lcbc_done:
- st1 {v6.16b},[x4]
-Lcbc_abort:
+Lecb_done:
  ldr x29,[sp],#16
+Lecb_Final_abort:
  ret
 
-.globl _aes_v8_ctr32_encrypt_blocks
+.globl _aes_v8_cbc_encrypt
 
 .align 5
-_aes_v8_ctr32_encrypt_blocks:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- ldr w5,[x3,#240]
-
- ldr w8, [x4, #12]
+_aes_v8_cbc_encrypt:
 
 
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo Lcbc_abort
+ csel x8,xzr,x8,eq
 
- ld1 {v0.4s},[x4]
+ cmp w5,#0
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
 
  ld1 {v16.4s,v17.4s},[x3]
- sub w5,w5,#4
- mov x12,#16
- cmp x2,#2
+ sub w5,w5,#6
  add x7,x3,x5,lsl#4
  sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
  ld1 {v20.4s,v21.4s},[x7],#32
  ld1 {v22.4s,v23.4s},[x7],#32
  ld1 {v7.4s},[x7]
+
  add x7,x3,#32
  mov w6,w5
- csel x12,xzr,x12,lo
+ b.eq Lcbc_dec
 
- rev w8, w8
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq Lcbc_enc128
 
- orr v1.16b,v0.16b,v0.16b
- add w10, w8, #1
- orr v18.16b,v0.16b,v0.16b
- add w8, w8, #2
- orr v6.16b,v0.16b,v0.16b
- rev w10, w10
- mov v1.s[3],w10
- b.ls Lctr32_tail
- rev w12, w8
- sub x2,x2,#3
- mov v18.s[3],w12
- b Loop3x_ctr32
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b Lenter_cbc_enc
 
 .align 4
-Loop3x_ctr32:
+Loop_cbc_enc:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc:
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v17.16b
- aesmc v18.16b,v18.16b
- ld1 {v17.4s},[x7],#16
- b.gt Loop3x_ctr32
-
- aese v0.16b,v16.16b
- aesmc v4.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v5.16b,v1.16b
- ld1 {v2.16b},[x0],#16
- orr v0.16b,v6.16b,v6.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
- orr v1.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v17.16b
- aesmc v5.16b,v5.16b
- ld1 {v19.16b},[x0],#16
- mov x7,x3
- aese v18.16b,v17.16b
- aesmc v17.16b,v18.16b
- orr v18.16b,v6.16b,v6.16b
- add w9,w8,#1
- aese v4.16b,v20.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v20.16b
- aesmc v5.16b,v5.16b
- eor v2.16b,v2.16b,v7.16b
- add w10,w8,#2
- aese v17.16b,v20.16b
- aesmc v17.16b,v17.16b
- eor v3.16b,v3.16b,v7.16b
- add w8,w8,#3
- aese v4.16b,v21.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v21.16b
- aesmc v5.16b,v5.16b
- eor v19.16b,v19.16b,v7.16b
- rev w9,w9
- aese v17.16b,v21.16b
- aesmc v17.16b,v17.16b
- mov v0.s[3], w9
- rev w10,w10
- aese v4.16b,v22.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v22.16b
- aesmc v5.16b,v5.16b
- mov v1.s[3], w10
- rev w12,w8
- aese v17.16b,v22.16b
- aesmc v17.16b,v17.16b
- mov v18.s[3], w12
- subs x2,x2,#3
- aese v4.16b,v23.16b
- aese v5.16b,v23.16b
- aese v17.16b,v23.16b
-
- eor v2.16b,v2.16b,v4.16b
- ld1 {v16.4s},[x7],#16
- st1 {v2.16b},[x1],#16
- eor v3.16b,v3.16b,v5.16b
- mov w6,w5
- st1 {v3.16b},[x1],#16
- eor v19.16b,v19.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- st1 {v19.16b},[x1],#16
- b.hs Loop3x_ctr32
-
- adds x2,x2,#3
- b.eq Lctr32_done
- cmp x2,#1
- mov x12,#16
- csel x12,xzr,x12,eq
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq Lcbc_enc192
 
-Lctr32_tail:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
+ ld1 {v16.4s},[x14]
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v17.4s},[x7],#16
- b.gt Lctr32_tail
+ ld1 {v17.4s},[x3]
+ nop
 
+Lcbc_enc192:
  aese v0.16b,v16.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
+ subs x2,x2,#16
  aese v0.16b,v17.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v2.16b},[x0],x12
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
  aese v0.16b,v20.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v20.16b
- aesmc v1.16b,v1.16b
- ld1 {v3.16b},[x0]
+ eor v16.16b,v16.16b,v5.16b
  aese v0.16b,v21.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v21.16b
- aesmc v1.16b,v1.16b
- eor v2.16b,v2.16b,v7.16b
+ ld1 {v17.4s},[x7]
  aese v0.16b,v22.16b
  aesmc v0.16b,v0.16b
- aese v1.16b,v22.16b
- aesmc v1.16b,v1.16b
- eor v3.16b,v3.16b,v7.16b
  aese v0.16b,v23.16b
- aese v1.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc
 
- cmp x2,#1
- eor v2.16b,v2.16b,v0.16b
- eor v3.16b,v3.16b,v1.16b
- st1 {v2.16b},[x1],#16
- b.eq Lctr32_done
- st1 {v3.16b},[x1]
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
 
-Lctr32_done:
- ldr x29,[sp],#16
+.align 5
+Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b Lenter_cbc_enc128
+Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+.align 5
+Lcbc_dec:
+ ld1 {v24.16b},[x0],#16
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v27.16b,v24.16b,v24.16b
+ b.lo Lcbc_dec_tail
+
+ orr v1.16b,v24.16b,v24.16b
+ ld1 {v24.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v27.16b,v24.16b,v24.16b
+ cmp x2,#32
+ b.lo Loop3x_cbc_dec
+
+ ld1 {v25.16b},[x0],#16
+ ld1 {v26.16b},[x0],#16
+ sub x2,x2,#32
+ mov w6,w5
+ orr v28.16b,v25.16b,v25.16b
+ orr v29.16b,v26.16b,v26.16b
+
+Loop5x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop5x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ cmp x2,#0x40
+ sub x2,x2,#0x50
+
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v18.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v18.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v18.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v18.16b
+ aesimc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v19.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v19.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v19.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v19.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v20.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v20.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v21.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v21.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v22.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v22.16b
+ aesimc v26.16b,v26.16b
+
+ eor v4.16b,v6.16b,v7.16b
+ aesd v0.16b,v23.16b
+ eor v5.16b,v2.16b,v7.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v1.16b,v23.16b
+ eor v17.16b,v3.16b,v7.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v24.16b,v23.16b
+ eor v30.16b,v27.16b,v7.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v25.16b,v23.16b
+ eor v31.16b,v28.16b,v7.16b
+ ld1 {v28.16b},[x0],#16
+ aesd v26.16b,v23.16b
+ orr v6.16b,v29.16b,v29.16b
+ ld1 {v29.16b},[x0],#16
+ cbz x6,Lcbc_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b
+ orr v0.16b,v2.16b,v2.16b
+ eor v5.16b,v5.16b,v1.16b
+ orr v1.16b,v3.16b,v3.16b
+ eor v17.16b,v17.16b,v24.16b
+ orr v24.16b,v27.16b,v27.16b
+ eor v30.16b,v30.16b,v25.16b
+ orr v25.16b,v28.16b,v28.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ orr v26.16b,v29.16b,v29.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs Loop5x_cbc_dec
+
+ add x2,x2,#0x50
+ cbz x2,Lcbc_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ orr v0.16b,v27.16b,v27.16b
+ orr v2.16b,v27.16b,v27.16b
+ orr v1.16b,v28.16b,v28.16b
+ orr v3.16b,v28.16b,v28.16b
+ orr v24.16b,v29.16b,v29.16b
+ orr v27.16b,v29.16b,v29.16b
+ b.lo Lcbc_dec_tail
+
+ b Loop3x_cbc_dec
+
+.align 4
+Lcbc_tail4x:
+ eor v5.16b,v4.16b,v1.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v30.16b,v30.16b,v25.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+
+ b Lcbc_done
+.align 4
+Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6
+
+
+ orr v6.16b,v27.16b,v27.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ ld1 {v16.4s},[x7],#16
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v24.16b,v24.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v24.16b},[x1],#16
+ orr v24.16b,v27.16b,v27.16b
+ b.hs Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq Lcbc_done
+ nop
+
+Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ b.eq Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v24.16b
+ orr v6.16b,v27.16b,v27.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b Lcbc_done
+
+Lcbc_dec_one:
+ eor v5.16b,v5.16b,v24.16b
+ orr v6.16b,v27.16b,v27.16b
+ st1 {v5.16b},[x1],#16
+
+Lcbc_done:
+ st1 {v6.16b},[x4]
+Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+
+.globl _aes_v8_ctr32_encrypt_blocks
+
+.align 5
+_aes_v8_ctr32_encrypt_blocks:
+
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+
+
+
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3]
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+ csel x12,xzr,x12,lo
+
+ rev w8, w8
+
+ orr v1.16b,v0.16b,v0.16b
+ add w10, w8, #1
+ orr v18.16b,v0.16b,v0.16b
+ add w8, w8, #2
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v1.s[3],w10
+ b.ls Lctr32_tail
+ rev w12, w8
+ sub x2,x2,#3
+ mov v18.s[3],w12
+ cmp x2,#32
+ b.lo Loop3x_ctr32
+
+ add w13,w8,#1
+ add w14,w8,#2
+ orr v24.16b,v0.16b,v0.16b
+ rev w13,w13
+ orr v25.16b,v0.16b,v0.16b
+ rev w14,w14
+ mov v24.s[3],w13
+ sub x2,x2,#2
+ mov v25.s[3],w14
+ add w8,w8,#2
+ b Loop5x_ctr32
+
+.align 4
+Loop5x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop5x_ctr32
+
+ mov x7,x3
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v16.4s},[x7],#16
+
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v17.4s},[x7],#16
+
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ add w9,w8,#1
+ add w10,w8,#2
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ add w12,w8,#3
+ add w13,w8,#4
+ aese v18.16b,v20.16b
+ aesmc v18.16b,v18.16b
+ add w14,w8,#5
+ rev w9,w9
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ rev w10,w10
+ rev w12,w12
+ aese v25.16b,v20.16b
+ aesmc v25.16b,v25.16b
+ rev w13,w13
+ rev w14,w14
+
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v21.16b
+ aesmc v18.16b,v18.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v21.16b
+ aesmc v25.16b,v25.16b
+
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v2.16b},[x0],#16
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0],#16
+ aese v18.16b,v22.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v26.16b},[x0],#16
+ aese v25.16b,v22.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v27.16b},[x0],#16
+
+ aese v0.16b,v23.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v1.16b,v23.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v18.16b,v23.16b
+ eor v19.16b,v19.16b,v7.16b
+ aese v24.16b,v23.16b
+ eor v26.16b,v26.16b,v7.16b
+ aese v25.16b,v23.16b
+ eor v27.16b,v27.16b,v7.16b
+
+ eor v2.16b,v2.16b,v0.16b
+ orr v0.16b,v6.16b,v6.16b
+ eor v3.16b,v3.16b,v1.16b
+ orr v1.16b,v6.16b,v6.16b
+ eor v19.16b,v19.16b,v18.16b
+ orr v18.16b,v6.16b,v6.16b
+ eor v26.16b,v26.16b,v24.16b
+ orr v24.16b,v6.16b,v6.16b
+ eor v27.16b,v27.16b,v25.16b
+ orr v25.16b,v6.16b,v6.16b
+
+ st1 {v2.16b},[x1],#16
+ mov v0.s[3],w9
+ st1 {v3.16b},[x1],#16
+ mov v1.s[3],w10
+ st1 {v19.16b},[x1],#16
+ mov v18.s[3],w12
+ st1 {v26.16b},[x1],#16
+ mov v24.s[3],w13
+ st1 {v27.16b},[x1],#16
+ mov v25.s[3],w14
+
+ mov w6,w5
+ cbz x2,Lctr32_done
+
+ add w8,w8,#5
+ subs x2,x2,#5
+ b.hs Loop5x_ctr32
+
+ add x2,x2,#5
+ sub w8,w8,#5
+
+ cmp x2,#2
+ mov x12,#16
+ csel x12,xzr,x12,lo
+ b.ls Lctr32_tail
+
+ sub x2,x2,#3
+ add w8,w8,#3
+ b Loop3x_ctr32
+
+.align 4
+Loop3x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ orr v0.16b,v6.16b,v6.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ orr v18.16b,v6.16b,v6.16b
+ add w9,w8,#1
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+ eor v19.16b,v19.16b,v7.16b
+ rev w9,w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ mov v0.s[3], w9
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ mov v1.s[3], w10
+ rev w12,w8
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ mov v18.s[3], w12
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v19.16b},[x1],#16
+ b.hs Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq
+
+Lctr32_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq Lctr32_done
+ st1 {v3.16b},[x1]
+
+Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+
+.globl _aes_v8_xts_encrypt
+
+.align 5
+_aes_v8_xts_encrypt:
+
+ cmp x2,#16
+
+ b.ne Lxts_enc_big_size
+
+ ldr w6,[x4,#240]
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+Loop_enc_iv_enc:
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt Loop_enc_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b
+
+ ld1 {v0.16b},[x0]
+ eor v0.16b,v6.16b,v0.16b
+
+ ldr w6,[x3,#240]
+ ld1 {v28.4s,v29.4s},[x3],#32
+
+ aese v0.16b,v28.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aese v0.16b,v29.16b
+ aesmc v0.16b,v0.16b
+ subs w6,w6,#10
+ b.eq Lxts_128_enc
+Lxts_enc_round_loop:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w6,w6,#2
+ b.gt Lxts_enc_round_loop
+Lxts_128_enc:
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ eor v0.16b,v0.16b,v6.16b
+ st1 {v0.16b},[x1]
+ b Lxts_enc_final_abort
+
+.align 4
+Lxts_enc_big_size:
+ stp x19,x20,[sp,#-64]!
+ stp x21,x22,[sp,#48]
+ stp d8,d9,[sp,#32]
+ stp d10,d11,[sp,#16]
+
+
+ and x21,x2,#0xf
+ and x2,x2,#-16
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo Lxts_abort
+ csel x8,xzr,x8,eq
+
+
+ ldr w6,[x4,#240]
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+Loop_iv_enc:
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt Loop_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b
+
+
+
+
+ fmov x9,d6
+ fmov x10,v6.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d8,x9
+ fmov v8.d[1],x10
+
+ ldr w5,[x3,#240]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s,v17.4s},[x3]
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+
+
+Lxts_enc:
+ ld1 {v24.16b},[x0],#16
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v28.16b,v0.16b,v0.16b
+ orr v27.16b,v24.16b,v24.16b
+ orr v29.16b,v24.16b,v24.16b
+ b.lo Lxts_inner_enc_tail
+ eor v0.16b,v0.16b,v6.16b
+ eor v24.16b,v24.16b,v8.16b
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d9,x9
+ fmov v9.d[1],x10
+
+
+ orr v1.16b,v24.16b,v24.16b
+ ld1 {v24.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ eor v27.16b,v24.16b,v9.16b
+ eor v24.16b,v24.16b,v9.16b
+ cmp x2,#32
+ b.lo Lxts_outer_enc_tail
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d10,x9
+ fmov v10.d[1],x10
+
+ ld1 {v25.16b},[x0],#16
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v26.16b},[x0],#16
+ eor v25.16b,v25.16b,v10.16b
+ eor v26.16b,v26.16b,v11.16b
+ sub x2,x2,#32
+ mov w6,w5
+ b Loop5x_xts_enc
+
+.align 4
+Loop5x_xts_enc:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop5x_xts_enc
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v16.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v16.16b
+ aesmc v26.16b,v26.16b
+ subs x2,x2,#0x50
+
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v17.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v17.16b
+ aesmc v26.16b,v26.16b
+ csel x6,xzr,x2,gt
+ mov x7,x3
+
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v18.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v18.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v18.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v18.16b
+ aesmc v26.16b,v26.16b
+ add x0,x0,x6
+
+
+ add x6,x2,#0x60
+
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v19.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v19.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v19.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v19.16b
+ aesmc v26.16b,v26.16b
+
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v20.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v20.16b
+ aesmc v26.16b,v26.16b
+
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v21.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v21.16b
+ aesmc v26.16b,v26.16b
+
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v22.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v22.16b
+ aesmc v26.16b,v26.16b
+
+ eor v4.16b,v7.16b,v6.16b
+ aese v0.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v7.16b,v8.16b
+ ld1 {v2.16b},[x0],#16
+ aese v1.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d8,x9
+ fmov v8.d[1],x10
+ eor v17.16b,v7.16b,v9.16b
+ ld1 {v3.16b},[x0],#16
+ aese v24.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d9,x9
+ fmov v9.d[1],x10
+ eor v30.16b,v7.16b,v10.16b
+ ld1 {v27.16b},[x0],#16
+ aese v25.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d10,x9
+ fmov v10.d[1],x10
+ eor v31.16b,v7.16b,v11.16b
+ ld1 {v28.16b},[x0],#16
+ aese v26.16b,v23.16b
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v29.16b},[x0],#16
+ cbz x6,Lxts_enc_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b
+ eor v0.16b,v2.16b,v6.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v1.16b,v3.16b,v8.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v30.16b,v30.16b,v25.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ eor v26.16b,v29.16b,v11.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs Loop5x_xts_enc
+
+
+
+ cmn x2,#0x10
+ b.ne Loop5x_enc_after
+ orr v11.16b,v10.16b,v10.16b
+ orr v10.16b,v9.16b,v9.16b
+ orr v9.16b,v8.16b,v8.16b
+ orr v8.16b,v6.16b,v6.16b
+ fmov x9,d11
+ fmov x10,v11.d[1]
+ eor v0.16b,v6.16b,v2.16b
+ eor v1.16b,v8.16b,v3.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v26.16b,v29.16b,v11.16b
+ b.eq Loop5x_xts_enc
+
+Loop5x_enc_after:
+ add x2,x2,#0x50
+ cbz x2,Lxts_enc_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ b.lo Lxts_inner_enc_tail
+
+ eor v0.16b,v6.16b,v27.16b
+ eor v1.16b,v8.16b,v28.16b
+ eor v24.16b,v29.16b,v9.16b
+ b Lxts_outer_enc_tail
+
+.align 4
+Lxts_enc_tail4x:
+ add x0,x0,#16
+ eor v5.16b,v1.16b,v5.16b
+ st1 {v5.16b},[x1],#16
+ eor v17.16b,v24.16b,v17.16b
+ st1 {v17.16b},[x1],#16
+ eor v30.16b,v25.16b,v30.16b
+ eor v31.16b,v26.16b,v31.16b
+ st1 {v30.16b,v31.16b},[x1],#32
+
+ b Lxts_enc_done
+.align 4
+Lxts_outer_enc_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lxts_outer_enc_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+
+ fmov x9,d9
+ fmov x10,v9.d[1]
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr#31
+ eor x9,x11,x9,lsl#1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v8.16b,v7.16b
+ csel x6,x2,x6,lo
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ eor v17.16b,v9.16b,v7.16b
+
+ add x6,x6,#0x20
+ add x0,x0,x6
+ mov x7,x3
+
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ ld1 {v27.16b},[x0],#16
+ add w6,w5,#2
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v24.16b,v24.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ st1 {v5.16b},[x1],#16
+ st1 {v24.16b},[x1],#16
+ cmn x2,#0x30
+ b.eq Lxts_enc_done
+Lxts_encxor_one:
+ orr v28.16b,v3.16b,v3.16b
+ orr v29.16b,v27.16b,v27.16b
+ nop
+
+Lxts_inner_enc_tail:
+ cmn x2,#0x10
+ eor v1.16b,v28.16b,v6.16b
+ eor v24.16b,v29.16b,v8.16b
+ b.eq Lxts_enc_tail_loop
+ eor v24.16b,v29.16b,v6.16b
+Lxts_enc_tail_loop:
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lxts_enc_tail_loop
+
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v16.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v17.16b
+ aesmc v24.16b,v24.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v20.16b
+ aesmc v24.16b,v24.16b
+ cmn x2,#0x20
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v21.16b
+ aesmc v24.16b,v24.16b
+ eor v5.16b,v6.16b,v7.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ aese v24.16b,v22.16b
+ aesmc v24.16b,v24.16b
+ eor v17.16b,v8.16b,v7.16b
+ aese v1.16b,v23.16b
+ aese v24.16b,v23.16b
+ b.eq Lxts_enc_one
+ eor v5.16b,v5.16b,v1.16b
+ st1 {v5.16b},[x1],#16
+ eor v17.16b,v17.16b,v24.16b
+ orr v6.16b,v8.16b,v8.16b
+ st1 {v17.16b},[x1],#16
+ fmov x9,d8
+ fmov x10,v8.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ b Lxts_enc_done
+
+Lxts_enc_one:
+ eor v5.16b,v5.16b,v24.16b
+ orr v6.16b,v6.16b,v6.16b
+ st1 {v5.16b},[x1],#16
+ fmov x9,d6
+ fmov x10,v6.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ b Lxts_enc_done
+.align 5
+Lxts_enc_done:
+
+ tst x21,#0xf
+ b.eq Lxts_abort
+
+ mov x20,x0
+ mov x13,x1
+ sub x1,x1,#16
+.composite_enc_loop:
+ subs x21,x21,#1
+ ldrb w15,[x1,x21]
+ ldrb w14,[x20,x21]
+ strb w15,[x13,x21]
+ strb w14,[x1,x21]
+ b.gt .composite_enc_loop
+Lxts_enc_load_done:
+ ld1 {v26.16b},[x1]
+ eor v26.16b,v26.16b,v6.16b
+
+
+ ldr w6,[x3,#240]
+ ld1 {v0.4s},[x3],#16
+ sub w6,w6,#2
+ ld1 {v1.4s},[x3],#16
+Loop_final_enc:
+ aese v26.16b,v0.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v0.4s},[x3],#16
+ subs w6,w6,#2
+ aese v26.16b,v1.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v1.4s},[x3],#16
+ b.gt Loop_final_enc
+
+ aese v26.16b,v0.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v0.4s},[x3]
+ aese v26.16b,v1.16b
+ eor v26.16b,v26.16b,v0.16b
+ eor v26.16b,v26.16b,v6.16b
+ st1 {v26.16b},[x1]
+
+Lxts_abort:
+ ldp x21,x22,[sp,#48]
+ ldp d8,d9,[sp,#32]
+ ldp d10,d11,[sp,#16]
+ ldp x19,x20,[sp],#64
+Lxts_enc_final_abort:
+ ret
+
+.globl _aes_v8_xts_decrypt
+// AES-XTS decryption built on the ARMv8 AES instructions (aesd/aesimc for data, aese/aesmc for the 16 bytes at [x5]).
+.align 5
+_aes_v8_xts_decrypt:
+// Registers as used below: x0 = input pointer, x1 = output pointer, x2 = byte count, x3 = key schedule used with aesd, x4 = key schedule used with aese, x5 = pointer to a 16-byte block (presumably the XTS tweak/IV - confirm against the caller).
+ cmp x2,#16
+// Exactly 16 bytes falls through to a single-block path that never touches the stack.
+ b.ne Lxts_dec_big_size
+
+ ldr w6,[x4,#240] // w6 = 32-bit word at offset 240 of the x4 schedule (presumably the round count)
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5] // v6 = the 16 bytes at [x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+Loop_dec_small_iv_enc: // encrypt v6 with the x4 schedule, two rounds per pass
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt Loop_dec_small_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b // v6 = encrypted block, used as the XOR mask below
+
+ ld1 {v0.16b},[x0]
+ eor v0.16b,v6.16b,v0.16b // mask the input block with v6 before decrypting
+
+ ldr w6,[x3,#240]
+ ld1 {v28.4s,v29.4s},[x3],#32
+
+ aesd v0.16b,v28.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s,v17.4s},[x3],#32
+ aesd v0.16b,v29.16b
+ aesimc v0.16b,v0.16b
+ subs w6,w6,#10
+ b.eq Lxts_128_dec // w6 == 10: no extra rounds needed
+Lxts_dec_round_loop: // two extra rounds per pass while w6 stays above zero
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v16.4s},[x3],#16
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v17.4s},[x3],#16
+ subs w6,w6,#2
+ b.gt Lxts_dec_round_loop
+Lxts_128_dec: // remaining rounds with v16-v23, then the final key in v7
+ ld1 {v18.4s,v19.4s},[x3],#32
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v20.4s,v21.4s},[x3],#32
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v22.4s,v23.4s},[x3],#32
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ ld1 {v7.4s},[x3]
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v23.16b
+ eor v0.16b,v0.16b,v7.16b
+ eor v0.16b,v6.16b,v0.16b // unmask with v6 again
+ st1 {v0.16b},[x1]
+ b Lxts_dec_final_abort // no frame was pushed on this path, so skip the restores
+Lxts_dec_big_size:
+ stp x19,x20,[sp,#-64]! // push a 64-byte frame: x19-x22 and d8-d11 are overwritten below
+ stp x21,x22,[sp,#48]
+ stp d8,d9,[sp,#32]
+ stp d10,d11,[sp,#16]
+
+ and x21,x2,#0xf // x21 = byte count modulo 16 (trailing partial block)
+ and x2,x2,#-16 // x2 = byte count rounded down to a multiple of 16
+ subs x2,x2,#16
+ mov x8,#16 // x8 = post-increment used for the first load in Lxts_dec_begin
+ b.lo Lxts_dec_abort // fewer than 16 bytes: restore registers and return
+
+// Encrypt the 16 bytes at [x5] with the x4 schedule; the result stays in v6.
+ ldr w6,[x4,#240]
+ ld1 {v0.4s},[x4],#16
+ ld1 {v6.16b},[x5]
+ sub w6,w6,#2
+ ld1 {v1.4s},[x4],#16
+
+Loop_dec_iv_enc:
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4],#16
+ subs w6,w6,#2
+ aese v6.16b,v1.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v1.4s},[x4],#16
+ b.gt Loop_dec_iv_enc
+
+ aese v6.16b,v0.16b
+ aesmc v6.16b,v6.16b
+ ld1 {v0.4s},[x4]
+ aese v6.16b,v1.16b
+ eor v6.16b,v6.16b,v0.16b
+
+// Mask update, repeated throughout this function: the 128-bit value x10:x9 is shifted left by one bit
+// and 0x87 (w19) is XORed into the low word when the bit shifted out of the top was set.
+
+ fmov x9,d6 // x10:x9 = v6
+ fmov x10,v6.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32 // w22 = upper 32 bits of x10
+ extr x10,x10,x9,#63 // x10 = (x10 << 1) | (x9 >> 63)
+ and w11,w19,w22,asr #31 // w11 = 0x87 if the old top bit was set, else 0
+ eor x9,x11,x9,lsl #1 // x9 = (x9 << 1) ^ w11
+ fmov d8,x9 // v8 = second mask
+ fmov v8.d[1],x10
+
+ ldr w5,[x3,#240] // w5 = 32-bit word at offset 240 of the x3 schedule (presumably the round count)
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d9,x9 // v9 = third mask
+ fmov v9.d[1],x10
+
+ ld1 {v16.4s,v17.4s},[x3] // first two 16-byte entries of the x3 schedule
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // x7 = x3 + 16*(w5 - 6)
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32 // seven consecutive entries from x7 go to v18-v23 and v7
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d10,x9 // v10 = fourth mask
+ fmov v10.d[1],x10
+
+ add x7,x3,#32 // x7 = third entry of the x3 schedule (v16/v17 hold the first two)
+ mov w6,w5
+ b Lxts_dec // target is the next label, after the alignment padding
+
+
+.align 5
+Lxts_dec:
+ tst x21,#0xf
+ b.eq Lxts_dec_begin // no trailing partial block
+ subs x2,x2,#16 // a partial block exists: hold one full block back for Lxts_done
+ csel x8,xzr,x8,eq // x8 = 0 when x2 just reached zero, otherwise unchanged
+ ld1 {v0.16b},[x0],#16
+ b.lo Lxts_done // nothing left for the bulk path
+ sub x0,x0,#16 // undo the post-increment of the load above
+Lxts_dec_begin:
+ ld1 {v0.16b},[x0],x8 // advance by x8 (16 or 0)
+ subs x2,x2,#32
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v28.16b,v0.16b,v0.16b
+ ld1 {v24.16b},[x0],#16
+ orr v27.16b,v24.16b,v24.16b
+ orr v29.16b,v24.16b,v24.16b
+ b.lo Lxts_inner_dec_tail // flags from the subs above: at most two blocks remain
+ eor v0.16b,v0.16b,v6.16b
+ eor v24.16b,v24.16b,v8.16b
+
+ orr v1.16b,v24.16b,v24.16b
+ ld1 {v24.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ eor v27.16b,v24.16b,v9.16b
+ eor v24.16b,v24.16b,v9.16b
+ cmp x2,#32
+ b.lo Lxts_outer_dec_tail // too few blocks for the five-block loop
+
+ ld1 {v25.16b},[x0],#16
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d11,x9 // v11 = fifth mask
+ fmov v11.d[1],x10
+
+ ld1 {v26.16b},[x0],#16
+ eor v25.16b,v25.16b,v10.16b
+ eor v26.16b,v26.16b,v11.16b
+ sub x2,x2,#32
+ mov w6,w5
+ b Loop5x_xts_dec
+
+.align 4
+Loop5x_xts_dec: // five blocks (v0, v1, v24, v25, v26) go through each round together; w6 counts rounds in steps of two
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop5x_xts_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v16.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v16.16b
+ aesimc v26.16b,v26.16b
+ subs x2,x2,#0x50 // these flags feed the csel below and the b.hs that closes the iteration; nothing in between sets flags
+
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v17.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v17.16b
+ aesimc v26.16b,v26.16b
+ csel x6,xzr,x2,gt // x6 = 0 when x2 > 0, otherwise x2 (zero or negative)
+ mov x7,x3 // rewind the round-key pointer for the next pass
+
+ aesd v0.16b,v18.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v18.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v18.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v18.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v18.16b
+ aesimc v26.16b,v26.16b
+ add x0,x0,x6 // pull the input pointer back by the shortfall so the five loads below stay inside the input
+
+
+ add x6,x2,#0x60 // x6 == 0 selects Lxts_dec_tail4x at the cbz below
+
+ aesd v0.16b,v19.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v19.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v19.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v19.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v19.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v20.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v20.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v21.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v21.16b
+ aesimc v26.16b,v26.16b
+
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ aesd v25.16b,v22.16b
+ aesimc v25.16b,v25.16b
+ aesd v26.16b,v22.16b
+ aesimc v26.16b,v26.16b
+
+ eor v4.16b,v7.16b,v6.16b // v4 = last round key ^ mask of block 0, so one eor finishes the block
+ aesd v0.16b,v23.16b
+// Each finished mask register is refilled with the next value in the sequence while the last round runs.
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v7.16b,v8.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v1.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d8,x9
+ fmov v8.d[1],x10
+ eor v17.16b,v7.16b,v9.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v24.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d9,x9
+ fmov v9.d[1],x10
+ eor v30.16b,v7.16b,v10.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v25.16b,v23.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d10,x9
+ fmov v10.d[1],x10
+ eor v31.16b,v7.16b,v11.16b
+ ld1 {v28.16b},[x0],#16
+ aesd v26.16b,v23.16b
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d11,x9
+ fmov v11.d[1],x10
+
+ ld1 {v29.16b},[x0],#16
+ cbz x6,Lxts_dec_tail4x
+ ld1 {v16.4s},[x7],#16
+ eor v4.16b,v4.16b,v0.16b // finish each block, then pre-mask the next input block into the same data register
+ eor v0.16b,v2.16b,v6.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v1.16b,v3.16b,v8.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v30.16b,v30.16b,v25.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v31.16b,v31.16b,v26.16b
+ st1 {v4.16b},[x1],#16
+ eor v26.16b,v29.16b,v11.16b
+ st1 {v5.16b},[x1],#16
+ mov w6,w5
+ st1 {v17.16b},[x1],#16
+ ld1 {v17.4s},[x7],#16
+ st1 {v30.16b},[x1],#16
+ st1 {v31.16b},[x1],#16
+ b.hs Loop5x_xts_dec // flags still come from 'subs x2,x2,#0x50' above
+
+ cmn x2,#0x10
+ b.ne Loop5x_dec_after
+
+// x2 == -0x10: move each mask up one register (v6 is duplicated into v8) and run the five-block loop once more.
+
+ orr v11.16b,v10.16b,v10.16b
+ orr v10.16b,v9.16b,v9.16b
+ orr v9.16b,v8.16b,v8.16b
+ orr v8.16b,v6.16b,v6.16b
+ fmov x9,d11
+ fmov x10,v11.d[1]
+ eor v0.16b,v6.16b,v2.16b
+ eor v1.16b,v8.16b,v3.16b
+ eor v24.16b,v27.16b,v9.16b
+ eor v25.16b,v28.16b,v10.16b
+ eor v26.16b,v29.16b,v11.16b
+ b.eq Loop5x_xts_dec // always taken here: Z is still set from the cmn above
+
+Loop5x_dec_after:
+ add x2,x2,#0x50 // undo the loop's last subtraction
+ cbz x2,Lxts_done
+
+ add w6,w5,#2
+ subs x2,x2,#0x30
+ b.lo Lxts_inner_dec_tail
+
+ eor v0.16b,v6.16b,v27.16b
+ eor v1.16b,v8.16b,v28.16b
+ eor v24.16b,v29.16b,v9.16b
+ b Lxts_outer_dec_tail
+
+.align 4
+Lxts_dec_tail4x: // finish and store the four blocks in v1, v24, v25, v26
+ add x0,x0,#16
+ tst x21,#0xf // result is consumed by the b.eq after the stores
+ eor v5.16b,v1.16b,v4.16b
+ st1 {v5.16b},[x1],#16
+ eor v17.16b,v24.16b,v17.16b
+ st1 {v17.16b},[x1],#16
+ eor v30.16b,v25.16b,v30.16b
+ eor v31.16b,v26.16b,v31.16b
+ st1 {v30.16b,v31.16b},[x1],#32
+
+ b.eq Lxts_dec_abort // no trailing partial block: done
+ ld1 {v0.16b},[x0],#16
+ b Lxts_done
+.align 4
+Lxts_outer_dec_tail: // three blocks (v0, v1, v24) in parallel
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lxts_outer_dec_tail
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30 // flags consumed by 'csel x6,x2,x6,lo' below
+
+ fmov x9,d9 // restart the mask sequence from v9
+ fmov x10,v9.d[1]
+ mov w19,#0x87
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d6,x9
+ fmov v6.d[1],x10
+ eor v5.16b,v8.16b,v7.16b
+ csel x6,x2,x6,lo
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v9.16b,v7.16b
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d8,x9
+ fmov v8.d[1],x10
+
+ add x6,x6,#0x20
+ add x0,x0,x6
+
+ mov x7,x3
+
+
+ extr x22,x10,x10,#32
+ extr x10,x10,x9,#63
+ and w11,w19,w22,asr #31
+ eor x9,x11,x9,lsl #1
+ fmov d9,x9
+ fmov v9.d[1],x10
+
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v27.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ ld1 {v16.4s},[x7],#16
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v24.16b,v24.16b,v17.16b
+ ld1 {v17.4s},[x7],#16
+ st1 {v4.16b},[x1],#16
+ st1 {v5.16b},[x1],#16
+ st1 {v24.16b},[x1],#16
+
+ cmn x2,#0x30
+ add x2,x2,#0x30 // add does not touch the flags; x2 is zero when the branch below is taken
+ b.eq Lxts_done
+ sub x2,x2,#0x30 // not done: restore x2 for the inner tail
+ orr v28.16b,v3.16b,v3.16b
+ orr v29.16b,v27.16b,v27.16b
+ nop
+
+Lxts_inner_dec_tail: // one or two remaining blocks; v1 and v24 are both run through the rounds either way
+
+ cmn x2,#0x10
+ eor v1.16b,v28.16b,v6.16b
+ eor v24.16b,v29.16b,v8.16b
+ b.eq Lxts_dec_tail_loop // x2 == -0x10: keep v24 masked with v8
+ eor v24.16b,v29.16b,v6.16b // otherwise mask v24 with v6 instead
+Lxts_dec_tail_loop:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lxts_dec_tail_loop
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v16.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v17.16b
+ aesimc v24.16b,v24.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v20.16b
+ aesimc v24.16b,v24.16b
+ cmn x2,#0x20 // flags consumed by 'b.eq Lxts_dec_one' below
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v21.16b
+ aesimc v24.16b,v24.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v24.16b,v22.16b
+ aesimc v24.16b,v24.16b
+ eor v17.16b,v8.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v24.16b,v23.16b
+ b.eq Lxts_dec_one // x2 == -0x20: only one block is written
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v24.16b
+ orr v6.16b,v9.16b,v9.16b // advance the masks by two positions for Lxts_done
+ orr v8.16b,v10.16b,v10.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ add x2,x2,#16
+ b Lxts_done
+
+Lxts_dec_one:
+ eor v5.16b,v5.16b,v24.16b
+ orr v6.16b,v8.16b,v8.16b // advance the masks by one position for Lxts_done
+ orr v8.16b,v9.16b,v9.16b
+ st1 {v5.16b},[x1],#16
+ add x2,x2,#32
+
+Lxts_done:
+ tst x21,#0xf
+ b.eq Lxts_dec_abort // byte count was a multiple of 16: nothing more to do
+// Trailing partial block: decrypt one more full block with v8, swap bytes with the partial input, then decrypt the mixed block with v6.
+ mov x7,x3 // keep the schedule base; x3 is advanced by the loop below
+ cbnz x2,Lxts_dec_1st_done
+ ld1 {v0.16b},[x0],#16
+
+
+Lxts_dec_1st_done:
+ eor v26.16b,v0.16b,v8.16b
+ ldr w6,[x3,#240]
+ ld1 {v0.4s},[x3],#16
+ sub w6,w6,#2
+ ld1 {v1.4s},[x3],#16
+Loop_final_2nd_dec:
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x3],#16
+ subs w6,w6,#2
+ aesd v26.16b,v1.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v1.4s},[x3],#16
+ b.gt Loop_final_2nd_dec
+
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x3]
+ aesd v26.16b,v1.16b
+ eor v26.16b,v26.16b,v0.16b
+ eor v26.16b,v26.16b,v8.16b
+ st1 {v26.16b},[x1]
+
+ mov x20,x0 // x20 = input pointer for the partial block
+ add x13,x1,#16 // x13 = output position of the partial block
+// For each index i counting down from x21-1: output[16+i] = output[i], then output[i] = input[i].
+// NOTE(review): this label starts with '.' while the other local labels here use the 'L' prefix - confirm that is intended for this Mach-O build.
+
+.composite_dec_loop:
+ subs x21,x21,#1
+ ldrb w15,[x1,x21]
+ ldrb w14,[x20,x21]
+ strb w15,[x13,x21]
+ strb w14,[x1,x21]
+ b.gt .composite_dec_loop
+Lxts_dec_load_done:
+ ld1 {v26.16b},[x1] // reload the mixed block assembled by the byte loop
+ eor v26.16b,v26.16b,v6.16b
+
+
+ ldr w6,[x7,#240]
+ ld1 {v0.4s},[x7],#16
+ sub w6,w6,#2
+ ld1 {v1.4s},[x7],#16
+Loop_final_dec:
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v26.16b,v1.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v1.4s},[x7],#16
+ b.gt Loop_final_dec
+
+ aesd v26.16b,v0.16b
+ aesimc v26.16b,v26.16b
+ ld1 {v0.4s},[x7]
+ aesd v26.16b,v1.16b
+ eor v26.16b,v26.16b,v0.16b
+ eor v26.16b,v26.16b,v6.16b
+ st1 {v26.16b},[x1]
+
+Lxts_dec_abort: // restore the registers saved at Lxts_dec_big_size and pop the 64-byte frame
+ ldp x21,x22,[sp,#48]
+ ldp d8,d9,[sp,#32]
+ ldp d10,d11,[sp,#16]
+ ldp x19,x20,[sp],#64
+
+Lxts_dec_final_abort: // common exit; the single-block path jumps here directly
+  ret
index bf33773aa80cad11e445fecfefaf4914c610ef22..ecbdca544bbc0cbc123209a31d3c38ed1574b3cd 100644 (file)
 #
 # *** This file is auto-generated ***
 #
-# 1 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
+# 0 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S" 2
 
 
-.text
 
+.text
 .globl _gcm_init_v8
 
 .align 4
 _gcm_init_v8:
+
  ld1 {v17.2d},[x1]
  movi v19.16b,#0xe1
  shl v19.2d,v19.2d,#57
@@ -126,21 +127,110 @@ _gcm_init_v8:
  pmull v5.1q,v5.1d,v19.1d
  eor v18.16b,v18.16b,v2.16b
  eor v4.16b,v4.16b,v7.16b
- eor v20.16b, v0.16b,v18.16b
- eor v22.16b,v5.16b,v4.16b
+ eor v23.16b, v0.16b,v18.16b
+ eor v25.16b,v5.16b,v4.16b
+
+ ext v16.16b,v23.16b, v23.16b,#8
+ ext v17.16b,v25.16b,v25.16b,#8
+ ext v18.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v23.16b
+ eor v17.16b,v17.16b,v25.16b
+ eor v18.16b,v18.16b,v22.16b
+ ext v24.16b,v16.16b,v17.16b,#8
+ st1 {v23.2d,v24.2d,v25.2d},[x0],#48
+
+
+ pmull v0.1q,v22.1d, v23.1d
+ pmull v5.1q,v23.1d,v23.1d
+ pmull2 v2.1q,v22.2d, v23.2d
+ pmull2 v7.1q,v23.2d,v23.2d
+ pmull v1.1q,v16.1d,v18.1d
+ pmull v6.1q,v16.1d,v16.1d
 
- ext v16.16b,v20.16b, v20.16b,#8
- ext v17.16b,v22.16b,v22.16b,#8
- eor v16.16b,v16.16b,v20.16b
- eor v17.16b,v17.16b,v22.16b
- ext v21.16b,v16.16b,v17.16b,#8
- st1 {v20.2d,v21.2d,v22.2d},[x0]
+ ext v16.16b,v0.16b,v2.16b,#8
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v26.16b,v0.16b,v18.16b
+ eor v28.16b,v5.16b,v4.16b
+
+ ext v16.16b,v26.16b, v26.16b,#8
+ ext v17.16b,v28.16b,v28.16b,#8
+ ext v18.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v26.16b
+ eor v17.16b,v17.16b,v28.16b
+ eor v18.16b,v18.16b,v22.16b
+ ext v27.16b,v16.16b,v17.16b,#8
+ st1 {v26.2d,v27.2d,v28.2d},[x0],#48
+
+
+ pmull v0.1q,v22.1d,v26.1d
+ pmull v5.1q,v22.1d,v28.1d
+ pmull2 v2.1q,v22.2d,v26.2d
+ pmull2 v7.1q,v22.2d,v28.2d
+ pmull v1.1q,v16.1d,v18.1d
+ pmull v6.1q,v17.1d,v18.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v29.16b,v0.16b,v18.16b
+ eor v31.16b,v5.16b,v4.16b
+
+ ext v16.16b,v29.16b,v29.16b,#8
+ ext v17.16b,v31.16b,v31.16b,#8
+ eor v16.16b,v16.16b,v29.16b
+ eor v17.16b,v17.16b,v31.16b
+ ext v30.16b,v16.16b,v17.16b,#8
+ st1 {v29.2d,v30.2d,v31.2d},[x0]
  ret
 
 .globl _gcm_gmult_v8
 
 .align 4
 _gcm_gmult_v8:
+
  ld1 {v17.2d},[x0]
  movi v19.16b,#0xe1
  ld1 {v20.2d,v21.2d},[x1]
@@ -182,6 +272,7 @@ _gcm_gmult_v8:
 
 .align 4
 _gcm_ghash_v8:
+
  cmp x3,#64
  b.hs Lgcm_ghash_v8_4x
  ld1 {v0.2d},[x0]
@@ -192,7 +283,7 @@ _gcm_ghash_v8:
 
  subs x3,x3,#32
  mov x12,#16
-# 159 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
+# 250 "lib/accelerated/aarch64/macosx/ghash-aarch64.s.tmp.S"
  ld1 {v20.2d,v21.2d},[x1],#32
  movi v19.16b,#0xe1
  ld1 {v22.2d},[x1]
index 9b2bdf2d85cadc38c65c237e13281d6f7544164c..62e930a87f5d2848abe18523d1bf339b7c5a47ad 100644 (file)
 #
 # *** This file is auto-generated ***
 #
-# 1 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
+# 0 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
 # 2 "lib/accelerated/aarch64/macosx/sha1-armv8.s.tmp.S" 2
 
-.text
-
 
 .private_extern __gnutls_arm_cpuid_s
-.globl _sha1_block_data_order
 
-.align 6
-_sha1_block_data_order:
 
+.text
 
+.globl _sha1_block_data_order
 
- ldr x16,L_gnutls_arm_cpuid_s
+.align 6
+_sha1_block_data_order:
 
- adr x17,L_gnutls_arm_cpuid_s
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#(1<<3)
+ adrp x16,__gnutls_arm_cpuid_s@PAGE
+ ldr w16,[x16,__gnutls_arm_cpuid_s@PAGEOFF]
+ tst w16,#(1 << 3)
  b.ne Lv8_entry
 
+
  stp x29,x30,[sp,#-96]!
  add x29,sp,#0
  stp x19,x20,[sp,#16]
@@ -88,7 +86,7 @@ Loop:
  add w24,w24,w28
  add w24,w24,w3
  lsr x4,x3,#32
- ldr x5,[x1,#-56]
+ ldur x5,[x1,#-56]
  bic w25,w23,w21
  and w26,w22,w21
  ror w27,w20,#27
@@ -113,7 +111,7 @@ Loop:
  add w22,w22,w5
  add w23,w23,w25
  lsr x6,x5,#32
- ldr x7,[x1,#-48]
+ ldur x7,[x1,#-48]
  bic w25,w21,w24
  and w26,w20,w24
  ror w27,w23,#27
@@ -138,7 +136,7 @@ Loop:
  add w20,w20,w7
  add w21,w21,w25
  lsr x8,x7,#32
- ldr x9,[x1,#-40]
+ ldur x9,[x1,#-40]
  bic w25,w24,w22
  and w26,w23,w22
  ror w27,w21,#27
@@ -163,7 +161,7 @@ Loop:
  add w23,w23,w9
  add w24,w24,w25
  lsr x10,x9,#32
- ldr x11,[x1,#-32]
+ ldur x11,[x1,#-32]
  bic w25,w22,w20
  and w26,w21,w20
  ror w27,w24,#27
@@ -188,7 +186,7 @@ Loop:
  add w21,w21,w11
  add w22,w22,w25
  lsr x12,x11,#32
- ldr x13,[x1,#-24]
+ ldur x13,[x1,#-24]
  bic w25,w20,w23
  and w26,w24,w23
  ror w27,w22,#27
@@ -213,7 +211,7 @@ Loop:
  add w24,w24,w13
  add w20,w20,w25
  lsr x14,x13,#32
- ldr x15,[x1,#-16]
+ ldur x15,[x1,#-16]
  bic w25,w23,w21
  and w26,w22,w21
  ror w27,w20,#27
@@ -238,7 +236,7 @@ Loop:
  add w22,w22,w15
  add w23,w23,w25
  lsr x16,x15,#32
- ldr x17,[x1,#-8]
+ ldur x17,[x1,#-8]
  bic w25,w21,w24
  and w26,w20,w24
  ror w27,w23,#27
@@ -1122,6 +1120,7 @@ Loop:
 .align 6
 sha1_block_armv8:
 Lv8_entry:
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
 
@@ -1254,12 +1253,6 @@ Lconst:
 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-L_gnutls_arm_cpuid_s:
-
-
-
-.quad __gnutls_arm_cpuid_s-.
-
 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 2
 .align 2
index 3ee6befc664a8172bc0157827a2621a97476c4bd..96212e56dcf5724776c7325b28b2bea385eacf1d 100644 (file)
 #
 # *** This file is auto-generated ***
 #
+# 0 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
-# 56 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
+# 58 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
-# 57 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S" 2
+# 59 "lib/accelerated/aarch64/macosx/sha256-armv8.s.tmp.S" 2
 
 
-.text
+.private_extern __gnutls_arm_cpuid_s
 
 
-.private_extern __gnutls_arm_cpuid_s
+.text
+
 .globl _sha256_block_data_order
 
 .align 6
 _sha256_block_data_order:
 
 
-
-
- ldr x16,L_gnutls_arm_cpuid_s
-
- adr x17,L_gnutls_arm_cpuid_s
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#(1<<4)
+ adrp x16,__gnutls_arm_cpuid_s@PAGE
+ ldr w16,[x16,__gnutls_arm_cpuid_s@PAGEOFF]
+ tst w16,#(1 << 4)
  b.ne Lv8_entry
- tst w16,#(1<<0)
+ tst w16,#(1 << 0)
  b.ne Lneon_entry
 
-.long 0xd503233f
+
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1028,7 +1024,7 @@ Loop_16_xx:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
-.long 0xd50323bf
+
  ret
 
 
@@ -1053,15 +1049,6 @@ LK256:
 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .long 0
 
-
-.align 3
-L_gnutls_arm_cpuid_s:
-
-
-
-.quad __gnutls_arm_cpuid_s-.
-
-
 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 2
 .align 2
@@ -1070,6 +1057,7 @@ L_gnutls_arm_cpuid_s:
 .align 6
 sha256_block_armv8:
 Lv8_entry:
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0
 
@@ -1211,7 +1199,9 @@ Loop_hw:
 
 .align 4
 _sha256_block_neon:
+
 Lneon_entry:
+
  stp x29, x30, [sp, #-16]!
  mov x29, sp
  sub sp,sp,#16*4
index 8c3abda83bd9f7ecc101c2f918b3134f8fcaf686..8426cd656ef7a55d8fddb7201be96142e9968b5a 100644 (file)
 #
 # *** This file is auto-generated ***
 #
+# 0 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
+# 0 "<built-in>"
+# 0 "<command-line>"
 # 1 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
-# 56 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
+# 58 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S"
 # 1 "lib/accelerated/aarch64/aarch64-common.h" 1
-# 57 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S" 2
+# 59 "lib/accelerated/aarch64/macosx/sha512-armv8.s.tmp.S" 2
 
 
-.text
+.private_extern __gnutls_arm_cpuid_s
 
 
-.private_extern __gnutls_arm_cpuid_s
+.text
+
 .globl _sha512_block_data_order
 
 .align 6
 _sha512_block_data_order:
 
 
-
-
- ldr x16,L_gnutls_arm_cpuid_s
-
- adr x17,L_gnutls_arm_cpuid_s
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#(1<<6)
+ adrp x16,__gnutls_arm_cpuid_s@PAGE
+ ldr w16,[x16,__gnutls_arm_cpuid_s@PAGEOFF]
+ tst w16,#(1 << 6)
  b.ne Lv8_entry
 
-.long 0xd503233f
+
  stp x29,x30,[sp,#-128]!
  add x29,sp,#0
 
@@ -1026,7 +1022,7 @@ Loop_16_xx:
  ldp x25,x26,[x29,#64]
  ldp x27,x28,[x29,#80]
  ldp x29,x30,[sp],#128
-.long 0xd50323bf
+
  ret
 
 
@@ -1075,15 +1071,6 @@ LK512:
 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
 .quad 0
 
-
-.align 3
-L_gnutls_arm_cpuid_s:
-
-
-
-.quad __gnutls_arm_cpuid_s-.
-
-
 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 2
 .align 2
@@ -1092,6 +1079,7 @@ L_gnutls_arm_cpuid_s:
 .align 6
 sha512_block_armv8:
 Lv8_entry:
+
  stp x29,x30,[sp,#-16]!
  add x29,sp,#0