// Diff hunk: load two 64-bit words from [$inp] into $r0/$r1 and build a
// constant in $s1; on big-endian builds the loaded words are byte-reversed.
ldp $r0,$r1,[$inp] // load key
// mov followed by movk(lsl#48) replaces bits 63:48 with 0x0fff, leaving
// $s1 = 0x0ffffffc0fffffff.
// NOTE(review): presumably the Poly1305 clamp mask; its use is not visible here.
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
// The patch replaces the __ARMEB__ guard with __AARCH64EB__.
// NOTE(review): __AARCH64EB__ is the big-endian macro predefined by AArch64
// compilers, __ARMEB__ the 32-bit ARM one - presumably the reason for the swap.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
#endif
// Diff hunk: top of the loop. Reads 16 bytes of input into $t0/$t1,
// post-incrementing $inp by 16, and subtracts 16 from $len.
.Loop:
ldp $t0,$t1,[$inp],#16 // load input
// Plain sub (not subs): the condition flags are left untouched.
sub $len,$len,#16
// The patch replaces the __ARMEB__ guard with __AARCH64EB__; under the guard
// each loaded 64-bit word is byte-reversed.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $t0,$t0
rev $t1,$t1
#endif
// Diff hunk: choose between $h and $d, add $t1:$t0, then byte-swap the result
// on big-endian builds.
// Keep $h0/$h1 when the "eq" condition holds, otherwise take $d0/$d1.
// The instruction that sets the flags is not part of this excerpt.
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
// The patch replaces the __ARMEB__ guard with __AARCH64EB__.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
// ror by 32 exchanges the upper and lower 32-bit halves of each register.
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
// 128-bit addition: adds sets the carry from the low word, adc folds it into
// the high word. adc itself does not update the flags.
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
// Same guard-macro replacement as above.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
// Diff hunks: carry propagation, then a 16-byte input load into $d0/$d1.
// Add the incoming carry to $h1 (updating flags), then the resulting carry
// to $h2. The instruction producing the incoming carry is not shown here.
adcs $h1,$h1,xzr
adc $h2,$h2,xzr
// The patch replaces the __ARMEB__ guard with __AARCH64EB__.
// NOTE(review): as excerpted, these byte swaps precede the ldp that overwrites
// $d0/$d1 - probably context from a different place in the file; confirm
// against the full source.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
// Read 16 bytes into $d0/$d1 and post-increment $inp by 16.
ldp $d0,$d1,[$inp],#16 // load input
// Plain sub (not subs): the condition flags are left untouched.
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
// Same guard-macro replacement; byte-reverse the freshly loaded words on
// big-endian builds.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
// Diff hunks from a section using vector registers: shift the pad bit, point
// x15 at $ctx+48, and load vector registers from there.
// Shift $padbit left by 24 bit positions.
lsl $padbit,$padbit,#24
// x15 = $ctx + 48.
// NOTE(review): presumably the start of a precomputed table in the context;
// the layout is not visible here.
add x15,$ctx,#48
// The patch replaces the __ARMEB__ guard with __AARCH64EB__; under the guard
// x8, x12 and x9 are byte-reversed.
// NOTE(review): no closing #endif for this guard is visible in the excerpt.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
// Load four vector registers from [x15], post-incrementing x15 by 64, then
// one more register from the updated address (no write-back).
ld1 {$S2,$R3,$S3,$R4},[x15],#64
ld1 {$S4},[x15]
// Same guard-macro replacement; again no closing #endif is visible here.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
// Diff hunks from the vector multiply code: widening multiplies by a single
// lane, interleaved with a scalar input load and big-endian byte swaps.
// umull: unsigned widening multiply of $IN23_0 by lane 2 of $R1 / $R0,
// writing (not accumulating into) $ACC1 / $ACC0.
umull $ACC1,$IN23_0,${R1}[2]
// Load two 64-bit words from [$in2] and post-increment $in2 by 48.
ldp x9,x13,[$in2],#48
umull $ACC0,$IN23_0,${R0}[2]
// The patch replaces the __ARMEB__ guard with __AARCH64EB__; under the guard
// x8, x12 and x9 are byte-reversed.
// NOTE(review): no closing #endif for this guard is visible in the excerpt,
// and x13 (loaded above) is not swapped in the lines shown.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
// umlal: unsigned widening multiply-accumulate of $IN01_2 by lane 0 of
// $R2 / $S4 / $R0 into $ACC4 / $ACC1 / $ACC2.
umlal $ACC4,$IN01_2,${R2}[0]
umlal $ACC1,$IN01_2,${S4}[0]
umlal $ACC2,$IN01_2,${R0}[0]
// Same guard-macro replacement; again no closing #endif is visible here.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
// Diff hunk: conditional select, 128-bit add of $t1:$t0, and a big-endian
// byte swap of the result.
// $h0/$h1 are kept if "eq" holds, else replaced by $d0/$d1; the flag-setting
// instruction lies outside the lines shown.
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
// The patch replaces the __ARMEB__ guard with __AARCH64EB__.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
// Rotating by 32 swaps the two 32-bit halves of each 64-bit register.
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
// Low word first (adds sets carry), then high word plus carry (adc); the
// carry out of the high word is not captured by adc.
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
// Same guard-macro replacement as above.
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif