.p2align 6
L(copy_long):
+ /* On oryon1 cores, large memcpy's are helped by using ldnp/stnp.
+ This loop is identical to the one below it but using ldnp/stnp
+ instructions. For loops that are less than 32768 bytes,
+ the ldnp/stnp instructions will not help and will cause a slow
+ down so only use the ldnp/stnp loop for the largest sizes. */
+
+ cmp count, #32768
+ b.lo L(copy_long_without_nontemp)
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldnp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldnp A_l, A_h, [src, 16]
+ stnp D_l, D_h, [dstin]
+ ldnp B_l, B_h, [src, 32]
+ ldnp C_l, C_h, [src, 48]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
+L(nontemp_loop64):
+ tbz src, #6, 1f
+1:
+ stnp A_l, A_h, [dst, 16]
+ ldnp A_l, A_h, [src, 16]
+ stnp B_l, B_h, [dst, 32]
+ ldnp B_l, B_h, [src, 32]
+ stnp C_l, C_h, [dst, 48]
+ ldnp C_l, C_h, [src, 48]
+ stnp D_l, D_h, [dst, 64]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ add dst, dst, #64
+ subs count, count, 64
+ b.hi L(nontemp_loop64)
+ b L(last64)
+
+L(copy_long_without_nontemp):
+
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]