/* Copyright (C) 2012-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "memset-reg.h"

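/* Register aliases (see memset-reg.h): dstin=x0, val=x1/valw=w1,
   count=x2, dst=x3, dstend=x4, tmp1=x5/tmp1w=w5, tmp2=x6/tmp2w=w6,
   zva_len=x7/zva_lenw=w7.  */
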
#ifndef MEMSET
# define MEMSET memset
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

ENTRY_ALIGN (MEMSET, 6)

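	/* DELOUSE zero-extends the pointer and size arguments for ILP32;
	   it expands to nothing on LP64 (see sysdep.h).  */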
	DELOUSE (0)
	DELOUSE (2)

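	/* Broadcast the fill byte to all 16 lanes of v0 and compute the
	   end of the buffer; the paths below store from both ends so that
	   overlapping stores replace exact-length bookkeeping.  */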
	dup	v0.16B, valw
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
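	/* Each step tests one bit of count and stores a small chunk from
	   each end, so two overlapping stores cover the range exactly.  */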
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
L(set_medium):
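	/* 16 bytes are stored at each end; bits 6 and 5 of count decide
	   how much of the middle still needs to be covered.  */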
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
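	/* Try DC ZVA only for large zero memsets: ccmp yields EQ when
	   count >= 256 (CS from the cmp) and the fill value is zero.  */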
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
L(no_zva):
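	/* Plain store loop: 64 bytes per iteration via two stp, with the
	   writeback on the second stp advancing dst.  */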
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(try_zva):
#ifdef ZVA_MACRO
	zva_macro
#else
	.p2align 3
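	/* dczid_el0: bit 4 set means DC ZVA is prohibited; bits 3:0 hold
	   log2 of the ZVA block size in words, so 4 means 64 bytes and
	   5 means 128 bytes.  */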
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
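	/* DC ZVA zeroes a whole block directly in the cache, avoiding the
	   read-for-ownership traffic that ordinary stores incur.  */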
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(zva_other):
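	/* Unknown ZVA size: reconstruct it as 4 << log2(words), i.e. the
	   block size in bytes, then bail out to plain stores if the
	   buffer cannot cover the alignment overhead.  */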
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
	cmp	count, tmp1
	blo	L(no_zva)

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len
	sub	dst, dst, 32		/* Bias dst for tail loop.  */
	b	L(tail64)
#endif

END (MEMSET)
libc_hidden_builtin_def (MEMSET)