]>
Commit | Line | Data |
---|---|---|
6d7e8eda | 1 | /* Copyright (C) 2018-2023 Free Software Foundation, Inc. |
c7d3890f FX |
2 | |
3 | This file is part of the GNU C Library. | |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library. If not, see | |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
c7d3890f FX |
18 | |
19 | #include <sysdep.h> | |
20 | #include "memset-reg.h" | |
21 | ||
22 | #ifndef MEMSET /* Keep the symbol name if it was already defined before this point. */ | |
23 | # define MEMSET __memset_base64 /* Default symbol name for this routine. */ | |
24 | #endif | |
25 | ||
b68fabfb | 26 | /* Byte count from which a zero fill switches to DC ZVA: the long path (count > 96) branches to L(zva_64) only when count >= DC_ZVA_THRESHOLD and the fill value is 0. To disable DC ZVA, set this threshold to 0. */ |
c7d3890f FX |
27 | #ifndef DC_ZVA_THRESHOLD |
28 | # define DC_ZVA_THRESHOLD 512 | |
29 | #endif | |
30 | ||
31 | /* Assumptions: | |
32 | * | |
33 | * ARMv8-a, AArch64, unaligned accesses | |
34 | * | |
35 | */ | |
36 | ||
37 | ENTRY_ALIGN (MEMSET, 6) /* Entry: replicate the fill byte, then dispatch on count (0..15 here, 16..96 at L(set_medium), above 96 at L(set_long)). dstin, val/valw, count, dst and dstend are register aliases not defined in this file (presumably from memset-reg.h). */ | |
38 | ||
45b1e17e SN |
39 | PTR_ARG (0) |
40 | SIZE_ARG (2) | |
c7d3890f FX |
41 | |
42 | bfi valw, valw, 8, 8 /* valw[15:8] = valw[7:0] */ | |
43 | bfi valw, valw, 16, 16 /* valw[31:16] = valw[15:0] */ | |
44 | bfi val, val, 32, 32 /* val[63:32] = val[31:0]; assuming valw is the low half of val, the fill byte now occupies all 8 bytes */ | |
45 | ||
46 | add dstend, dstin, count /* dstend = one past the last byte to set */ | |
47 | ||
48 | cmp count, 96 | |
49 | b.hi L(set_long) /* count > 96 (unsigned) */ | |
50 | cmp count, 16 | |
51 | b.hs L(set_medium) /* 16 <= count <= 96 */ | |
52 | ||
53 | /* Set 0..15 bytes. */ | |
54 | tbz count, 3, 1f /* bit 3 clear: count is 0..7 */ | |
55 | str val, [dstin] /* 8..15 bytes: 8 bytes at the start ... */ | |
56 | str val, [dstend, -8] /* ... and 8 bytes ending at dstend; the two stores overlap when count < 16 */ | |
57 | ret | |
58 | ||
59 | .p2align 3 | |
60 | 1: tbz count, 2, 2f /* bit 2 clear: count is 0..3 */ | |
61 | str valw, [dstin] /* 4..7 bytes: 4 bytes at the start ... */ | |
62 | str valw, [dstend, -4] /* ... and 4 bytes ending at dstend */ | |
63 | ret | |
64 | 2: cbz count, 3f /* count == 0: nothing to store */ | |
65 | strb valw, [dstin] /* 1..3 bytes: first byte */ | |
66 | tbz count, 1, 3f /* bit 1 clear: count == 1, done */ | |
67 | strh valw, [dstend, -2] /* 2..3 bytes: last two bytes */ | |
68 | 3: ret | |
69 | ||
70 | .p2align 3 | |
71 | /* Set 16..96 bytes. Stores are addressed from both dstin and dstend and may overlap, so no loop is needed. */ | |
72 | L(set_medium): | |
73 | stp val, val, [dstin] /* bytes 0..15 from the start */ | |
74 | tbnz count, 6, L(set96) /* bit 6 set: count is 64..96 */ | |
75 | stp val, val, [dstend, -16] /* count is 16..63: last 16 bytes */ | |
76 | tbz count, 5, 1f /* bit 5 clear: count is 16..31, already covered */ | |
77 | stp val, val, [dstin, 16] /* count is 32..63: bytes 16..31 from the start ... */ | |
78 | stp val, val, [dstend, -32] /* ... and the 16 bytes ending at dstend - 16 */ | |
79 | 1: ret | |
80 | ||
81 | .p2align 4 | |
82 | /* Set 64..96 bytes. Write 64 bytes from the start and | |
83 | 32 bytes from the end. Bytes 0..15 were already stored at L(set_medium) before branching here. */ | |
84 | L(set96): | |
85 | stp val, val, [dstin, 16] /* bytes 16..31 */ | |
86 | stp val, val, [dstin, 32] /* bytes 32..47 */ | |
87 | stp val, val, [dstin, 48] /* bytes 48..63 */ | |
88 | stp val, val, [dstend, -32] /* last 32 bytes, first half */ | |
89 | stp val, val, [dstend, -16] /* last 32 bytes, second half */ | |
90 | ret | |
91 | ||
92 | .p2align 4 | |
93 | L(set_long): /* count > 96: unaligned head store, 64-byte store loop on a 16-byte aligned dst, then tail stores addressed from dstend. */ | |
94 | stp val, val, [dstin] /* first 16 bytes at the original, possibly unaligned, start */ | |
b68fabfb FX |
95 | bic dst, dstin, 15 /* dst = dstin rounded down to a 16-byte boundary */ |
96 | #if DC_ZVA_THRESHOLD | |
c7d3890f FX |
97 | cmp count, DC_ZVA_THRESHOLD |
98 | ccmp val, 0, 0, cs /* if count >= threshold compare val with 0, otherwise force flags to "not equal" */ | |
c7d3890f | 99 | b.eq L(zva_64) /* taken only for count >= threshold and val == 0 */ |
b68fabfb | 100 | #endif |
c7d3890f FX |
101 | /* Small-size or non-zero memset does not use DC ZVA. */ |
102 | sub count, dstend, dst /* bytes from the aligned dst to the end */ | |
103 | ||
104 | /* | |
105 | * Adjust count and bias for loop. By subtracting extra 1 from count, | |
106 | * it is easy to use tbz instruction to check whether loop trailing | |
107 | * count is less than 33 bytes, so as to bypass 2 unnecessary stps. | |
108 | */ | |
109 | sub count, count, 64+16+1 | |
b68fabfb FX |
110 | |
111 | #if DC_ZVA_THRESHOLD | |
112 | /* Align loop on 16-byte boundary, this might be friendly to i-cache. */ | |
c7d3890f | 113 | nop |
b68fabfb | 114 | #endif |
c7d3890f FX |
115 | |
116 | 1: stp val, val, [dst, 16] /* each pass stores the 64 bytes at dst+16 .. dst+79 */ | |
117 | stp val, val, [dst, 32] | |
118 | stp val, val, [dst, 48] | |
119 | stp val, val, [dst, 64]! /* pre-index with writeback: dst += 64 */ | |
120 | subs count, count, 64 | |
121 | b.hs 1b /* repeat while the subtraction did not borrow */ | |
122 | ||
123 | tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ | |
124 | stp val, val, [dst, 16] /* otherwise store 32 more bytes at dst+16 .. dst+47 */ | |
125 | stp val, val, [dst, 32] | |
126 | 1: stp val, val, [dstend, -32] /* final 32 bytes addressed back from dstend; may overlap bytes already written */ | |
127 | stp val, val, [dstend, -16] | |
128 | ret | |
129 | ||
b68fabfb | 130 | #if DC_ZVA_THRESHOLD |
c7d3890f FX |
131 | .p2align 3 |
132 | L(zva_64): /* Zero fill via DC ZVA. Reached from L(set_long) only when val == 0 and count >= DC_ZVA_THRESHOLD; on entry dst is dstin rounded down to 16 bytes and the first 16 bytes at dstin are already stored. NOTE(review): the loop assumes a 64-byte DC ZVA block and no DCZID_EL0 check is visible in this file; presumably the code selecting this routine guarantees it - confirm. */ | |
133 | stp val, val, [dst, 16] /* fill up to dst+63 so the head is complete before dst is realigned */ | |
134 | stp val, val, [dst, 32] | |
135 | stp val, val, [dst, 48] | |
136 | bic dst, dst, 63 /* dst rounded down to a 64-byte boundary */ | |
137 | ||
138 | /* | |
139 | * Previous memory writes might cross cache line boundary, and cause | |
140 | * cache line partially dirty. Zeroing this kind of cache line using | |
141 | * DC ZVA will incur extra cost, for it requires loading untouched | |
142 | * part of the line from memory before zeroing. | |
143 | * | |
144 | * So, write the first 64 byte aligned block using stp to force | |
145 | * fully dirty cache line. | |
146 | */ | |
147 | stp val, val, [dst, 64] | |
148 | stp val, val, [dst, 80] | |
149 | stp val, val, [dst, 96] | |
150 | stp val, val, [dst, 112] | |
151 | ||
152 | sub count, dstend, dst /* bytes from the 64-byte aligned dst to the end */ | |
153 | /* | |
154 | * Adjust count and bias for loop. By subtracting extra 1 from count, | |
155 | * it is easy to use tbz instruction to check whether loop trailing | |
156 | * count is less than 33 bytes, so as to bypass 2 unnecessary stps. | |
157 | */ | |
158 | sub count, count, 128+64+64+1 | |
159 | add dst, dst, 128 /* skip the 128 bytes already covered by the stores above */ | |
160 | nop /* NOTE(review): presumably padding to align the loop below - confirm */ | |
161 | ||
162 | /* DC ZVA sets 64 bytes each time. */ | |
163 | 1: dc zva, dst | |
164 | add dst, dst, 64 | |
165 | subs count, count, 64 | |
166 | b.hs 1b /* repeat while the subtraction did not borrow */ | |
167 | ||
168 | /* | |
169 | * Write the last 64 byte aligned block using stp to force fully | |
170 | * dirty cache line. | |
171 | */ | |
172 | stp val, val, [dst, 0] | |
173 | stp val, val, [dst, 16] | |
174 | stp val, val, [dst, 32] | |
175 | stp val, val, [dst, 48] | |
176 | ||
177 | tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ | |
178 | stp val, val, [dst, 64] /* otherwise store 32 more bytes at dst+64 .. dst+95 */ | |
179 | stp val, val, [dst, 80] | |
180 | 1: stp val, val, [dstend, -32] /* final 32 bytes addressed back from dstend; may overlap bytes already written */ | |
181 | stp val, val, [dstend, -16] | |
182 | ret | |
b68fabfb | 183 | #endif |
c7d3890f FX |
184 | |
185 | END (MEMSET) /* Closes the routine opened by ENTRY_ALIGN (MEMSET, 6). */ | |
186 | libc_hidden_builtin_def (MEMSET) /* NOTE(review): macro defined outside this file; presumably provides the hidden internal definition of MEMSET - confirm. */ |