/* Source extracted from a git annotate/blame view of glibc;
   introducing commit: c7d3890f.  Table markup from the export has
   been repaired.  */
/* Copyright (C) 2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>
#include "memset-reg.h"

/* Symbol name for the generated function.  Overridable so this source
   can be built into differently-named variants (presumably for IFUNC
   selection -- confirm against the multiarch build rules).  */
#ifndef MEMSET
# define MEMSET __memset_base64
#endif

/* Minimum fill size (bytes) at which a zero memset switches to the
   DC ZVA path.  Overridable per-CPU at build time.  */
#ifndef DC_ZVA_THRESHOLD
# define DC_ZVA_THRESHOLD 512
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */
/* void *MEMSET (void *dstin, int val, size_t count)

   memset using base AArch64 integer stores, with DC ZVA for large
   zero fills.  Returns dstin (x0 is never clobbered below).

   Register aliases (dstin, valw/val, count, dst, dstend) are defined
   in memset-reg.h -- presumably dstin=x0, valw/val=w1/x1, count=x2,
   with dst and dstend as scratch registers; confirm against that
   header.  */
ENTRY_ALIGN (MEMSET, 6)

	/* DELOUSE zero-extends pointer/size argument registers when
	   building for ILP32; it is a no-op on LP64.  */
	DELOUSE (0)
	DELOUSE (2)

	/* Broadcast the low byte of val across all 8 bytes of the
	   register: b -> bb -> bbbb -> bbbbbbbb.  */
	bfi	valw, valw, 8, 8
	bfi	valw, valw, 16, 16
	bfi	val, val, 32, 32

	/* dstend = one past the last byte to be written.  Small cases
	   below store overlapping chunks from dstin forward and from
	   dstend backward, covering any size without a branch per
	   byte.  */
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f		/* Bit 3 clear => size < 8.  */
	str	val, [dstin]
	str	val, [dstend, -8]	/* Overlapping stores cover 8..15.  */
	ret

	.p2align 3
1:	tbz	count, 2, 2f		/* Bit 2 clear => size < 4.  */
	str	valw, [dstin]
	str	valw, [dstend, -4]	/* Overlapping stores cover 4..7.  */
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]	/* Sizes 1..3.  */
3:	ret

	.p2align 3
	/* Set 16..96 bytes.  */
L(set_medium):
	stp	val, val, [dstin]
	tbnz	count, 6, L(set96)	/* Bit 6 set => size >= 64.  */
	stp	val, val, [dstend, -16]
	tbz	count, 5, 1f		/* Bit 5 clear => size < 32: done.  */
	stp	val, val, [dstin, 16]
	stp	val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	stp	val, val, [dstin, 16]
	stp	val, val, [dstin, 32]
	stp	val, val, [dstin, 48]
	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 4
L(set_long):
	stp	val, val, [dstin]
	/* Take the DC ZVA path only when count >= DC_ZVA_THRESHOLD
	   (unsigned: CS) AND the fill value is zero.  ccmp compares
	   val with 0 only if CS holds; otherwise it forces NZCV=0,
	   so b.eq is not taken.  */
	cmp	count, DC_ZVA_THRESHOLD
	ccmp	val, 0, 0, cs
	bic	dst, dstin, 15		/* 16-byte align the write pointer;
					   the head stp above already covered
					   the unaligned start.  */
	b.eq	L(zva_64)

	/* Small-size or non-zero memset does not use DC ZVA.  */
	sub	count, dstend, dst

	/*
	 * Adjust count and bias for loop.  By subtracting an extra 1 from
	 * count, it is easy to use a tbz instruction to check whether the
	 * loop tail count is less than 33 bytes, so as to bypass 2
	 * unnecessary stps.
	 */
	sub	count, count, 64+16+1
	nop				/* Presumably aligns the loop head
					   for fetch -- kept as upstream.  */

	/* Main store loop: 64 bytes per iteration via four stps, with a
	   pre-indexed writeback advancing dst.  */
1:	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	stp	val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	/* Final (possibly overlapping) stores from the end catch the
	   unaligned tail.  */
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 3
L(zva_64):
	/* Fill up to the first 64-byte boundary with stp, then round
	   dst down so DC ZVA operates on whole 64-byte lines.  */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	bic	dst, dst, 63

	/*
	 * Previous memory writes might cross a cache line boundary and
	 * leave a cache line partially dirty.  Zeroing that kind of cache
	 * line using DC ZVA will incur extra cost, for it requires
	 * loading the untouched part of the line from memory before
	 * zeroing.
	 *
	 * So, write the first 64-byte aligned block using stp to force a
	 * fully dirty cache line.
	 */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
	stp	val, val, [dst, 96]
	stp	val, val, [dst, 112]

	sub	count, dstend, dst
	/*
	 * Adjust count and bias for loop.  By subtracting an extra 1 from
	 * count, it is easy to use a tbz instruction to check whether the
	 * loop tail count is less than 33 bytes, so as to bypass 2
	 * unnecessary stps.  The 128 skips the stp-filled block above;
	 * the trailing 64+64 reserves the final stp block and tail.
	 */
	sub	count, count, 128+64+64+1
	add	dst, dst, 128
	nop				/* Loop-head alignment, as upstream.  */

	/* DC ZVA sets 64 bytes each time.  */
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hs	1b

	/*
	 * Write the last 64-byte aligned block using stp to force a fully
	 * dirty cache line (same partial-line argument as above, at the
	 * trailing edge).
	 */
	stp	val, val, [dst, 0]
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
	/* Final overlapping stores from dstend cover the unaligned tail.  */
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

END (MEMSET)
libc_hidden_builtin_def (MEMSET)