/* sysdeps/aarch64/multiarch/memset_base64.S:
   Optimized memset specific to AmpereComputing emag.  */

/* Copyright (C) 2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "memset-reg.h"

#ifndef MEMSET
# define MEMSET __memset_base64
#endif

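/* Lengths of at least DC_ZVA_THRESHOLD bytes may use DC ZVA (block zeroing)
   when the fill value is zero; anything shorter, or any non-zero fill, uses
   plain stores.  */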
#ifndef DC_ZVA_THRESHOLD
# define DC_ZVA_THRESHOLD 512
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

ENTRY_ALIGN (MEMSET, 6)

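	/* Zero-extend the dstin (x0) and count (x2) arguments when built
	   for ILP32; DELOUSE expands to nothing for LP64.  */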
	DELOUSE (0)
	DELOUSE (2)

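	/* Replicate the low byte of val into all 8 bytes of the register:
	   byte -> halfword -> word -> doubleword.  */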
	bfi	valw, valw, 8, 8
	bfi	valw, valw, 16, 16
	bfi	val, val, 32, 32

	add	dstend, dstin, count

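	/* Dispatch on size: more than 96 bytes goes to set_long, 16..96
	   bytes to set_medium, and 0..15 bytes falls through.  */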
	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)

	/* Set 0..15 bytes.  */
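	/* If bit 3 of count is set (count >= 8), two possibly overlapping
	   8-byte stores cover the whole range.  */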
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret

	.p2align 3
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	.p2align 3
	/* Set 16..96 bytes.  */
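	/* Store 16 bytes from the start, then branch to set96 when bit 6
	   of count is set (64..96 bytes).  Otherwise store 16 bytes from
	   the end, plus 32 more bytes in the middle when count > 32.  */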
L(set_medium):
	stp	val, val, [dstin]
	tbnz	count, 6, L(set96)
	stp	val, val, [dstend, -16]
	tbz	count, 5, 1f
	stp	val, val, [dstin, 16]
	stp	val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	stp	val, val, [dstin, 16]
	stp	val, val, [dstin, 32]
	stp	val, val, [dstin, 48]
	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 4
L(set_long):
	stp	val, val, [dstin]
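	/* Take the DC ZVA path only when the value is zero and count is at
	   least DC_ZVA_THRESHOLD: the ccmp leaves NE when count is below
	   the threshold, so b.eq is taken only if both conditions hold.
	   dst is dstin rounded down to a 16-byte boundary; the unaligned
	   16 bytes at dstin were already stored above.  */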
	cmp	count, DC_ZVA_THRESHOLD
	ccmp	val, 0, 0, cs
	bic	dst, dstin, 15
	b.eq	L(zva_64)

	/* Small-size or non-zero memset does not use DC ZVA.  */
	sub	count, dstend, dst

	/*
	 * Adjust count and bias for the loop.  Subtracting an extra 1 from
	 * count makes it possible to use a single tbz to check whether the
	 * loop tail is less than 33 bytes, so as to bypass 2 unnecessary
	 * stps.
	 */
	sub	count, count, 64+16+1
	nop

1:	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	stp	val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 3
L(zva_64):
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
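	/* Align dst down to a 64-byte boundary; the DC ZVA loop below
	   zeroes one 64-byte block per iteration.  */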
	bic	dst, dst, 63

	/*
	 * Previous memory writes might cross a cache line boundary and
	 * leave a cache line partially dirty.  Zeroing such a line with
	 * DC ZVA incurs extra cost, since the untouched part of the line
	 * has to be loaded from memory before it can be zeroed.
	 *
	 * So, write the first 64-byte aligned block using stp to force a
	 * fully dirty cache line.
	 */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
	stp	val, val, [dst, 96]
	stp	val, val, [dst, 112]

	sub	count, dstend, dst
	/*
	 * Adjust count and bias for the loop.  Subtracting an extra 1 from
	 * count makes it possible to use a single tbz to check whether the
	 * loop tail is less than 33 bytes, so as to bypass 2 unnecessary
	 * stps.
	 */
	sub	count, count, 128+64+64+1
	add	dst, dst, 128
	nop

	/* DC ZVA zeroes 64 bytes each time.  */
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hs	1b

	/*
	 * Write the last 64-byte aligned block using stp to force a fully
	 * dirty cache line.
	 */
	stp	val, val, [dst, 0]
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

END (MEMSET)
libc_hidden_builtin_def (MEMSET)