git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / loongarch / lp64 / multiarch / strcpy-lasx.S
1 /* Optimized strcpy and stpcpy implementation using LoongArch LASX instructions.
2 Copyright (C) 2023-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20 #include <sys/regdef.h>
21 #include <sys/asm.h>
22
/* Everything down to the matching #endif at the end of the file is
   assembled only when building libc itself and only when
   __loongarch_soft_float is not defined.  */
23 #if IS_IN (libc) && !defined __loongarch_soft_float
24
/* Default symbol name for the routine.  A file that defines STRCPY
   before this point replaces the name.  */
25 # ifndef STRCPY
26 # define STRCPY __strcpy_lasx
27 # endif
28
/* dstend names the register that receives the address of the copied
   terminating NUL byte.
   - With USE_AS_STPCPY it is a0, so that address is what the routine
     leaves in a0 (the return-value register).
   - Otherwise it is the scratch register a4, and a0 keeps the value it
     had on entry.  */
29 # ifdef USE_AS_STPCPY
30 # define dstend a0
31 # else
32 # define dstend a4
33 # endif
34
/* STRCPY: copy the NUL-terminated string at a1 to the buffer at a0,
   32 bytes at a time, using 256-bit LASX vectors.

   In:   a0 = destination (copied to a2; all stores go through a2 or
              dstend)
         a1 = source (all loads go through a1, a4 or a5)
   Out:  a0 = unchanged when dstend is a4; address of the stored NUL
              when dstend is a0 (the USE_AS_STPCPY build).

   Register roles:
     a2      running destination pointer
     a1      running source pointer
     a3      source address & 0x1f, computed once the destination has
             been made 32-byte aligned
     a4      source rounded down to 32 bytes (boundary-crossing paths);
             also dstend in the non-stpcpy build
     a5      address of the source NUL (short-string tail only)
     t7      constant -1: the "every byte non-zero" mask value
     t8      constant 0xfe0 = 0x1000 - 32: largest offset within a
             4 KiB block at which a 32-byte load still fits
     t0-t3   scratch
     xr0/xr1 vector data and mask scratch; fcc0 = "vector has a zero
             byte" flag

   Outline:
     1. Load the first 32 source bytes.  If the load would run past a
        4 KiB boundary, first inspect the enclosing aligned block so no
        byte beyond a NUL on the far side of the boundary is touched.
     2. If those bytes hold the NUL, finish through the short-string
        ladder at L(tail).
     3. Otherwise store them, advance both pointers so that the
        destination becomes 32-byte aligned, and continue in the
        aligned loop (source also aligned) or in the unaligned loop.
     4. When the NUL is found, copy the last 32 bytes ending at the NUL
        with one overlapping load/store pair.

   The mask sequence  xvmsknz.b / xvpickve.w / vilvl.h / movfr2gr.s
   appears several times.  It condenses xr0 into a 32-bit value in t0
   whose bit i is 1 when byte i of the vector is non-zero.  cto.w
   (count trailing ones) then yields the index of the first zero byte.

   NOTE(review): LEAF and END are defined outside this file.  The
   second LEAF argument is presumably the log2 of the entry alignment;
   confirm in the header that defines it.  */
35 LEAF(STRCPY, 6)
/* Set up the constants and check whether a 32-byte load at a1 would run
   past a 4 KiB boundary (offset within the block > 0xfe0).  */
36 ori t8, zero, 0xfe0
37 andi t0, a1, 0xfff
38 li.d t7, -1
39 move a2, a0
40
41 bltu t8, t0, L(page_cross_start)
/* First 32 bytes.  t0 = 32 - (dst & 0x1f), a value in 1..32, is the
   step that brings the destination to the next 32-byte boundary.  */
42 L(start_entry):
43 xvld xr0, a1, 0
44 li.d t0, 32
45 andi t1, a2, 0x1f
46
47 xvsetanyeqz.b fcc0, xr0
48 sub.d t0, t0, t1
/* NUL within the first 32 bytes: string length is below 32.  */
49 bcnez fcc0, L(end)
50 add.d a1, a1, t0
51
/* Store all 32 bytes at the original destination, then advance by t0
   only.  Bytes past the new a2 are written again by the next store.  */
52 xvst xr0, a2, 0
53 andi a3, a1, 0x1f
54 add.d a2, a2, t0
/* a2 is now 32-byte aligned; a3 != 0 means the source is not.  */
55 bnez a3, L(unaligned)
56
57
/* Both pointers are 32-byte aligned.  An aligned 32-byte load stays
   inside one 32-byte block, so this loop needs no boundary check.  */
58 xvld xr0, a1, 0
59 xvsetanyeqz.b fcc0, xr0
60 bcnez fcc0, L(al_end)
61 L(al_loop):
62 xvst xr0, a2, 0
63
64 xvld xr0, a1, 32
65 addi.d a2, a2, 32
66 addi.d a1, a1, 32
67 xvsetanyeqz.b fcc0, xr0
68
69 bceqz fcc0, L(al_loop)
/* xr0 is the vector at a1 that contains the NUL.  t0 becomes the index
   of the NUL within it.  */
70 L(al_end):
71 xvmsknz.b xr0, xr0
72 xvpickve.w xr1, xr0, 4
73 vilvl.h vr0, vr1, vr0
74
75 movfr2gr.s t0, fa0
76 cto.w t0, t0
/* a1 = address of the source NUL.  Reload the 32 bytes that end at it.
   At least 32 NUL-free bytes were copied earlier, so this window lies
   inside the string.  */
77 add.d a1, a1, t0
78 xvld xr0, a1, -31
79
80
/* dstend = address of the destination NUL.  Store the same 32-byte
   window, overlapping bytes that were already written.  */
81 add.d dstend, a2, t0
82 xvst xr0, dstend, -31
83 jr ra
/* NOTE(review): this nop follows the return and is never executed on
   this path.  It is presumably padding that keeps the following code
   at its intended alignment; confirm before removing.  */
84 nop
85
/* The first load would run past a 4 KiB boundary.  Load the 32-byte
   aligned block that contains a1 instead (bstrins.d clears bits 4..0
   of a4) and build its non-zero mask.  */
86 L(page_cross_start):
87 move a4, a1
88 bstrins.d a4, zero, 4, 0
89 xvld xr0, a4, 0
90 xvmsknz.b xr0, xr0
91
92 xvpickve.w xr1, xr0, 4
93 vilvl.h vr0, vr1, vr0
94 movfr2gr.s t0, fa0
/* Arithmetic shift by the low five bits of a1: bit 0 now describes the
   byte at a1, and the vacated top bits repeat the bit of the block's
   last byte.  */
95 sra.w t0, t0, a1
96
/* t0 == -1: no NUL between a1 and the end of the block, so the string
   continues past the boundary and the normal load is safe.  Otherwise
   t0 is already the mask that L(tail) expects.  */
97 beq t0, t7, L(start_entry)
98 b L(tail)
/* Destination aligned, source not.  Every unaligned 32-byte source load
   is preceded by the same 4 KiB boundary check as at entry.  */
99 L(unaligned):
100 andi t0, a1, 0xfff
101 bltu t8, t0, L(un_page_cross)
102
103
104 L(un_start_entry):
105 xvld xr0, a1, 0
106 xvsetanyeqz.b fcc0, xr0
/* NUL found; a1 has not been advanced yet.  */
107 bcnez fcc0, L(un_end)
108 addi.d a1, a1, 32
109
/* Loop invariant at L(un_loop): xr0 holds 32 NUL-free bytes still to
   be stored at a2, and a1 already points at the next bytes to load.  */
110 L(un_loop):
111 xvst xr0, a2, 0
112 andi t0, a1, 0xfff
113 addi.d a2, a2, 32
114 bltu t8, t0, L(page_cross_loop)
115
116 L(un_loop_entry):
117 xvld xr0, a1, 0
118 addi.d a1, a1, 32
119 xvsetanyeqz.b fcc0, xr0
120 bceqz fcc0, L(un_loop)
121
/* Undo the advance so a1 addresses the vector that holds the NUL.  */
122 addi.d a1, a1, -32
123 L(un_end):
124 xvmsknz.b xr0, xr0
125 xvpickve.w xr1, xr0, 4
126 vilvl.h vr0, vr1, vr0
127
128
129 movfr2gr.s t0, fa0
/* Entered with t0 = non-zero mask whose bit 0 describes the byte at
   a1, and a2 = the destination address that corresponds to a1.  Copy
   the 32 bytes that end at the NUL, as at L(al_end).  */
130 L(un_tail):
131 cto.w t0, t0
132 add.d a1, a1, t0
133 xvld xr0, a1, -31
134
135 add.d dstend, a2, t0
136 xvst xr0, dstend, -31
137 jr ra
/* Boundary check for the first unaligned load.  a3 is still a1 & 0x1f,
   so a4 = a1 - a3 is a1 rounded down to 32 bytes.  Mask handling is
   the same as in L(page_cross_start).  */
138 L(un_page_cross):
139 sub.d a4, a1, a3
140
141 xvld xr0, a4, 0
142 xvmsknz.b xr0, xr0
143 xvpickve.w xr1, xr0, 4
144 vilvl.h vr0, vr1, vr0
145
146 movfr2gr.s t0, fa0
147 sra.w t0, t0, a1
148 beq t0, t7, L(un_start_entry)
149 b L(un_tail)
150
151
/* Boundary check inside the unaligned loop.  a1 has only moved in
   steps of 32 since a3 was computed, so a1 - a3 is still a1 rounded
   down to 32 bytes.  */
152 L(page_cross_loop):
153 sub.d a4, a1, a3
154 xvld xr0, a4, 0
155 xvmsknz.b xr0, xr0
156 xvpickve.w xr1, xr0, 4
157
158 vilvl.h vr0, vr1, vr0
159 movfr2gr.s t0, fa0
160 sra.w t0, t0, a1
161 beq t0, t7, L(un_loop_entry)
162
163 b L(un_tail)
/* The first 32 source bytes contain the NUL.  Build the mask in t0.  */
164 L(end):
165 xvmsknz.b xr0, xr0
166 xvpickve.w xr1, xr0, 4
167 vilvl.h vr0, vr1, vr0
168
169 movfr2gr.s t0, fa0
/* Short string.  After cto.w, t0 is the string length (0..31).
   dstend and a5 address the NUL in the destination and the source.
   Each size class below copies the first K and the last K bytes
   (ending at the NUL) with two possibly overlapping accesses, so
   length + 1 bytes are written in total.  */
170 L(tail):
171 cto.w t0, t0
172 add.d dstend, a2, t0
173 add.d a5, a1, t0
174
/* Length 16..31: two 16-byte accesses.  No branch in this file targets
   L(less_32); it is reached by fall-through.  */
175 L(less_32):
176 srli.d t1, t0, 4
177 beqz t1, L(less_16)
178 vld vr0, a1, 0
179 vld vr1, a5, -15
180
181 vst vr0, a2, 0
182 vst vr1, dstend, -15
183 jr ra
/* Length 8..15: two 8-byte accesses.  */
184 L(less_16):
185 srli.d t1, t0, 3
186
187 beqz t1, L(less_8)
188 ld.d t2, a1, 0
189 ld.d t3, a5, -7
190 st.d t2, a2, 0
191
192 st.d t3, dstend, -7
193 jr ra
/* Length 3..7: two 4-byte accesses; shorter strings go to L(less_3).  */
194 L(less_8):
195 li.d t1, 3
196 bltu t0, t1, L(less_3)
197
198 ld.w t2, a1, 0
199 ld.w t3, a5, -3
200 st.w t2, a2, 0
201 st.w t3, dstend, -3
202
203 jr ra
/* Length 0..2.  Length 0 stores only the NUL.  Otherwise copy the first
   two source bytes, then write the NUL explicitly at dstend.  */
204 L(less_3):
205 beqz t0, L(zero_byte)
206 ld.h t2, a1, 0
207
208 st.h t2, a2, 0
209 L(zero_byte):
210 st.b zero, dstend, 0
211 jr ra
212 END(STRCPY)
213
/* NOTE(review): libc_hidden_builtin_def is defined outside this file.
   It presumably creates the hidden alias used for libc-internal calls;
   confirm against its definition.  */
214 libc_hidden_builtin_def (STRCPY)
/* Closes the IS_IN (libc) / !__loongarch_soft_float guard.  */
215 #endif