/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   int [r3] strlen (const void *s [r3])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */
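
/* Overview of the code below:

   1. Handle the unaligned head: load the 16B block containing s with lvx,
      shift the string bytes to the front of the vector, pad with 0xff and
      return early if a null byte is present.
   2. From the following 16B boundary, test 16B at a time (one unrolled
      64B check, then 48B per iteration) until the pointer is 64B aligned.
   3. Main loop: test 64B per iteration by merging four quadwords with
      vminub and issuing a single vcmpequb.
   4. On a match, locate the exact byte with vbpermq/cnttzd and return
      (block address - s) + offset.  */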

.machine power9
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 2

	vspltisb	v18,0	/* Zero bytes, the byte we search for.  */
	vspltisb	v19,-1	/* 0xff bytes, used as non-matching padding.  */

	neg	r5,r3
	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */

	/* Align data and fill bytes not loaded with a non-matching char:
	   lvx ignores the low 4 bits of the address, so it loads the 16B
	   block containing s; lvsr/vperm then shift the string bytes to
	   the front of v0 and fill the remaining positions (string bytes
	   beyond this block, which were not loaded) with 0xff from v19,
	   which can never compare equal to zero.  */
	lvx	v0,0,r3
	lvsr	v1,0,r3
	vperm	v0,v19,v0,v1

	vcmpequb.	v6,v0,v18
	beq	cr6,L(aligned)

	/* A null byte was found in the first 16B: its index, returned by
	   vctzlsbb, is the string length.  */
	vctzlsbb	r3,v6
	blr

	/* Test 64B, 16B at a time.  The 64B vector loop is optimized for
	   longer strings.  Likewise, we check a multiple of 64B here to
	   avoid breaking the alignment calculation below.  */
L(aligned):
	add	r4,r3,r9
	rldicl.	r5,r4,60,62	/* Number of 48B loop iterations needed to
				   reach 64B alignment, i.e. (r4 >> 4) & 3;
				   the record form also tests it for zero.  */
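
	/* For example, if r4 is 0x30 bytes past a 64B boundary,
	   (r4 >> 4) & 3 = 3: after the unconditional 64B check below, three
	   48B passes bring r4 back to a 64B boundary
	   (0x30 + 0x40 + 3 * 0x30 = 0x100).  */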

	lxv	v0+32,0(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail1)

	lxv	v0+32,16(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail2)

	lxv	v0+32,32(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail3)

	lxv	v0+32,48(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail4)
	addi	r4,r4,64

	/* Set r0 to zero ahead of time: it acts as a trivially 16B-aligned
	   address from which lvsl generates the vector byte constant
	   0,1,..,15 during the reduction step below.  */
	li	r0,0

	/* Skip the 48B alignment loop if r4 is already 64B aligned; CR0
	   still holds the result of the rldicl. above.  */
	beq	L(loop_64b)
	mtctr	r5

	/* Test 48B per iteration until 64B aligned.  */
	.p2align 5
L(loop):
	lxv	v0+32,0(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail1)

	lxv	v0+32,16(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail2)

	lxv	v0+32,32(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail3)

	addi	r4,r4,48
	bdnz	L(loop)

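	/* Main loop, unrolled three times: test 64B per iteration.  The
	   unsigned byte minimum of two vectors is zero in a lane iff either
	   input byte is zero, so a single vcmpequb. on the merged minimum
	   detects a null anywhere in the four quadwords.  */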
	.p2align 5
L(loop_64b):
	lxv	v1+32,0(r4)	/* Load 4 quadwords.  */
	lxv	v2+32,16(r4)
	lxv	v3+32,32(r4)
	lxv	v4+32,48(r4)
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v18	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lxv	v1+32,0(r4)	/* Load 4 quadwords.  */
	lxv	v2+32,16(r4)
	lxv	v3+32,32(r4)
	lxv	v4+32,48(r4)
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v18	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lxv	v1+32,0(r4)	/* Load 4 quadwords.  */
	lxv	v2+32,16(r4)
	lxv	v3+32,32(r4)
	lxv	v4+32,48(r4)
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v18	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	beq	cr6,L(loop_64b)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current
	   64-byte block and mark it in its corresponding VR.  */
	vcmpequb	v1,v1,v18
	vcmpequb	v2,v2,v18
	vcmpequb	v3,v3,v18
	vcmpequb	v4,v4,v18

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits
	   into the first halfword.  */
	vspltisb	v10,3
	lvsl	v11,0,r0
	vslb	v10,v11,v10
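
	/* v11 holds the byte constant 0,1,..,15 (lvsl of address 0, set up
	   earlier via r0), and shifting each byte left by 3 yields
	   0,8,16,..,120: one bit index per byte of the source vector, as
	   vbpermq expects.  */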

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4
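
	/* r10 now has one bit per byte of the 64B block: the bits for the
	   lowest-addressed quadword (v1) end up in the least significant
	   halfword and those for the highest (v4) in the most significant
	   one, so the trailing zero count below is the offset of the first
	   null byte within the block.  */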

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

	cnttzd	r0,r10	/* Count trailing zeros before the match.  */
	subf	r5,r3,r4	/* Bytes from the start of the string to this block.  */
	add	r3,r5,r0	/* Compute final length.  */
	blr

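	/* Tails: a null byte was found by one of the 16B checks above.
	   vctzlsbb gives its index within that quadword; add the quadword's
	   offset from r4 and subtract the original string pointer to get
	   the length.  */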
L(tail1):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	subf	r3,r3,r4
	blr

L(tail2):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	addi	r4,r4,16
	subf	r3,r3,r4
	blr

L(tail3):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	addi	r4,r4,32
	subf	r3,r3,r4
	blr

L(tail4):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	addi	r4,r4,48
	subf	r3,r3,r4
	blr

END (STRLEN)

#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif