]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/strlen.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / strlen.S
1 /* Optimized strlen implementation for PowerPC64.
2 Copyright (C) 1997-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* The algorithm here uses the following techniques:
22
23 1) Given a word 'x', we can test to see if it contains any 0 bytes
24 by subtracting 0x01010101, and seeing if any of the high bits of each
25 byte changed from 0 to 1. This works because the least significant
26 0 byte must have had no incoming carry (otherwise it's not the least
27 significant), so it is 0x00 - 0x01 == 0xff. For all other
28 byte values, either they have the high bit set initially, or when
29 1 is subtracted you get a value in the range 0x00-0x7f, none of which
30 have their high bit set. The expression here is
31 (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
32 there were no 0x00 bytes in the word. You get 0x80 in bytes that
33 match, but possibly false 0x80 matches in the next more significant
34 byte to a true match due to carries. For little-endian this is
35 of no consequence since the least significant match is the one
36 we're interested in, but big-endian needs method 2 to find which
37 byte matches.
38
39 2) Given a word 'x', we can test to see _which_ byte was zero by
40 calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
41 This produces 0x80 in each byte that was zero, and 0x00 in all
42 the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each
43 byte, and the '| x' part ensures that bytes with the high bit set
44 produce 0x00. The addition will carry into the high bit of each byte
45 iff that byte had one of its low 7 bits set. We can then just see
46 which was the most significant bit set and divide by 8 to find how
47 many to add to the index.
48 This is from the book 'The PowerPC Compiler Writer's Guide',
49 by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
50
51 We deal with strings not aligned to a word boundary by taking the
52 first word and ensuring that bytes not part of the string
53 are treated as nonzero. To allow for memory latency, we unroll the
54 loop a few times, being careful to ensure that we do not read ahead
55 across cache line boundaries.
56
57 Questions to answer:
58 1) How long are strings passed to strlen? If they're often really long,
59 we should probably use cache management instructions and/or unroll the
60 loop more. If they're often quite short, it might be better to use
61 fact (2) in the inner loop than have to recalculate it.
62 2) How popular are bytes with the high bit set? If they are very rare,
63 on some processors it might be useful to use the simpler expression
64 ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
65 ALU), but this fails when any character has its high bit set.
66
67 Answer:
68 1) Added a Data Cache Block Touch early to prefetch the first 128
69 byte cache line. Adding dcbt instructions to the loop would not be
70 effective since most strings will be shorter than the cache line. */
71
72 /* Some notes on register usage: Under the SVR4 ABI, we can use registers
73 0 and 3 through 12 (so long as we don't call any procedures) without
74 saving them. We can also use registers 14 through 31 if we save them.
75 We can't use r1 (it's the stack pointer), r2 nor r13 because the user
76 program may expect them to hold their usual value if we get sent
77 a signal. Integer parameters are passed in r3 through r10.
78 We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
79 them, the others we must save. */
80
81 /* int [r3] strlen (char *s [r3]) */
82
83 ENTRY (strlen)
84 CALL_MCOUNT 1
85
86 #define rTMP4 r0
87 #define rRTN r3 /* incoming STR arg, outgoing result */
88 #define rSTR r4 /* current string position */
89 #define rPADN r5 /* number of padding bits we prepend to the
90 string to make it start at a word boundary */
91 #define rFEFE r6 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
92 #define r7F7F r7 /* constant 0x7f7f7f7f7f7f7f7f */
93 #define rWORD1 r8 /* current string doubleword */
94 #define rWORD2 r9 /* next string doubleword */
95 #define rMASK r9 /* mask for first string doubleword */
96 #define rTMP1 r10
97 #define rTMP2 r11
98 #define rTMP3 r12
99
100 dcbt 0,rRTN
101 clrrdi rSTR, rRTN, 3
102 lis r7F7F, 0x7f7f
103 rlwinm rPADN, rRTN, 3, 26, 28
104 ld rWORD1, 0(rSTR)
105 addi r7F7F, r7F7F, 0x7f7f
106 li rMASK, -1
107 insrdi r7F7F, r7F7F, 32, 0
108 /* We use method (2) on the first two doublewords, because rFEFE isn't
109 required which reduces setup overhead. Also gives a faster return
110 for small strings on big-endian due to needing to recalculate with
111 method (2) anyway. */
112 #ifdef __LITTLE_ENDIAN__
113 sld rMASK, rMASK, rPADN
114 #else
115 srd rMASK, rMASK, rPADN
116 #endif
117 and rTMP1, r7F7F, rWORD1
118 or rTMP2, r7F7F, rWORD1
119 lis rFEFE, -0x101
120 add rTMP1, rTMP1, r7F7F
121 addi rFEFE, rFEFE, -0x101
122 nor rTMP3, rTMP2, rTMP1
123 and. rTMP3, rTMP3, rMASK
124 mtcrf 0x01, rRTN
125 bne L(done0)
126 sldi rTMP1, rFEFE, 32
127 add rFEFE, rFEFE, rTMP1
128 /* Are we now aligned to a doubleword boundary? */
129 bt 28, L(loop)
130
131 /* Handle second doubleword of pair. */
132 /* Perhaps use method (1) here for little-endian, saving one instruction? */
133 ldu rWORD1, 8(rSTR)
134 and rTMP1, r7F7F, rWORD1
135 or rTMP2, r7F7F, rWORD1
136 add rTMP1, rTMP1, r7F7F
137 nor. rTMP3, rTMP2, rTMP1
138 bne L(done0)
139
140 /* The loop. */
141
142 L(loop):
143 ld rWORD1, 8(rSTR)
144 ldu rWORD2, 16(rSTR)
145 add rTMP1, rFEFE, rWORD1
146 nor rTMP2, r7F7F, rWORD1
147 and. rTMP1, rTMP1, rTMP2
148 add rTMP3, rFEFE, rWORD2
149 nor rTMP4, r7F7F, rWORD2
150 bne L(done1)
151 and. rTMP3, rTMP3, rTMP4
152 beq L(loop)
153
154 #ifndef __LITTLE_ENDIAN__
155 and rTMP1, r7F7F, rWORD2
156 add rTMP1, rTMP1, r7F7F
157 andc rTMP3, rTMP4, rTMP1
158 b L(done0)
159
160 L(done1):
161 and rTMP1, r7F7F, rWORD1
162 subi rSTR, rSTR, 8
163 add rTMP1, rTMP1, r7F7F
164 andc rTMP3, rTMP2, rTMP1
165
166 /* When we get to here, rSTR points to the first doubleword in the string that
167 contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
168 otherwise. */
169 L(done0):
170 cntlzd rTMP3, rTMP3
171 subf rTMP1, rRTN, rSTR
172 srdi rTMP3, rTMP3, 3
173 add rRTN, rTMP1, rTMP3
174 blr
175 #else
176
177 L(done0):
178 addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
179 andc rTMP1, rTMP1, rTMP3
180 cntlzd rTMP1, rTMP1 /* Count bits not in the mask. */
181 subf rTMP3, rRTN, rSTR
182 subfic rTMP1, rTMP1, 64-7
183 srdi rTMP1, rTMP1, 3
184 add rRTN, rTMP1, rTMP3
185 blr
186
187 L(done1):
188 addi rTMP3, rTMP1, -1
189 andc rTMP3, rTMP3, rTMP1
190 cntlzd rTMP3, rTMP3
191 subf rTMP1, rRTN, rSTR
192 subfic rTMP3, rTMP3, 64-7-64
193 sradi rTMP3, rTMP3, 3
194 add rRTN, rTMP1, rTMP3
195 blr
196 #endif
197
198 END (strlen)
199 libc_hidden_builtin_def (strlen)