]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc32/strlen.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc32 / strlen.S
CommitLineData
9a0a462c 1/* Optimized strlen implementation for PowerPC.
b168057a 2 Copyright (C) 1997-2015 Free Software Foundation, Inc.
9a0a462c
UD
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
41bdb6e2
AJ
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9a0a462c
UD
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2 13 Lesser General Public License for more details.
9a0a462c 14
41bdb6e2 15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
9a0a462c
UD
18
19#include <sysdep.h>
20
21/* The algorithm here uses the following techniques:
22
23 1) Given a word 'x', we can test to see if it contains any 0 bytes
24 by subtracting 0x01010101, and seeing if any of the high bits of each
25 byte changed from 0 to 1. This works because the least significant
26 0 byte must have had no incoming carry (otherwise it's not the least
27 significant), so it is 0x00 - 0x01 == 0xff. For all other
28 byte values, either they have the high bit set initially, or when
29 1 is subtracted you get a value in the range 0x00-0x7f, none of which
30 have their high bit set. The expression here is
31 (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
db9b4570
AM
32 there were no 0x00 bytes in the word. You get 0x80 in bytes that
33 match, but possibly false 0x80 matches in the next more significant
34 byte to a true match due to carries. For little-endian this is
35 of no consequence since the least significant match is the one
36 we're interested in, but big-endian needs method 2 to find which
37 byte matches.
9a0a462c
UD
38
39 2) Given a word 'x', we can test to see _which_ byte was zero by
40 calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
41 This produces 0x80 in each byte that was zero, and 0x00 in all
42 the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each
43 byte, and the '| x' part ensures that bytes with the high bit set
44 produce 0x00. The addition will carry into the high bit of each byte
45 iff that byte had one of its low 7 bits set. We can then just see
46 which was the most significant bit set and divide by 8 to find how
47 many to add to the index.
48 This is from the book 'The PowerPC Compiler Writer's Guide',
49 by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
50
51 We deal with strings not aligned to a word boundary by taking the
52 first word and ensuring that bytes not part of the string
53 are treated as nonzero. To allow for memory latency, we unroll the
54 loop a few times, being careful to ensure that we do not read ahead
55 across cache line boundaries.
56
57 Questions to answer:
58 1) How long are strings passed to strlen? If they're often really long,
59 we should probably use cache management instructions and/or unroll the
60 loop more. If they're often quite short, it might be better to use
61 fact (2) in the inner loop than have to recalculate it.
62 2) How popular are bytes with the high bit set? If they are very rare,
63 on some processors it might be useful to use the simpler expression
64 ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
65 ALU), but this fails when any character has its high bit set. */
66
67/* Some notes on register usage: Under the SVR4 ABI, we can use registers
68 0 and 3 through 12 (so long as we don't call any procedures) without
69 saving them. We can also use registers 14 through 31 if we save them.
70 We can't use r1 (it's the stack pointer), r2 nor r13 because the user
71 program may expect them to hold their usual value if we get sent
72 a signal. Integer parameters are passed in r3 through r10.
73 We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
74 them, the others we must save. */
75
1d280d9f
GM
76/* int [r3] strlen (char *s [r3]) */
77
b5510883 78ENTRY (strlen)
1d280d9f 79
db9b4570 80#define rTMP4 r0
1d280d9f
GM
81#define rRTN r3 /* incoming STR arg, outgoing result */
82#define rSTR r4 /* current string position */
83#define rPADN r5 /* number of padding bits we prepend to the
84 string to make it start at a word boundary */
85#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */
86#define r7F7F r7 /* constant 0x7f7f7f7f */
87#define rWORD1 r8 /* current string word */
88#define rWORD2 r9 /* next string word */
89#define rMASK r9 /* mask for first string word */
db9b4570
AM
90#define rTMP1 r10
91#define rTMP2 r11
92#define rTMP3 r12
1d280d9f 93
fa87f403 94
1d280d9f
GM
95 clrrwi rSTR, rRTN, 2
96 lis r7F7F, 0x7f7f
97 rlwinm rPADN, rRTN, 3, 27, 28
98 lwz rWORD1, 0(rSTR)
99 li rMASK, -1
100 addi r7F7F, r7F7F, 0x7f7f
db9b4570
AM
101/* We use method (2) on the first two words, because rFEFE isn't
102 required which reduces setup overhead. Also gives a faster return
103 for small strings on big-endian due to needing to recalculate with
104 method (2) anyway. */
105#ifdef __LITTLE_ENDIAN__
106 slw rMASK, rMASK, rPADN
107#else
1d280d9f 108 srw rMASK, rMASK, rPADN
db9b4570 109#endif
1d280d9f
GM
110 and rTMP1, r7F7F, rWORD1
111 or rTMP2, r7F7F, rWORD1
112 add rTMP1, rTMP1, r7F7F
db9b4570
AM
113 nor rTMP3, rTMP2, rTMP1
114 and. rTMP3, rTMP3, rMASK
1d280d9f
GM
115 mtcrf 0x01, rRTN
116 bne L(done0)
117 lis rFEFE, -0x101
118 addi rFEFE, rFEFE, -0x101
9a0a462c 119/* Are we now aligned to a doubleword boundary? */
1d280d9f 120 bt 29, L(loop)
9a0a462c
UD
121
122/* Handle second word of pair. */
db9b4570 123/* Perhaps use method (1) here for little-endian, saving one instruction? */
1d280d9f
GM
124 lwzu rWORD1, 4(rSTR)
125 and rTMP1, r7F7F, rWORD1
126 or rTMP2, r7F7F, rWORD1
127 add rTMP1, rTMP1, r7F7F
db9b4570 128 nor. rTMP3, rTMP2, rTMP1
1d280d9f 129 bne L(done0)
9a0a462c
UD
130
131/* The loop. */
132
133L(loop):
1d280d9f
GM
134 lwz rWORD1, 4(rSTR)
135 lwzu rWORD2, 8(rSTR)
136 add rTMP1, rFEFE, rWORD1
137 nor rTMP2, r7F7F, rWORD1
138 and. rTMP1, rTMP1, rTMP2
139 add rTMP3, rFEFE, rWORD2
140 nor rTMP4, r7F7F, rWORD2
141 bne L(done1)
db9b4570 142 and. rTMP3, rTMP3, rTMP4
1d280d9f
GM
143 beq L(loop)
144
db9b4570 145#ifndef __LITTLE_ENDIAN__
1d280d9f
GM
146 and rTMP1, r7F7F, rWORD2
147 add rTMP1, rTMP1, r7F7F
db9b4570 148 andc rTMP3, rTMP4, rTMP1
1d280d9f 149 b L(done0)
9a0a462c
UD
150
151L(done1):
1d280d9f
GM
152 and rTMP1, r7F7F, rWORD1
153 subi rSTR, rSTR, 4
154 add rTMP1, rTMP1, r7F7F
db9b4570 155 andc rTMP3, rTMP2, rTMP1
9a0a462c 156
1d280d9f 157/* When we get to here, rSTR points to the first word in the string that
db9b4570
AM
158 contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
159 and 0x00 otherwise. */
9a0a462c 160L(done0):
db9b4570 161 cntlzw rTMP3, rTMP3
1d280d9f
GM
162 subf rTMP1, rRTN, rSTR
163 srwi rTMP3, rTMP3, 3
164 add rRTN, rTMP1, rTMP3
9a0a462c 165 blr
db9b4570
AM
166#else
167
168L(done0):
169 addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
170 andc rTMP1, rTMP1, rTMP3
171 cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */
172 subf rTMP3, rRTN, rSTR
173 subfic rTMP1, rTMP1, 32-7
174 srwi rTMP1, rTMP1, 3
175 add rRTN, rTMP1, rTMP3
176 blr
177
178L(done1):
179 addi rTMP3, rTMP1, -1
180 andc rTMP3, rTMP3, rTMP1
181 cntlzw rTMP3, rTMP3
182 subf rTMP1, rRTN, rSTR
183 subfic rTMP3, rTMP3, 32-7-32
184 srawi rTMP3, rTMP3, 3
185 add rRTN, rTMP1, rTMP3
186 blr
187#endif
188
b5510883 189END (strlen)
85dd1003 190libc_hidden_builtin_def (strlen)