1 /* Optimized strcmp implementation for PowerPC64/POWER9.
2 Copyright (C) 2016-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18 #ifdef __LITTLE_ENDIAN__
21 /* Implements the function
23 int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
25 The implementation uses unaligned doubleword accesses for the first
26 32 bytes, as in the POWER8 version, and vectorised loops after that. */
28 /* TODO: Change this to actual instructions when minimum binutils is upgraded
29 to 2.27. Macros are defined below for these newer instructions in order
30 to maintain compatibility. */
/* vctzlsbb r,v — hand-encoded as a raw .long because pre-2.27 binutils
   lack the POWER9 mnemonic (see TODO above).  The shift by (32-11)=21
   places r in the RT field (big-endian bits 6-10) and (32-21)=11 places
   v in the VB field (big-endian bits 16-20) of the 0x10010602 opcode
   word.  NOTE(review): presumably "Vector Count Trailing Zero
   Least-Significant Bits Byte" per the macro name — confirm against the
   Power ISA 3.0 encoding tables.  */
31 # define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))
33 # define VEXTUBRX(t,a,b) .long (0x1000070d \
38 # define VCMPNEZB(t,a,b) .long (0x10000507 \
43 /* Get 16 bytes for unaligned case.
44 reg1: Vector to hold next 16 bytes.
45 reg2: Address to read from.
46 reg3: Permute control vector. */
47 # define GET16BYTES(reg1, reg2, reg3) \
49 vperm v8, v2, reg1, reg3; \
50 vcmpequb. v8, v0, v8; \
59 vperm reg1, v9, reg1, reg3;
61 /* TODO: change this to .machine power9 when the minimum required binutils
68 /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
71 (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
73 with PAGE_SIZE being 4096 and ITER_SIZE being 32. */
77 cmpldi cr7, r7, 4096-32
78 bgt cr7, L(pagecross_check)
79 cmpldi cr5, r9, 4096-32
80 bgt cr5, L(pagecross_check)
82 /* For short strings up to 32 bytes, load both s1 and s2 using
83 unaligned dwords and compare. */
89 bne cr0, L(different_nocmpb)
96 bne cr0, L(different_nocmpb)
103 bne cr0, L(different_nocmpb)
110 bne cr0, L(different_nocmpb)
116 /* Now it has checked for first 32 bytes. */
119 lvsr v6, 0, r4 /* Compute mask. */
125 lvsr v10, 0, r7 /* Compute mask. */
127 /* Both s1 and s2 are unaligned. */
128 GET16BYTES(v4, r7, v10)
129 GET16BYTES(v5, r4, v6)
134 /* Align s1 to qw and adjust s2 address. */
144 /* There are 2 loops depending on the input alignment.
145 Each loop gets 16 bytes from s1 and s2 and compares.
146 Loop until a mismatch or null occurs. */
149 GET16BYTES(v5, r4, v6)
153 bne cr6, L(different)
156 GET16BYTES(v5, r4, v6)
160 bne cr6, L(different)
163 GET16BYTES(v5, r4, v6)
167 bne cr6, L(different)
170 GET16BYTES(v5, r4, v6)
184 bne cr6, L(different)
191 bne cr6, L(different)
198 bne cr6, L(different)
207 /* Calculate and return the difference. */
224 rldicl r10, r10, 0, 56
235 bge cr7, L(pagecross)
238 /* If an unaligned 16-byte read would cross a 4K page boundary, use
239 a simple byte-by-byte comparison until the page alignment for s1
248 /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
249 and if *s1 is '\0'. */
256 bne cr7, L(pagecross_ne)
257 beq cr5, L(pagecross_nullfound)
258 bdnz L(pagecross_loop)
265 L(pagecross_retdiff):
271 L(pagecross_nullfound):
273 b L(pagecross_retdiff)
275 libc_hidden_builtin_def (strcmp)
277 #include <sysdeps/powerpc/powerpc64/power8/strcmp.S>