/* Optimized strspn implementation for Power8.

   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* size_t [r3] strspn (const char *string [r3],
		       const char *needleAccept [r4])  */

/* This takes a novel approach by computing a 256-bit mask whereby
   each set bit implies the byte is "accepted".  P8 vector hardware
   has extremely efficient hardware for selecting bits from a mask.

   One might ask "why not use bpermd for short strings"?  It is
   so slow that its performance about matches the generic PPC64
   variant without any fancy masking, with the added expense of
   making the mask.  That was the first variant of this.  */
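
/* For reference, a minimal C sketch of the same idea (illustrative
   only, not part of the build; the names strspn_sketch and
   accept_mask are hypothetical).  The mask holds one bit per possible
   byte value; the byte-at-a-time scan below is what the vector code
   in this file performs 16 bytes at a time with vbpermq:

     #include <stddef.h>
     #include <stdint.h>

     static size_t
     strspn_sketch (const char *s, const char *accept)
     {
       uint64_t accept_mask[4] = { 0, 0, 0, 0 };
       const unsigned char *p;
       size_t n = 0;

       // Record every byte value that appears in ACCEPT.
       for (p = (const unsigned char *) accept; *p != '\0'; p++)
	 accept_mask[*p / 64] |= (uint64_t) 1 << (*p % 64);

       // Count leading bytes of S whose value is in the mask.
       for (p = (const unsigned char *) s; *p != '\0'; p++, n++)
	 if ((accept_mask[*p / 64] & ((uint64_t) 1 << (*p % 64))) == 0)
	   break;
       return n;
     }

   strcspn is the same with the mask test inverted and the null byte
   kept out of the "accepted" set, which is exactly how INITIAL_MASK
   and UPDATE_MASK below parameterize the shared code.  */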



#include <sysdep.h>

#ifndef USE_AS_STRCSPN
# define USE_AS_STRCSPN 0
# ifndef STRSPN
#  define STRSPN strspn
# endif
# define INITIAL_MASK 0
# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
#else
# ifndef STRSPN
#  define STRSPN strcspn
# endif
# define INITIAL_MASK -1
# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
#endif
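
/* Note on the parameterization above: strspn starts from an all-zero
   mask and ORs in a bit for each accepted byte, so a set bit means
   "keep scanning"; strcspn starts from an all-ones mask and clears
   (andc) the bit for each rejected byte, so the shared scan loop can
   look for set bits in both cases.  */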

/* Simple macro to use VSX instructions in overlapping VR's.  */
#define XXVR(insn, vrt, vra, vrb) \
	insn 32+vrt, 32+vra, 32+vrb

/* ISA 2.07B instructions are not all defined for older binutils.
   Macros are defined below for these newer instructions in order
   to maintain compatibility.  */

/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.  */
#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))

#define VBPERMQ(t,a,b) .long (0x1000054c \
			      | ((t)<<(32-11)) \
			      | ((a)<<(32-16)) \
			      | ((b)<<(32-21)) )
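
/* vbpermq vt,va,vb gathers, for each of the 16 byte-sized indices in
   vb, the correspondingly numbered bit of va into bits 48:63 of vt;
   index values of 128 or above yield a 0 bit.  This is what lets a
   single instruction test 16 string bytes against one 128-bit half
   of the accept mask.  */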

/* This can be updated to power8 once the minimum version of
   binutils supports power8 and the above instructions.  */
	.machine power7
ENTRY_TOCLESS (STRSPN, 4)
	CALL_MCOUNT 2

	/* Generate useful constants for later on.  */
	vspltisb v1, 7
	vspltisb v2, -1
	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
	vspltisb v10, 0
	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */
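	/* v4 now has 0xFFFF in bits 48:63 and zeros elsewhere, the same
	   halfword where VBPERMQ deposits its 16 result bits; it serves
	   both as the compare pattern and, via the vand further down,
	   as the filter for the ignore mask in v11.  */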

	/* Prepare to compute 256b mask.  */
	addi	r4, r4, -1
	li	r5, INITIAL_MASK
	li	r6, INITIAL_MASK
	li	r7, INITIAL_MASK
	li	r8, INITIAL_MASK

#if USE_AS_STRCSPN
	/* Ensure the null character never matches by clearing ISA bit 0
	   (the most-significant bit) in r5, which is the bit vbpermq
	   will later consult for byte value 0.  */
	srdi	r5, r5, 1
#endif

	li	r11, 1
	sldi	r11, r11, 63

	/* Start interleaved Mask computation.
	   This will eventually or 1's into ignored bits from vbpermq.  */
	lvsr	v11, 0, r3
	vspltb	v11, v11, 0	/* Splat shift constant.  */
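	/* v11 now holds in every byte a shift count derived from the
	   low 4 bits of the string address.  It is later turned into
	   a run of 1-bits covering the bytes that precede the string
	   within its aligned quadword, so the first aligned load and
	   compare can ignore those bytes.  */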

	/* Build a 256b mask in r5-r8.  */
	.align	4
L(next_needle):
	lbzu	r9, 1(r4)

	cmpldi	cr0, r9, 0
	cmpldi	cr1, r9, 128

	/* This is a little tricky: srd honors only the low 7 bits of
	   the shift count, and any count of 64 or more yields zero.
	   Flipping bit 6 (the value 64) in the count therefore swaps
	   which of the two shifts produces a nonzero mask, giving an
	   effective 128-bit shift.  */
	xori	r12, r9, 0x40	/* Invert bit 6.  */
	srd	r10, r11, r9	/* Mask for bits 0-63.  */
	srd	r12, r11, r12	/* Mask for bits 64-127.  */
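	/* E.g. for needle byte 'A' (0x41): r10 = r11 >> 0x41 is 0
	   (shift count >= 64), while r12 = r11 >> (0x41 ^ 0x40) sets
	   bit 1, MSB-first, of the mask for values 64-127.  */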

	beq	cr0, L(start_cmp)

	/* Now, or the value into the correct GPR.  */
	bge	cr1, L(needle_gt128)
	UPDATE_MASK (r5, r5, r10)	/* 0 - 63.  */
	UPDATE_MASK (r6, r6, r12)	/* 64 - 127.  */
	b	L(next_needle)

	.align	4
L(needle_gt128):
	UPDATE_MASK (r7, r7, r10)	/* 128 - 191.  */
	UPDATE_MASK (r8, r8, r12)	/* 192 - 255.  */
	b	L(next_needle)


	.align	4
L(start_cmp):
	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
	mr	r0, r3		/* Save r3 for final length computation.  */
	MTVRD (v5, r5)
	MTVRD (v6, r6)
	MTVRD (v7, r7)
	MTVRD (v8, r8)

	/* Continue interleaved mask generation.  */
#ifdef __LITTLE_ENDIAN__
	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
	vsplth	v11, v11, 0	/* Only care about the high 16 bits of v11.  */
#else
	vslw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
	vsplth	v11, v11, 1	/* Only care about the low 16 bits of v11.  */
#endif
	lvx	v0, 0, r3	/* Note, unaligned load ignores lower bits.  */

	/* Do the merging of the bitmask.  */
	XXVR(xxmrghd, v5, v5, v6)
	XXVR(xxmrghd, v6, v7, v8)
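	/* v5 now holds the accept bits for byte values 0-127 (r5:r6),
	   v6 those for values 128-255 (r7:r8).  The vxor with v1
	   (0x80) below flips each string byte's high bit, so the
	   second VBPERMQ indexes the 128-255 half while bytes below
	   128 drop out as 0 there, and vice versa.  */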

	/* Finish mask generation.  */
	vand	v11, v11, v4	/* Throw away bits not in the mask.  */

	/* Compare the first 1-16B, while masking unwanted bytes.  */
	clrrdi	r3, r3, 4	/* Note, counts from qw boundaries.  */
	vxor	v9, v0, v1	/* Swap high bit.  */
	VBPERMQ (v8, v5, v0)
	VBPERMQ (v7, v6, v9)
	vor	v7, v7, v8
	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
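	/* If every byte in this quadword is accepted (or ignored), v7
	   now equals v4.  vcmpequh. records "all halfwords equal" in
	   cr6, so the bnl below exits once some byte is rejected, and
	   the blt in L(vec) keeps looping while all bytes match.  */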
	vcmpequh. v8, v7, v4
	bnl	cr6, L(done)

	addi	r3, r3, 16

	.align	4
L(vec):
	lvx	v0, 0, r3
	addi	r3, r3, 16
	vxor	v9, v0, v1	/* Swap high bit.  */
	VBPERMQ (v8, v5, v0)
	VBPERMQ (v7, v6, v9)
	vor	v7, v7, v8
	vcmpequh. v8, v7, v4
	blt	cr6, L(vec)

	addi	r3, r3, -16
L(done):
	subf	r3, r0, r3
	MFVRD (r10, v7)

#ifdef __LITTLE_ENDIAN__
	addi	r0, r10, 1	/* Count the trailing 1's.  */
	andc	r10, r10, r0
	popcntd	r10, r10
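	/* E.g. r10 = 0x00ff (first 8 bytes accepted): r0 = 0x0100,
	   r10 &~ r0 = 0x00ff, and popcntd gives 8.  A non-trailing 1
	   does not survive: 0x0ff7 becomes 0x0007, giving 3.  */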
#else
	xori	r10, r10, 0xffff	/* Count leading 1's by inverting.  */
	addi	r3, r3, -48	/* Account for the extra leading zeros.  */
	cntlzd	r10, r10
#endif

	add	r3, r3, r10
	blr

END(STRSPN)
libc_hidden_builtin_def (STRSPN)