powerpc: Fix performance issues of strcmp power10
author    Amrita H S <amritahs@linux.vnet.ibm.com>
Fri, 15 Dec 2023 16:48:17 +0000 (11:48 -0500)
committer Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Fri, 15 Dec 2023 22:42:40 +0000 (16:42 -0600)
The current implementation of strcmp for power10 has
performance regressions for multiple combinations of
small sizes and alignments.

Most of these performance issues are fixed by this
patch. The compare loop is unrolled, and page crosses
within the unrolled loop are handled.
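
A minimal C sketch of the page-cross check the patch performs before
entering the unrolled vector loop (PAGE_SIZE, VEC_SIZE and
near_page_end are illustrative names, not identifiers from the patch):
the fast path is taken only when the pointer is more than 16 bytes
away from the end of its 4K page, mirroring the
"andi. rX,rY,4095; cmpldi rX,4096-16; bgt" sequence in the new code.

    /* Illustrative sketch only; not part of the patch.  */
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  16   /* Bytes read by one vector load (lxv).  */

    /* Return nonzero if a VEC_SIZE-byte load from P could cross into
       the next page, i.e. P is within VEC_SIZE bytes of a 4K boundary.  */
    static int
    near_page_end (const char *p)
    {
      return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
    }

    /* The fast path then compares four 16B chunks (64 bytes) per
       iteration; when near_page_end (s1) || near_page_end (s2) holds,
       the code falls back to the careful, length-limited page-cross
       path instead.  */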

Thanks to Paul E. Murphy for helping fix the
performance issues.

Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
Co-Authored-By: Paul E. Murphy <murphyp@linux.ibm.com>
Reviewed-by: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
sysdeps/powerpc/powerpc64/le/power10/strcmp.S

index a3c1adad539978e0dbdfbb360066b3f4af591326..3406f4f26a214270f69b2093777524c944d76588 100644
@@ -62,7 +62,7 @@
        lxvl      32+v5,reg2,r0;         \
        add       reg1,reg1,len_reg;     \
        add       reg2,reg2,len_reg;     \
-       vcmpnezb. v7,v4,v5;              \
+       vcmpnezb  v7,v4,v5;              \
        vctzlsbb  r6,v7;                 \
        cmpld     cr7,r6,len_reg;        \
        blt       cr7,L(different);      \
 
        .machine  power9
 ENTRY_TOCLESS (STRCMP, 4)
-       li       r11,16
-       /* eq bit of cr1 used as swap status flag to indicate if
-       source pointers were swapped.  */
-       crclr    4*cr1+eq
-       vspltisb v19,-1
-       andi.    r7,r3,15
-       sub      r7,r11,r7      /* r7(nalign1) = 16 - (str1 & 15).  */
-       andi.    r9,r4,15
-       sub      r5,r11,r9      /* r5(nalign2) = 16 - (str2 & 15).  */
-       cmpld    cr7,r7,r5
-       beq      cr7,L(same_aligned)
-       blt      cr7,L(nalign1_min)
-       /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
-       pointer which is closer to the next 16B boundary so that only
-       one CHECK_N_BYTES is needed before entering the loop below.  */
-       mr       r8,r4
-       mr       r4,r3
-       mr       r3,r8
-       mr       r12,r7
-       mr       r7,r5
-       mr       r5,r12
-       crset    4*cr1+eq       /* Set bit on swapping source pointers.  */
+       andi.   r7,r3,4095
+       andi.   r8,r4,4095
+       cmpldi  cr0,r7,4096-16
+       cmpldi  cr1,r8,4096-16
+       bgt     cr0,L(crosses)
+       bgt     cr1,L(crosses)
+       COMPARE_16(v4,v5,0)
 
-       .p2align 5
+L(crosses):
+       andi.   r7,r3,15
+       subfic  r7,r7,16        /* r7(nalign1) = 16 - (str1 & 15).  */
+       andi.   r9,r4,15
+       subfic  r5,r9,16        /* r5(nalign2) = 16 - (str2 & 15).  */
+       cmpld   cr7,r7,r5
+       beq     cr7,L(same_aligned)
+       blt     cr7,L(nalign1_min)
+
+       /* nalign2 is minimum and s2 pointer is aligned.  */
+       CHECK_N_BYTES(r3,r4,r5)
+       /* Are we on the 64B hunk which crosses a page?  */
+       andi.   r10,r3,63       /* Determine offset into 64B hunk.  */
+       andi.   r8,r3,15        /* The offset into the 16B hunk.  */
+       neg     r7,r3
+       andi.   r9,r7,15        /* Number of bytes after a 16B cross.  */
+       rlwinm. r7,r7,26,0x3F   /* ((r3-4096))>>6&63.  */
+       beq     L(compare_64_pagecross)
+       mtctr   r7
+       b       L(compare_64B_unaligned)
+
+       /* nalign1 is minimum and s1 pointer is aligned.  */
 L(nalign1_min):
        CHECK_N_BYTES(r3,r4,r7)
+       /* Are we on the 64B hunk which crosses a page?  */
+       andi.   r10,r4,63       /* Determine offset into 64B hunk.  */
+       andi.   r8,r4,15        /* The offset into the 16B hunk.  */
+       neg     r7,r4
+       andi.   r9,r7,15        /* Number of bytes after a 16B cross.  */
+       rlwinm. r7,r7,26,0x3F   /* ((r4-4096))>>6&63.  */
+       beq     L(compare_64_pagecross)
+       mtctr   r7
 
        .p2align 5
-L(s1_aligned):
-       /* r9 and r5 is number of bytes to be read after and before
-        page boundary correspondingly.  */
-       sub     r5,r5,r7
-       subfic  r9,r5,16
-       /* Now let r7 hold the count of quadwords which can be
-       checked without crossing a page boundary. quadword offset is
-       (str2>>4)&0xFF.  */
-       rlwinm  r7,r4,28,0xFF
-       /* Below check is required only for first iteration. For second
-       iteration and beyond, the new loop counter is always 255.  */
-       cmpldi  r7,255
-       beq     L(L3)
-       /* Get the initial loop count by 255-((str2>>4)&0xFF).  */
-       subfic  r11,r7,255
+L(compare_64B_unaligned):
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       COMPARE_16(v4,v5,32)
+       COMPARE_16(v4,v5,48)
+       addi    r3,r3,64
+       addi    r4,r4,64
+       bdnz    L(compare_64B_unaligned)
 
-       .p2align 5
-L(L1):
+       /* Cross the page boundary of s2, carefully. Only for first
+       iteration we have to get the count of 64B blocks to be checked.
+       From second iteration and beyond, loop counter is always 63.  */
+L(compare_64_pagecross):
+       li      r11, 63
        mtctr   r11
-
-       .p2align 5
-L(L2):
-       COMPARE_16(v4,v5,0)     /* Load 16B blocks using lxv.  */
+       cmpldi  r10,16
+       ble     L(cross_4)
+       cmpldi  r10,32
+       ble     L(cross_3)
+       cmpldi  r10,48
+       ble     L(cross_2)
+L(cross_1):
+       CHECK_N_BYTES(r3,r4,r9)
+       CHECK_N_BYTES(r3,r4,r8)
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       COMPARE_16(v4,v5,32)
+       addi    r3,r3,48
+       addi    r4,r4,48
+       b       L(compare_64B_unaligned)
+L(cross_2):
+       COMPARE_16(v4,v5,0)
        addi    r3,r3,16
        addi    r4,r4,16
-       bdnz    L(L2)
-       /* Cross the page boundary of s2, carefully.  */
-
-       .p2align 5
-L(L3):
-       CHECK_N_BYTES(r3,r4,r5)
        CHECK_N_BYTES(r3,r4,r9)
-       li      r11,255         /* Load the new loop counter.  */
-       b       L(L1)
+       CHECK_N_BYTES(r3,r4,r8)
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       addi    r3,r3,32
+       addi    r4,r4,32
+       b       L(compare_64B_unaligned)
+L(cross_3):
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       addi    r3,r3,32
+       addi    r4,r4,32
+       CHECK_N_BYTES(r3,r4,r9)
+       CHECK_N_BYTES(r3,r4,r8)
+       COMPARE_16(v4,v5,0)
+       addi    r3,r3,16
+       addi    r4,r4,16
+       b       L(compare_64B_unaligned)
+L(cross_4):
+       COMPARE_16(v4,v5,0)
+       COMPARE_16(v4,v5,16)
+       COMPARE_16(v4,v5,32)
+       addi    r3,r3,48
+       addi    r4,r4,48
+       CHECK_N_BYTES(r3,r4,r9)
+       CHECK_N_BYTES(r3,r4,r8)
+       b       L(compare_64B_unaligned)
 
-       .p2align 5
 L(same_aligned):
        CHECK_N_BYTES(r3,r4,r7)
         /* Align s1 to 32B and adjust s2 address.
@@ -168,18 +208,7 @@ L(16B_aligned_loop):
 
        /* Calculate and return the difference.  */
 L(different):
-       vctzlsbb r6,v7
-       vextubrx r5,r6,v4
-       vextubrx r4,r6,v5
-       bt       4*cr1+eq,L(swapped)
-       subf     r3,r4,r5
-       blr
-
-       /* If src pointers were swapped, then swap the
-       indices and calculate the return value.  */
-L(swapped):
-       subf     r3,r5,r4
-       blr
+       TAIL(v4,v5)
 
        .p2align 5
 L(32B_aligned_loop):