/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef __LITTLE_ENDIAN__
#include <sysdep.h>

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses for the first
   32 bytes, as in the POWER8 version, and vectorised loops after that.  */

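/* For reference, the semantics being implemented are those of ISO C
   strcmp.  A minimal, unoptimised C sketch of the contract, not part of
   the build:

     int
     strcmp (const char *s1, const char *s2)
     {
       while (*s1 != '\0' && *s1 == *s2)
	 s1++, s2++;
       return (unsigned char) *s1 - (unsigned char) *s2;
     }

   The assembly below performs this byte-wise comparison 8 bytes at a
   time for the first 32 bytes and 16 bytes at a time afterwards.  */
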
/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
# define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

# define VEXTUBRX(t,a,b) .long (0x1000070d \
	| ((t)<<(32-11)) \
	| ((a)<<(32-16)) \
	| ((b)<<(32-21)) )

# define VCMPNEZB(t,a,b) .long (0x10000507 \
	| ((t)<<(32-11)) \
	| ((a)<<(32-16)) \
	| ((b)<<(32-21)) )

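/* These macros hand-encode the POWER9 VX-form instructions vctzlsbb,
   vextubrx and vcmpnezb: the base .long value is the opcode word with
   all register fields zero, and the shifts place the target (bits 6-10),
   A (bits 11-15) and B (bits 16-20) register numbers in IBM bit order.
   As a worked example (an illustration, not part of the build),
   VCMPNEZB(7, 5, 4) expands to

     .long (0x10000507 | (7 << 21) | (5 << 16) | (4 << 11))

   i.e. .long 0x10e52507, the encoding of the record form
   "vcmpnezb. v7, v5, v4", which sets CR6 for the beq/bne cr6 tests
   used throughout this file.  */
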
/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
# define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;

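/* How GET16BYTES reads 16 unaligned bytes without faulting: lvx fetches
   the aligned quadword containing reg2, and the first vperm (with v2 set
   to all ones) replaces the bytes that lie before reg2 with 0xff so they
   can never look like a null.  Only if that fragment contains no null
   byte is the next quadword loaded; the aligned load there is safe,
   since the string must then extend past it.  A hedged C sketch of the
   idea (vec_t and the helper names are hypothetical, not part of this
   file):

     static inline vec_t load16_unaligned (const unsigned char *p)
     {
       uintptr_t base = (uintptr_t) p & ~(uintptr_t) 15;
       uintptr_t off = (uintptr_t) p & 15;
       vec_t head = aligned_load (base);	// lvx reg1, 0, reg2
       vec_t tail;
       if (!has_null_from (head, off))		// vperm + vcmpequb.
	 tail = aligned_load (base + 16);	// lvx v9, 0, r6
       else
	 tail = zeros ();			// vspltisb v9, 0
       return merge_from (head, tail, off);	// final vperm
     }
  */
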
/* TODO: change this to .machine power9 when the minimum required binutils
   allows it.  */

	.machine  power7
EALIGN (strcmp, 4, 0)
	li	r0, 0

	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */

	rldicl	r7, r3, 0, 52
	rldicl	r9, r4, 0, 52
	cmpldi	cr7, r7, 4096-32
	bgt	cr7, L(pagecross_check)
	cmpldi	cr5, r9, 4096-32
	bgt	cr5, L(pagecross_check)

	/* For short strings up to 32 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */
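	/* The cmpb/orc. idiom below finds a mismatch or a null in one
	   pass: cmpb sets each result byte to 0xff where the
	   corresponding input bytes are equal, else to 0x00.  A hedged C
	   sketch of one 8-byte step, with dword_cmpb standing in for the
	   cmpb instruction:

	     uint64_t nulls  = dword_cmpb (w1, 0);	// 0xff where byte == 0
	     uint64_t equals = dword_cmpb (w1, w2);	// 0xff where bytes match
	     if ((nulls | ~equals) != 0)		// orc. r9, r12, r11
	       goto different_nocmpb;

	   r9 is zero only while every byte pair matches and none is
	   null.  */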
	ld	r8, 0(r3)
	ld	r10, 0(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 8(r3)
	ld	r10, 8(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 16(r3)
	ld	r10, 16(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 24(r3)
	ld	r10, 24(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	addi	r7, r3, 32
	addi	r4, r4, 32

L(align):
	/* The first 32 bytes have been checked at this point.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	lvsr	v6, 0, r4	/* Compute mask.  */
	or	r5, r4, r7
	andi.	r5, r5, 0xF
	beq	cr0, L(aligned)
	andi.	r5, r7, 0xF
	beq	cr0, L(s1_align)
	lvsr	v10, 0, r7	/* Compute mask.  */

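	/* lvsr builds the permute control vector consumed by GET16BYTES:
	   for an address with sh = addr & 15, it yields the byte indices
	   { 16-sh, 17-sh, ..., 31-sh }, so a vperm of two consecutive
	   aligned quadwords produces the 16 bytes starting at the
	   unaligned address.  (This describes the classic lvsr/vperm
	   realignment idiom; the exact index ordering here follows the
	   little-endian layout this file is compiled for.)  */
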
	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v4, r7, v10)
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to a quadword and adjust the s2 address.  */
	.align  4
L(match):
	clrldi	r6, r7, 60
	subfic	r5, r6, 16
	add	r7, r7, r5
	add	r4, r4, r5
	andi.	r5, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2 and compares.
	   Loop until a mismatch or null occurs.  */
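	/* In L(s1_align) only s2 still needs the GET16BYTES realignment,
	   since s1 was just advanced to a 16-byte boundary; in L(aligned)
	   both pointers are aligned and plain lvx loads suffice.  Both
	   loops are unrolled four times to reduce branch overhead.  */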
L(s1_align):
	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(s1_align)
	b	L(different)

	.align  4
L(aligned):
	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(aligned)

	/* Calculate and return the difference.  */
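	/* VCMPNEZB left 0xff in v7 at every byte position where the
	   inputs differ or one of them is null.  VCTZLSBB counts the
	   low-index bytes that are clear, giving the index of the first
	   such position in string order, and VEXTUBRX extracts the byte
	   at that index from each source vector.  A hedged C sketch of
	   this ending (helper names are illustrative only):

	     int i  = first_marked_byte_index (v7);	// VCTZLSBB
	     int b1 = byte_at (v4, i);			// VEXTUBRX
	     int b2 = byte_at (v5, i);			// VEXTUBRX
	     return b1 - b2;
	  */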
L(different):
	VCTZLSBB(r6, v7)
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

	.align  4
L(different_nocmpb):
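	/* r9 holds the cmpb/orc. result: 0xff bytes mark the first
	   mismatch or null.  neg/and isolates its lowest set bit, cntlzd
	   locates it, and 63 - cntlzd gives the shift that brings the
	   interesting byte into the low 8 bits of both words, where
	   rldicl masks it and subf forms the result.  A worked example
	   (illustrative only): for r9 = 0x0000ff0000000000, the isolated
	   bit is 1 << 40, cntlzd returns 23, the shift is 63 - 23 = 40,
	   and byte 5 of the current doubleword, i.e. offset 5 of the
	   8-byte block in little-endian string order, is compared.  */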
	neg	r3, r9
	and	r9, r9, r3
	cntlzd	r9, r9
	subfic	r9, r9, 63
	srd	r3, r8, r9
	srd	r10, r10, r9
	rldicl	r10, r10, 0, 56
	rldicl	r3, r3, 0, 56
	subf	r3, r10, r3
	extsw	r3, r3
	blr

	.align  4
L(pagecross_check):
	subfic	r9, r9, 4096
	subfic	r7, r7, 4096
	cmpld	cr7, r7, r9
	bge	cr7, L(pagecross)
	mr	r7, r9

	/* If an unaligned load would read across a 4K page boundary, use
	   a simple byte-by-byte comparison until the page boundary is
	   passed, then continue with the vector loops at L(align).  */
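	/* Single-byte loads can never fault on a partially mapped
	   object, so the scalar loop below is always safe; r7 was set
	   above to the larger of the two distances to the next page
	   boundary, which bounds the number of byte iterations before
	   the vector code can resume.  */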
L(pagecross):
	add	r7, r3, r7
	subf	r9, r3, r7
	mtctr	r9

	.align  4
L(pagecross_loop):
	/* Load a byte from s1 and s2, then check whether *s1 equals *s2
	   and whether *s1 is '\0'.  */
	lbz	r9, 0(r3)
	lbz	r10, 0(r4)
	addi	r3, r3, 1
	addi	r4, r4, 1
	cmplw	cr7, r9, r10
	cmpdi	cr5, r9, r0
	bne	cr7, L(pagecross_ne)
	beq	cr5, L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align)

	.align  4
L(pagecross_ne):
	extsw	r3, r9
	mr	r9, r10
L(pagecross_retdiff):
	subf	r9, r9, r3
	extsw	r3, r9
	blr

	.align  4
L(pagecross_nullfound):
	li	r3, 0
	b	L(pagecross_retdiff)
END (strcmp)
libc_hidden_builtin_def (strcmp)
#else
#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
#endif