sysdeps/i386/i586/strchr.S

   1 /* Find character CH in a NUL terminated string.
   2    Highly optimized version for ix85, x>=5.
   3    Copyright (C) 1995-2017 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include <sysdep.h>
  22 #include "asm-syntax.h"
  23
  24 /* This version is especially optimized for the i586 (and following?)
  25    processors.  This is mainly done by using the two pipelines.  The
  26    version optimized for i486 is weak in this aspect because to get
  27    as much parallelism we have to execute some *more* instructions.
  28
  29    The code below is structured to reflect the pairing of the instructions
  30    as *I think* it is.  I have no processor data book to verify this.
  31    If you find something you think is incorrect let me know.  */
  32
  33
  34 /* The magic value which is used throughout in the whole code.  */
  35 #define magic 0xfefefeff
  36
  37 #define PARMS   4+16    /* space for 4 saved regs */
  38 #define RTN     PARMS
  39 #define STR     RTN
  40 #define CHR     STR+4
  41
  42         .text
  43 ENTRY (strchr)
  44
  45         pushl %edi              /* Save callee-safe registers.  */
  46         cfi_adjust_cfa_offset (-4)
  47         pushl %esi
  48         cfi_adjust_cfa_offset (-4)
  49
  50         pushl %ebx
  51         cfi_adjust_cfa_offset (-4)
  52         pushl %ebp
  53         cfi_adjust_cfa_offset (-4)
  54
  55         movl STR(%esp), %eax
  56         movl CHR(%esp), %edx
  57
  58         movl %eax, %edi         /* duplicate string pointer for later */
  59         cfi_rel_offset (edi, 12)
  60         xorl %ecx, %ecx         /* clear %ecx */
  61
  62         /* At the moment %edx contains C.  What we need for the
  63            algorithm is C in all bytes of the dword.  Avoid
  64            operations on 16 bit words because these require an
  65            prefix byte (and one more cycle).  */
  66         movb %dl, %dh           /* now it is 0|0|c|c */
  67         movb %dl, %cl           /* we construct the lower half in %ecx */
  68
  69         shll $16, %edx          /* now %edx is c|c|0|0 */
  70         movb %cl, %ch           /* now %ecx is 0|0|c|c */
  71
  72         orl %ecx, %edx          /* and finally c|c|c|c */
  73         andl $3, %edi           /* mask alignment bits */
  74
  75         jz L(11)                /* alignment is 0 => start loop */
  76
  77         movb %dl, %cl           /* 0 is needed below */
  78         jp L(0)                 /* exactly two bits set */
  79
  80         xorb (%eax), %cl        /* is byte the one we are looking for? */
  81         jz L(out)               /* yes => return pointer */
  82
  83         xorb %dl, %cl           /* load single byte and test for NUL */
  84         je L(3)                 /* yes => return NULL */
  85
  86         movb 1(%eax), %cl       /* load single byte */
  87         incl %eax
  88
  89         cmpb %cl, %dl           /* is byte == C? */
  90         je L(out)               /* aligned => return pointer */
  91
  92         cmpb $0, %cl            /* is byte NUL? */
  93         je L(3)                 /* yes => return NULL */
  94
  95         incl %eax
  96         decl %edi
  97
  98         jne L(11)
  99
 100 L(0):   movb (%eax), %cl        /* load single byte */
 101
 102         cmpb %cl, %dl           /* is byte == C? */
 103         je L(out)               /* aligned => return pointer */
 104
 105         cmpb $0, %cl            /* is byte NUL? */
 106         je L(3)                 /* yes => return NULL */
 107
 108         incl %eax               /* increment pointer */
 109
 110         cfi_rel_offset (esi, 8)
 111         cfi_rel_offset (ebx, 4)
 112         cfi_rel_offset (ebp, 0)
 113
 114         /* The following code is the preparation for the loop.  The
 115            four instruction up to `L1' will not be executed in the loop
 116            because the same code is found at the end of the loop, but
 117            there it is executed in parallel with other instructions.  */
 118 L(11):  movl (%eax), %ecx
 119         movl $magic, %ebp
 120
 121         movl $magic, %edi
 122         addl %ecx, %ebp
 123
 124         /* The main loop: it looks complex and indeed it is.  I would
 125            love to say `it was hard to write, so it should he hard to
 126            read' but I will give some more hints.  To fully understand
 127            this code you should first take a look at the i486 version.
 128            The basic algorithm is the same, but here the code organized
 129            in a way which permits to use both pipelines all the time.
 130
 131            I tried to make it a bit more understandable by indenting
 132            the code according to stage in the algorithm.  It goes as
 133            follows:
 134                 check for 0 in 1st word
 135                         check for C in 1st word
 136                                         check for 0 in 2nd word
 137                                                 check for C in 2nd word
 138                 check for 0 in 3rd word
 139                         check for C in 3rd word
 140                                         check for 0 in 4th word
 141                                                 check for C in 4th word
 142
 143            Please note that doing the test for NUL before the test for
 144            C allows us to overlap the test for 0 in the next word with
 145            the test for C.  */
 146
 147 L(1):   xorl %ecx, %ebp                 /* (word^magic) */
 148         addl %ecx, %edi                 /* add magic word */
 149
 150         leal 4(%eax), %eax              /* increment pointer */
 151         jnc L(4)                        /* previous addl caused overflow? */
 152
 153                 movl %ecx, %ebx         /* duplicate original word */
 154         orl $magic, %ebp                /* (word^magic)|magic */
 155
 156         addl $1, %ebp                   /* (word^magic)|magic == 0xffffffff? */
 157         jne L(4)                                /* yes => we found word with NUL */
 158
 159                 movl $magic, %esi       /* load magic value */
 160                 xorl %edx, %ebx         /* clear words which are C */
 161
 162                                         movl (%eax), %ecx
 163                 addl %ebx, %esi         /* (word+magic) */
 164
 165                                         movl $magic, %edi
 166                 jnc L(5)                /* previous addl caused overflow? */
 167
 168                                         movl %edi, %ebp
 169                 xorl %ebx, %esi         /* (word+magic)^word */
 170
 171                                         addl %ecx, %ebp
 172                 orl $magic, %esi        /* ((word+magic)^word)|magic */
 173
 174                 addl $1, %esi           /* ((word+magic)^word)|magic==0xf..f?*/
 175                 jne L(5)                /* yes => we found word with C */
 176
 177                                         xorl %ecx, %ebp
 178                                         addl %ecx, %edi
 179
 180                                         leal 4(%eax), %eax
 181                                         jnc L(4)
 182
 183                                                 movl %ecx, %ebx
 184                                         orl $magic, %ebp
 185
 186                                         addl $1, %ebp
 187                                         jne L(4)
 188
 189                                                 movl $magic, %esi
 190                                                 xorl %edx, %ebx
 191
 192         movl (%eax), %ecx
 193                                                 addl %ebx, %esi
 194
 195         movl $magic, %edi
 196                                                 jnc L(5)
 197
 198         movl %edi, %ebp
 199                                                 xorl %ebx, %esi
 200
 201         addl %ecx, %ebp
 202                                                 orl $magic, %esi
 203
 204                                                 addl $1, %esi
 205                                                 jne L(5)
 206
 207         xorl %ecx, %ebp
 208         addl %ecx, %edi
 209
 210         leal 4(%eax), %eax
 211         jnc L(4)
 212
 213                 movl %ecx, %ebx
 214         orl $magic, %ebp
 215
 216         addl $1, %ebp
 217         jne L(4)
 218
 219                 movl $magic, %esi
 220                 xorl %edx, %ebx
 221
 222                                         movl (%eax), %ecx
 223                 addl %ebx, %esi
 224
 225                                         movl $magic, %edi
 226                 jnc L(5)
 227
 228                                         movl %edi, %ebp
 229                 xorl %ebx, %esi
 230
 231                                         addl %ecx, %ebp
 232                 orl $magic, %esi
 233
 234                 addl $1, %esi
 235                 jne L(5)
 236
 237                                         xorl %ecx, %ebp
 238                                         addl %ecx, %edi
 239
 240                                         leal 4(%eax), %eax
 241                                         jnc L(4)
 242
 243                                                 movl %ecx, %ebx
 244                                         orl $magic, %ebp
 245
 246                                         addl $1, %ebp
 247                                         jne L(4)
 248
 249                                                 movl $magic, %esi
 250                                                 xorl %edx, %ebx
 251
 252         movl (%eax), %ecx
 253                                                 addl %ebx, %esi
 254
 255         movl $magic, %edi
 256                                                 jnc L(5)
 257
 258         movl %edi, %ebp
 259                                                 xorl %ebx, %esi
 260
 261         addl %ecx, %ebp
 262                                                 orl $magic, %esi
 263
 264                                                 addl $1, %esi
 265
 266                                                 je L(1)
 267
 268         /* We know there is no NUL byte but a C byte in the word.
 269            %ebx contains NUL in this particular byte.  */
 270 L(5):   subl $4, %eax           /* adjust pointer */
 271         testb %bl, %bl          /* first byte == C? */
 272
 273         jz L(out)               /* yes => return pointer */
 274
 275         incl %eax               /* increment pointer */
 276         testb %bh, %bh          /* second byte == C? */
 277
 278         jz L(out)               /* yes => return pointer */
 279
 280         shrl $16, %ebx          /* make upper bytes accessible */
 281         incl %eax               /* increment pointer */
 282
 283         cmp $0, %bl             /* third byte == C */
 284         je L(out)               /* yes => return pointer */
 285
 286         incl %eax               /* increment pointer */
 287
 288 L(out): popl %ebp               /* restore saved registers */
 289         cfi_adjust_cfa_offset (-4)
 290         cfi_restore (ebp)
 291         popl %ebx
 292         cfi_adjust_cfa_offset (-4)
 293         cfi_restore (ebx)
 294
 295         popl %esi
 296         cfi_adjust_cfa_offset (-4)
 297         cfi_restore (esi)
 298         popl %edi
 299         cfi_adjust_cfa_offset (-4)
 300         cfi_restore (edi)
 301
 302         ret
 303
 304         cfi_adjust_cfa_offset (16)
 305         cfi_rel_offset (edi, 12)
 306         cfi_rel_offset (esi, 8)
 307         cfi_rel_offset (ebx, 4)
 308         cfi_rel_offset (ebp, 0)
 309         /* We know there is a NUL byte in the word.  But we have to test
 310            whether there is an C byte before it in the word.  */
 311 L(4):   subl $4, %eax           /* adjust pointer */
 312         cmpb %dl, %cl           /* first byte == C? */
 313
 314         je L(out)               /* yes => return pointer */
 315
 316         cmpb $0, %cl            /* first byte == NUL? */
 317         je L(3)                 /* yes => return NULL */
 318
 319         incl %eax               /* increment pointer */
 320
 321         cmpb %dl, %ch           /* second byte == C? */
 322         je L(out)               /* yes => return pointer */
 323
 324         cmpb $0, %ch            /* second byte == NUL? */
 325         je L(3)                 /* yes => return NULL */
 326
 327         shrl $16, %ecx          /* make upper bytes accessible */
 328         incl %eax               /* increment pointer */
 329
 330         cmpb %dl, %cl           /* third byte == C? */
 331         je L(out)               /* yes => return pointer */
 332
 333         cmpb $0, %cl            /* third byte == NUL? */
 334         je L(3)                 /* yes => return NULL */
 335
 336         incl %eax               /* increment pointer */
 337
 338         /* The test four the fourth byte is necessary!  */
 339         cmpb %dl, %ch           /* fourth byte == C? */
 340         je L(out)               /* yes => return pointer */
 341
 342 L(3):   xorl %eax, %eax
 343         jmp L(out)
 344 END (strchr)
 345
 346 #undef index
 347 weak_alias (strchr, index)
 348 libc_hidden_builtin_def (strchr)