/* memrchr optimized with AVX2.
   Copyright (C) 2017-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
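
/* Overview: __memrchr_avx2 (s, c, n) returns a pointer to the last
   byte equal to C within the first N bytes of S, or NULL if there is
   no such byte.  Roughly, it implements the following scalar logic
   (a minimal C sketch of the semantics, for orientation only):

	void *
	memrchr (const void *s, int c, size_t n)
	{
	  const unsigned char *p = (const unsigned char *) s;
	  while (n--)
	    if (p[n] == (unsigned char) c)
	      return (void *) (p + n);
	  return NULL;
	}

   The AVX2 code below vectorizes this: vpcmpeqb compares 32
   (VEC_SIZE) bytes against C at once, vpmovmskb turns the per-byte
   result into a bit mask, and bsrl picks the highest set bit, i.e.
   the match with the largest offset.  The buffer is scanned from its
   end towards its start, 4 * VEC_SIZE bytes per loop iteration once
   the pointer has been aligned.  */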
# include <sysdep.h>

# define VZEROUPPER	vzeroupper

/* Number of bytes in an AVX2 (ymm) vector.  */
# define VEC_SIZE	32

	.section .text.avx,"ax",@progbits
ENTRY (__memrchr_avx2)
	/* Broadcast CHAR (the low byte of the second argument, in
	   %esi) to all 32 bytes of YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0
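
	/* %rdi = S, %rdx = N (RDX_LP/RDI_LP name the full 64-bit
	   registers on LP64 and the 32-bit ones on x32).  If N is at
	   most VEC_SIZE, the whole search fits in one vector; handle
	   that tail separately.  */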
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)
	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
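
	/* If the last vector had no match, move %rdi back to the base
	   of the next 4 * VEC_SIZE block to search and test whether it
	   is already VEC_SIZE-aligned (low address bits in %ecx).  */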
	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx

	/* Align data for aligned loads in the loop.  */
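
	/* If 4 * VEC_SIZE bytes or fewer remain after alignment, skip
	   the unrolled checks and handle the remainder directly.  */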
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3

	vpcmpeqb (%rdi), %ymm0, %ymm4
	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
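
	/* Main loop: %rdi is now 4 * VEC_SIZE aligned, so all four
	   loads below can use aligned vmovdqa.  Each iteration steps
	   backwards by 4 * VEC_SIZE; once at most 4 * VEC_SIZE bytes
	   remain the tail is handled in L(last_4x_vec_or_less).  */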
	/* Compare 4 * VEC at a time, moving backwards through the
	   buffer.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4
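
	/* OR the four per-byte compare results together so that a
	   single vpmovmskb/test can tell whether there is a match
	   anywhere in this 4 * VEC_SIZE block.  */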
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	/* There is a match somewhere in the block.  Since the last
	   occurrence is wanted, check the highest-addressed vector
	   (ymm4, loaded from offset 3 * VEC_SIZE) first and work down
	   to the lowest (ymm1).  */
	vpmovmskb %ymm4, %eax

	vpmovmskb %ymm3, %eax

	vpmovmskb %ymm2, %eax

	vpmovmskb %ymm1, %eax
L(last_4x_vec_or_less):
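	/* At most 4 * VEC_SIZE bytes are left.  %rdx was biased by
	   -(VEC_SIZE * 4) before the jump here, so add that back to
	   recover the number of remaining bytes in %edx.  */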
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax

	subq	$(VEC_SIZE * 4), %rdx
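
	/* At most 2 * VEC_SIZE bytes remain: check the last vector
	   first, then the one below it, validating the match offset
	   against the remaining length.  */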
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	subq	$(VEC_SIZE * 2), %rdx

	addl	$(VEC_SIZE * 2), %eax
	addl	$(VEC_SIZE * 2), %eax

	addl	$(VEC_SIZE * 3), %eax
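
	/* The *_check return paths below also verify that the match
	   offset actually falls inside the requested N bytes before
	   computing the return pointer.  */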
L(last_vec_x1_check):
	subq	$(VEC_SIZE * 3), %rdx
L(last_vec_x3_check):
	addl	$(VEC_SIZE * 3), %eax
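
	/* Length of at most VEC_SIZE with %rdi already VEC_SIZE
	   aligned: one vector compare plus a mask of (1 << length) - 1
	   to ignore bytes past the end.  */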
L(last_vec_or_less_aligned):
	movl	%edx, %ecx
	vpcmpeqb (%rdi), %ymm0, %ymm1
	/* Build (1 << length) - 1; the 64-bit shift supports
	   length == VEC_SIZE (rdx << 32).  */
	movl	$1, %edx
	salq	%cl, %rdx
	subq	$1, %rdx
	vpmovmskb %ymm1, %eax
	/* Remove the trailing bytes.  */
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)
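
	/* Length of at most VEC_SIZE but %rdi is not aligned: round
	   %rdi down to a VEC_SIZE boundary.  If the misalignment plus
	   the length spills into a second vector, two aligned checks
	   are needed.  */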
	andq	$-VEC_SIZE, %rdi

	ja	L(last_vec_2x_aligned)
	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
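
	/* The tail spans two aligned vectors: check the upper vector
	   with a mask that drops bytes past the end, then the lower
	   vector with a shift that drops bytes before the start.  */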
L(last_vec_2x_aligned):
	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */