sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S

/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
      to avoid branches.
   2. Use overlapping compare to avoid branch.
   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
      bytes for wmemcmp.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
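
/* With VEC_SIZE == 32 (defined below) this means: sizes below 32 bytes
   take the scalar and XMM paths under L(less_vec), sizes up to
   8 * VEC_SIZE == 256 bytes are handled with unrolled YMM compares, and
   larger sizes fall into the aligned 4 * VEC_SIZE loop.  */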

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
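/* vpmovmskb on a YMM register yields a 32-bit mask, one bit per byte, so
   VEC_MASK evaluates to 0xffffffff (GNU as computes the shift in 64-bit
   arithmetic, so it does not overflow).  Subtracting VEC_MASK from such a
   mask therefore gives zero exactly when every byte position compared
   equal.  */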

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */

        .section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
        shl $2, %rdx
# endif
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)

        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(last_vec)

        VPCMPEQ %ymm0, %ymm0, %ymm0
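        /* ymm0 is now all ones (a register always compares equal to
           itself); the vptest checks below use it to test whether the
           accumulated compare results are all ones.  */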
        /* More than 2 * VEC.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)
        cmpq $(VEC_SIZE * 4), %rdx
        jb L(last_4x_vec)

        /* From 4 * VEC to 8 * VEC, inclusively.  */
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm5

        vptest %ymm0, %ymm5
        jnc L(4x_vec_end)
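        /* vptest sets CF when (~%ymm5 & %ymm0) is zero, i.e. when ymm5 is
           all ones, meaning all four compares matched.  jnc therefore
           branches to the mismatch handler only when some byte (or dword
           for wmemcmp) differed.  */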

        leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
        leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
        vpand %ymm2, %ymm1, %ymm5

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vpand %ymm3, %ymm5, %ymm5

        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
        vpand %ymm4, %ymm5, %ymm5

        vptest %ymm0, %ymm5
        jnc L(4x_vec_end)
        xorl %eax, %eax
        VZEROUPPER
        ret

        .p2align 4
L(last_2x_vec):
        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

L(last_vec):
        /* Use overlapping loads to avoid branches.  */
        leaq -VEC_SIZE(%rdi, %rdx), %rdi
        leaq -VEC_SIZE(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)
        VZEROUPPER
        ret

        .p2align 4
L(first_vec):
        /* A byte or int32 is different within 16 or 32 bytes.  */
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl (%rdi, %rcx), %edx
        cmpl (%rsi, %rcx), %edx
L(wmemcmp_return):
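        /* Turn the signed comparison above into -1 or 1: setl/negl gives
           -1 when the first wchar_t is smaller and 0 otherwise, and
           orl $1 maps 0 to 1 while leaving -1 unchanged.  */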
        setl %al
        negl %eax
        orl $1, %eax
# else
        movzbl (%rdi, %rcx), %eax
        movzbl (%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(4):
        xorl %eax, %eax
        movl (%rdi), %edx
        cmpl (%rsi), %edx
        jne L(wmemcmp_return)
        ret
# else
        .p2align 4
L(between_4_7):
        /* Load as big endian with overlapping movbe to avoid branches.  */
        movbe (%rdi), %eax
        movbe (%rsi), %ecx
        shlq $32, %rax
        shlq $32, %rcx
        movbe -4(%rdi, %rdx), %edi
        movbe -4(%rsi, %rdx), %esi
        orq %rdi, %rax
        orq %rsi, %rcx
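        /* Each side now holds its first and last four bytes packed
           big-endian, earliest bytes most significant, so the unsigned
           64-bit comparison below orders them like a byte-wise memcmp;
           sub/sbb/or then produce -1, 0 or 1.  */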
        subq %rcx, %rax
        je L(exit)
        sbbl %eax, %eax
        orl $1, %eax
        ret

        .p2align 4
L(exit):
        ret

        .p2align 4
L(between_2_3):
        /* Load as big endian to avoid branches.  */
        movzwl (%rdi), %eax
        movzwl (%rsi), %ecx
        shll $8, %eax
        shll $8, %ecx
        bswap %eax
        bswap %ecx
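        /* After the shift and bswap the first byte sits in bits 23:16 and
           the second in bits 15:8; the trailing byte loaded below fills
           bits 7:0, so the values compare in memcmp order.  */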
        movb -1(%rdi, %rdx), %al
        movb -1(%rsi, %rdx), %cl
        /* Subtraction is okay because the upper 8 bits are zero.  */
        subl %ecx, %eax
        ret

        .p2align 4
L(1):
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        subl %ecx, %eax
        ret
# endif

        .p2align 4
L(zero):
        xorl %eax, %eax
        ret

        .p2align 4
L(less_vec):
# ifdef USE_AS_WMEMCMP
        /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
        cmpb $4, %dl
        je L(4)
        jb L(zero)
# else
        cmpb $1, %dl
        je L(1)
        jb L(zero)
        cmpb $4, %dl
        jb L(between_2_3)
        cmpb $8, %dl
        jb L(between_4_7)
# endif
        cmpb $16, %dl
        jae L(between_16_31)
        /* It is between 8 and 15 bytes.  */
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
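        /* vmovq zeroes the upper eight bytes of each xmm register, so
           those lanes always compare equal and only the eight loaded
           bytes can clear bits in the 0xffff mask checked below.  */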
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)
        /* Use overlapping loads to avoid branches.  */
        leaq -8(%rdi, %rdx), %rdi
        leaq -8(%rsi, %rdx), %rsi
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)
        ret

        .p2align 4
L(between_16_31):
        /* From 16 to 31 bytes.  No branch when size == 16.  */
        vmovdqu (%rsi), %xmm2
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)

        /* Use overlapping loads to avoid branches.  */
        leaq -16(%rdi, %rdx), %rdi
        leaq -16(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %xmm2
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(first_vec)
        ret

        .p2align 4
L(more_8x_vec):
        /* More than 8 * VEC.  Check the first VEC.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        /* Align the first memory area for aligned loads in the loop.
           Compute how much the first memory area is misaligned.  */
        movq %rdi, %rcx
        andl $(VEC_SIZE - 1), %ecx
        /* Get the negative of offset for alignment.  */
        subq $VEC_SIZE, %rcx
        /* Adjust the second memory area.  */
        subq %rcx, %rsi
        /* Adjust the first memory area which should be aligned now.  */
        subq %rcx, %rdi
        /* Adjust length.  */
        addq %rcx, %rdx
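        /* %rcx is negative (misalignment - VEC_SIZE), so both pointers
           advance by at most VEC_SIZE bytes to the next 32-byte boundary
           of %rdi, and the length shrinks by the same amount.  The
           skipped bytes were already covered by the first-VEC check
           above.  */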

L(loop_4x_vec):
        /* Compare 4 * VEC at a time forward.  */
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
        vpand %ymm2, %ymm1, %ymm5

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vpand %ymm3, %ymm5, %ymm5

        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
        vpand %ymm4, %ymm5, %ymm5

        vptest %ymm0, %ymm5
        jnc L(4x_vec_end)

        addq $(VEC_SIZE * 4), %rdi
        addq $(VEC_SIZE * 4), %rsi

        subq $(VEC_SIZE * 4), %rdx
        cmpq $(VEC_SIZE * 4), %rdx
        jae L(loop_4x_vec)

        /* Less than 4 * VEC.  */
        cmpq $VEC_SIZE, %rdx
        jbe L(last_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(last_2x_vec)

L(last_4x_vec):
        /* From 2 * VEC to 4 * VEC.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        addq $VEC_SIZE, %rdi
        addq $VEC_SIZE, %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        /* Use overlapping loads to avoid branches.  */
        leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
        leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)

        addq $VEC_SIZE, %rdi
        addq $VEC_SIZE, %rsi
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)
        VZEROUPPER
        ret

        .p2align 4
L(4x_vec_end):
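        /* Check the four compare results in order to locate the first
           differing VEC.  The first mask can reuse L(first_vec) directly
           because %rdi and %rsi still point at its base; the later ones
           need the VEC_SIZE, 2 * VEC_SIZE or 3 * VEC_SIZE offset added
           when reading back the differing element.  */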
        vpmovmskb %ymm1, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec)
        vpmovmskb %ymm2, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec_x1)
        vpmovmskb %ymm3, %eax
        subl $VEC_MASK, %eax
        jnz L(first_vec_x2)
        vpmovmskb %ymm4, %eax
        subl $VEC_MASK, %eax
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
        jmp L(wmemcmp_return)
# else
        movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(first_vec_x1):
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl VEC_SIZE(%rdi, %rcx), %edx
        cmpl VEC_SIZE(%rsi, %rcx), %edx
        jmp L(wmemcmp_return)
# else
        movzbl VEC_SIZE(%rdi, %rcx), %eax
        movzbl VEC_SIZE(%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(first_vec_x2):
        tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
        jmp L(wmemcmp_return)
# else
        movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
        sub %edx, %eax
# endif
        VZEROUPPER
        ret
END (MEMCMP)
#endif