sysdeps/x86_64/multiarch/memcmp-evex-movbe.S

   1 /* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   2    Copyright (C) 2021-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (4)
  22
  23
  24 /* memcmp/wmemcmp is implemented as:
  25    1. Use ymm vector compares when possible. The only case where
  26       vector compares is not possible for when size < CHAR_PER_VEC
  27       and loading from either s1 or s2 would cause a page cross.
  28    2. For size from 2 to 7 bytes on page cross, load as big endian
  29       with movbe and bswap to avoid branches.
  30    3. Use xmm vector compare when size >= 4 bytes for memcmp or
  31       size >= 8 bytes for wmemcmp.
  32    4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
  33       to check for early mismatches. Only do this if its guaranteed the
  34       work is not wasted.
  35    5. If size is 8 * VEC_SIZE or less, unroll the loop.
  36    6. Compare 4 * VEC_SIZE at a time with the aligned first memory
  37       area.
  38    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
  39    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
  40    9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
  41
  42 When possible the implementation tries to optimize for frontend in the
  43 following ways:
  44 Throughput:
  45     1. All code sections that fit are able to run optimally out of the
  46        LSD.
  47     2. All code sections that fit are able to run optimally out of the
  48        DSB
  49     3. Basic blocks are contained in minimum number of fetch blocks
  50        necessary.
  51
  52 Latency:
  53     1. Logically connected basic blocks are put in the same
  54        cache-line.
  55     2. Logically connected basic blocks that do not fit in the same
  56        cache-line are put in adjacent lines. This can get beneficial
  57        L2 spatial prefetching and L1 next-line prefetching.  */
  58
  59 # include <sysdep.h>
  60
  61 # ifndef MEMCMP
  62 #  define MEMCMP        __memcmp_evex_movbe
  63 # endif
  64
  65 # ifndef VEC_SIZE
  66 #  include "x86-evex256-vecs.h"
  67 # endif
  68
  69 # ifdef USE_AS_WMEMCMP
  70 #  define VMOVU_MASK    vmovdqu32
  71 #  define CHAR_SIZE     4
  72 #  define VPCMP vpcmpd
  73 #  define VPCMPEQ       vpcmpeqd
  74 #  define VPTEST        vptestmd
  75
  76 #  define USE_WIDE_CHAR
  77 # else
  78 #  define VMOVU_MASK    vmovdqu8
  79 #  define CHAR_SIZE     1
  80 #  define VPCMP vpcmpub
  81 #  define VPCMPEQ       vpcmpeqb
  82 #  define VPTEST        vptestmb
  83 # endif
  84
  85 # include "reg-macros.h"
  86
  87 # define PAGE_SIZE      4096
  88 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
  89
  90
  91 /* Warning!
  92            wmemcmp has to use SIGNED comparison for elements.
  93            memcmp has to use UNSIGNED comparison for elements.
  94 */
  95
  96         .section SECTION(.text), "ax", @progbits
  97 /* Cache align memcmp entry. This allows for much more thorough
  98    frontend optimization.  */
  99 ENTRY_P2ALIGN (MEMCMP, 6)
 100 # ifdef __ILP32__
 101         /* Clear the upper 32 bits.  */
 102         movl    %edx, %edx
 103 # endif
 104         cmp     $CHAR_PER_VEC, %RDX_LP
 105         /* Fall through for [0, VEC_SIZE] as its the hottest.  */
 106         ja      L(more_1x_vec)
 107
 108         /* Create mask of bytes that are guaranteed to be valid because
 109            of length (edx). Using masked movs allows us to skip checks
 110            for page crosses/zero size.  */
 111         mov     $-1, %VRAX
 112         bzhi    %VRDX, %VRAX, %VRAX
 113         /* NB: A `jz` might be useful here. Page-faults that are
 114            invalidated by predicate execution (the evex mask) can be
 115            very slow.  The expectation is this is not the norm so and
 116            "most" code will not regularly call 'memcmp' with length = 0
 117            and memory that is not wired up.  */
 118         KMOV    %VRAX, %k2
 119
 120
 121
 122         /* Safe to load full ymm with mask.  */
 123         VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
 124         /* Slightly different method for VEC_SIZE == 64 to save a bit of
 125            code size. This allows us to fit L(return_vec_0) entirely in
 126            the first cache line.  */
 127 # if VEC_SIZE == 64
 128         VPCMPEQ (%rdi), %VMM(2), %k1{%k2}
 129         KMOV    %k1, %VRCX
 130         sub     %VRCX, %VRAX
 131 # else
 132         VPCMP   $4, (%rdi), %VMM(2), %k1{%k2}
 133         KMOV    %k1, %VRAX
 134         test    %VRAX, %VRAX
 135 # endif
 136         jnz     L(return_vec_0)
 137         ret
 138
 139         .p2align 4,, 11
 140 L(return_vec_0):
 141         bsf     %VRAX, %VRAX
 142 # ifdef USE_AS_WMEMCMP
 143         movl    (%rdi, %rax, CHAR_SIZE), %ecx
 144         xorl    %edx, %edx
 145         cmpl    (%rsi, %rax, CHAR_SIZE), %ecx
 146         /* NB: no partial register stall here because xorl zero idiom
 147            above.  */
 148         setg    %dl
 149         leal    -1(%rdx, %rdx), %eax
 150 # else
 151         movzbl  (%rsi, %rax), %ecx
 152 #  if VEC_SIZE == 64
 153         movb    (%rdi, %rax), %al
 154 #  else
 155         movzbl  (%rdi, %rax), %eax
 156 #  endif
 157         subl    %ecx, %eax
 158 # endif
 159         ret
 160
 161         .p2align 4,, 11
 162 L(more_1x_vec):
 163         /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 164         VMOVU   (%rsi), %VMM(1)
 165         /* Use compare not equals to directly check for mismatch.  */
 166         VPCMP   $4, (%rdi), %VMM(1), %k1
 167         KMOV    %k1, %VRAX
 168         /* NB: eax must be destination register if going to
 169            L(return_vec_[0,2]). For L(return_vec_3) destination
 170            register must be ecx.  */
 171         test    %VRAX, %VRAX
 172         jnz     L(return_vec_0)
 173
 174         cmpq    $(CHAR_PER_VEC * 2), %rdx
 175         jbe     L(last_1x_vec)
 176
 177         /* Check second VEC no matter what.  */
 178         VMOVU   VEC_SIZE(%rsi), %VMM(2)
 179         VPCMP   $4, VEC_SIZE(%rdi), %VMM(2), %k1
 180         KMOV    %k1, %VRAX
 181         test    %VRAX, %VRAX
 182         jnz     L(return_vec_1)
 183
 184         /* Less than 4 * VEC.  */
 185         cmpq    $(CHAR_PER_VEC * 4), %rdx
 186         jbe     L(last_2x_vec)
 187
 188         /* Check third and fourth VEC no matter what.  */
 189         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
 190         VPCMP   $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
 191         KMOV    %k1, %VRAX
 192         test    %VRAX, %VRAX
 193         jnz     L(return_vec_2)
 194
 195         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
 196         VPCMP   $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
 197         KMOV    %k1, %VRCX
 198         test    %VRCX, %VRCX
 199         jnz     L(return_vec_3)
 200
 201         /* Go to 4x VEC loop.  */
 202         cmpq    $(CHAR_PER_VEC * 8), %rdx
 203         ja      L(more_8x_vec)
 204
 205         /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 206            branches.  */
 207
 208         /* Load first two VEC from s2 before adjusting addresses.  */
 209         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
 210         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
 211         leaq    -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
 212         leaq    -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 213
 214         /* Wait to load from s1 until addressed adjust due to
 215            unlamination of microfusion with complex address mode.  */
 216
 217         /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 218            will have some 1s.  */
 219         vpxorq  (%rdi), %VMM(1), %VMM(1)
 220         vpxorq  (VEC_SIZE)(%rdi), %VMM(2), %VMM(2)
 221
 222         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
 223         vpxorq  (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
 224
 225         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
 226         /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
 227            oring with VEC(1). Result is stored in VEC(4).  */
 228         vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
 229
 230         /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
 231         vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
 232
 233         /* Test VEC(4) against itself. Store any CHAR mismatches in k1.
 234          */
 235         VPTEST  %VMM(4), %VMM(4), %k1
 236         /* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 237         KMOV    %k1, %VRCX
 238         test    %VRCX, %VRCX
 239         jnz     L(return_vec_0_1_2_3)
 240         /* NB: eax must be zero to reach here.  */
 241         ret
 242
 243
 244         .p2align 4,, 9
 245 L(8x_end_return_vec_0_1_2_3):
 246         movq    %rdx, %rdi
 247 L(8x_return_vec_0_1_2_3):
 248         /* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64.  */
 249 # if VEC_SIZE == 64
 250         KMOV    %k1, %VRCX
 251 # endif
 252         addq    %rdi, %rsi
 253 L(return_vec_0_1_2_3):
 254         VPTEST  %VMM(1), %VMM(1), %k0
 255         KMOV    %k0, %VRAX
 256         test    %VRAX, %VRAX
 257         jnz     L(return_vec_0)
 258
 259         VPTEST  %VMM(2), %VMM(2), %k0
 260         KMOV    %k0, %VRAX
 261         test    %VRAX, %VRAX
 262         jnz     L(return_vec_1)
 263
 264         VPTEST  %VMM(3), %VMM(3), %k0
 265         KMOV    %k0, %VRAX
 266         test    %VRAX, %VRAX
 267         jnz     L(return_vec_2)
 268         .p2align 4,, 2
 269 L(return_vec_3):
 270         /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
 271            fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
 272            line.  */
 273         bsf     %VRCX, %VRCX
 274 # ifdef USE_AS_WMEMCMP
 275         movl    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 276         xorl    %edx, %edx
 277         cmpl    (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 278         setg    %dl
 279         leal    -1(%rdx, %rdx), %eax
 280 # else
 281         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
 282         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
 283         subl    %ecx, %eax
 284 # endif
 285         ret
 286
 287
 288         .p2align 4,, 8
 289 L(return_vec_1):
 290         /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
 291            fetch block.  */
 292         bsf     %VRAX, %VRAX
 293 # ifdef USE_AS_WMEMCMP
 294         movl    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 295         xorl    %edx, %edx
 296         cmpl    VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
 297         setg    %dl
 298         leal    -1(%rdx, %rdx), %eax
 299 # else
 300         movzbl  VEC_SIZE(%rsi, %rax), %ecx
 301         movzbl  VEC_SIZE(%rdi, %rax), %eax
 302         subl    %ecx, %eax
 303 # endif
 304         ret
 305
 306         .p2align 4,, 7
 307 L(return_vec_2):
 308         /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
 309            fetch block.  */
 310         bsf     %VRAX, %VRAX
 311 # ifdef USE_AS_WMEMCMP
 312         movl    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 313         xorl    %edx, %edx
 314         cmpl    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 315         setg    %dl
 316         leal    -1(%rdx, %rdx), %eax
 317 # else
 318         movzbl  (VEC_SIZE * 2)(%rsi, %rax), %ecx
 319         movzbl  (VEC_SIZE * 2)(%rdi, %rax), %eax
 320         subl    %ecx, %eax
 321 # endif
 322         ret
 323
 324         .p2align 4,, 8
 325 L(more_8x_vec):
 326         /* Set end of s1 in rdx.  */
 327         leaq    -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
 328         /* rsi stores s2 - s1. This allows loop to only update one
 329            pointer.  */
 330         subq    %rdi, %rsi
 331         /* Align s1 pointer.  */
 332         andq    $-VEC_SIZE, %rdi
 333         /* Adjust because first 4x vec where check already.  */
 334         subq    $-(VEC_SIZE * 4), %rdi
 335
 336         .p2align 4
 337 L(loop_4x_vec):
 338         VMOVU   (%rsi, %rdi), %VMM(1)
 339         vpxorq  (%rdi), %VMM(1), %VMM(1)
 340         VMOVU   VEC_SIZE(%rsi, %rdi), %VMM(2)
 341         vpxorq  VEC_SIZE(%rdi), %VMM(2), %VMM(2)
 342         VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
 343         vpxorq  (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
 344         VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
 345         vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
 346         vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
 347         VPTEST  %VMM(4), %VMM(4), %k1
 348         /* If VEC_SIZE == 64 just branch with KTEST. We have free port0
 349            space and it allows the loop to fit in 2x cache lines
 350            instead of 3.  */
 351 # if VEC_SIZE == 64
 352         KTEST   %k1, %k1
 353 # else
 354         KMOV    %k1, %VRCX
 355         test    %VRCX, %VRCX
 356 # endif
 357         jnz     L(8x_return_vec_0_1_2_3)
 358         subq    $-(VEC_SIZE * 4), %rdi
 359         cmpq    %rdx, %rdi
 360         jb      L(loop_4x_vec)
 361         subq    %rdx, %rdi
 362         /* rdi has 4 * VEC_SIZE - remaining length.  */
 363         cmpl    $(VEC_SIZE * 3), %edi
 364         jge     L(8x_last_1x_vec)
 365         /* Load regardless of branch.  */
 366         VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
 367
 368         /* Separate logic as we can only use testb for VEC_SIZE == 64.
 369          */
 370 # if VEC_SIZE == 64
 371         testb   %dil, %dil
 372         js      L(8x_last_2x_vec)
 373 # else
 374         cmpl    $(VEC_SIZE * 2), %edi
 375         jge     L(8x_last_2x_vec)
 376 # endif
 377
 378         vpxorq  (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)
 379
 380         VMOVU   (%rsi, %rdx), %VMM(1)
 381         vpxorq  (%rdx), %VMM(1), %VMM(1)
 382
 383         VMOVU   VEC_SIZE(%rsi, %rdx), %VMM(2)
 384         vpxorq  VEC_SIZE(%rdx), %VMM(2), %VMM(2)
 385         VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
 386         vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
 387         vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
 388         VPTEST  %VMM(4), %VMM(4), %k1
 389         /* L(8x_end_return_vec_0_1_2_3) expects bitmask to still be in
 390            `k1`  if VEC_SIZE == 64.  */
 391 # if VEC_SIZE == 64
 392         KTEST   %k1, %k1
 393 # else
 394         KMOV    %k1, %VRCX
 395         test    %VRCX, %VRCX
 396 # endif
 397         jnz     L(8x_end_return_vec_0_1_2_3)
 398         /* NB: eax must be zero to reach here.  */
 399         ret
 400
 401         /* Only entry is from L(more_8x_vec).  */
 402         .p2align 4,, 6
 403 L(8x_last_2x_vec):
 404         VPCMP   $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
 405         KMOV    %k1, %VRAX
 406         test    %VRAX, %VRAX
 407         jnz     L(8x_return_vec_2)
 408         .p2align 4,, 5
 409 L(8x_last_1x_vec):
 410         VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
 411         VPCMP   $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
 412         KMOV    %k1, %VRAX
 413         test    %VRAX, %VRAX
 414         jnz     L(8x_return_vec_3)
 415         ret
 416
 417         /* Not ideally aligned (at offset +9 bytes in fetch block) but
 418            not aligning keeps it in the same cache line as
 419            L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
 420            size.  */
 421         .p2align 4,, 4
 422 L(8x_return_vec_2):
 423         subq    $VEC_SIZE, %rdx
 424 L(8x_return_vec_3):
 425         bsf     %VRAX, %VRAX
 426 # ifdef USE_AS_WMEMCMP
 427         leaq    (%rdx, %rax, CHAR_SIZE), %rax
 428         movl    (VEC_SIZE * 3)(%rax), %ecx
 429         xorl    %edx, %edx
 430         cmpl    (VEC_SIZE * 3)(%rsi, %rax), %ecx
 431         setg    %dl
 432         leal    -1(%rdx, %rdx), %eax
 433 # else
 434         addq    %rdx, %rax
 435         movzbl  (VEC_SIZE * 3)(%rsi, %rax), %ecx
 436         movzbl  (VEC_SIZE * 3)(%rax), %eax
 437         subl    %ecx, %eax
 438 # endif
 439         ret
 440
 441         .p2align 4,, 8
 442 L(last_2x_vec):
 443         /* Check second to last VEC.  */
 444         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
 445         VPCMP   $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
 446         KMOV    %k1, %VRAX
 447         test    %VRAX, %VRAX
 448         jnz     L(return_vec_1_end)
 449
 450         /* Check last VEC.  */
 451         .p2align 4,, 8
 452 L(last_1x_vec):
 453         VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
 454         VPCMP   $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
 455         KMOV    %k1, %VRAX
 456         test    %VRAX, %VRAX
 457         jnz     L(return_vec_0_end)
 458         ret
 459
 460
 461         /* Don't fully align. Takes 2-fetch blocks either way and
 462            aligning will cause code to spill into another cacheline.
 463          */
 464         .p2align 4,, 3
 465 L(return_vec_1_end):
 466         /* Use bsf to save code size. This is necessary to have
 467            L(one_or_less) fit in aligning bytes between.  */
 468         bsf     %VRAX, %VRAX
 469         addl    %edx, %eax
 470 # ifdef USE_AS_WMEMCMP
 471         movl    -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 472         xorl    %edx, %edx
 473         cmpl    -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 474         setg    %dl
 475         leal    -1(%rdx, %rdx), %eax
 476 # else
 477         movzbl  -(VEC_SIZE * 2)(%rsi, %rax), %ecx
 478         movzbl  -(VEC_SIZE * 2)(%rdi, %rax), %eax
 479         subl    %ecx, %eax
 480 # endif
 481         ret
 482
 483         .p2align 4,, 2
 484         /* Don't align. Takes 2-fetch blocks either way and aligning
 485            will cause code to spill into another cacheline.  */
 486 L(return_vec_0_end):
 487         bsf     %VRAX, %VRAX
 488         addl    %edx, %eax
 489 # ifdef USE_AS_WMEMCMP
 490         movl    -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 491         xorl    %edx, %edx
 492         cmpl    -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
 493         setg    %dl
 494         leal    -1(%rdx, %rdx), %eax
 495 # else
 496         movzbl  -VEC_SIZE(%rsi, %rax), %ecx
 497         movzbl  -VEC_SIZE(%rdi, %rax), %eax
 498         subl    %ecx, %eax
 499 # endif
 500         ret
 501         /* evex256: 2-byte until next cache line. evex512: 46-bytes
 502            until next cache line.  */
 503 END (MEMCMP)
 504 #endif